diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..6cc1bb3f4755ad4e65bca43ea68e41e1e2313b71 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+test_samples/open_door.jpg filter=lfs diff=lfs merge=lfs -text
+test_samples/oxford.jpeg filter=lfs diff=lfs merge=lfs -text
+test_samples/changi.jpg filter=lfs diff=lfs merge=lfs -text
+test_samples/friends.jpg filter=lfs diff=lfs merge=lfs -text
+test_samples/jesus.jpg filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..256e737fbe4078a79d1bb088d7d9376784fa706a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+assets/*
+pycache/*
+__pycache__/*
+.DS_Store
diff --git a/README.md b/README.md
index 1f7fb4cb3b7240e31c988632350378845d71b433..08e1ef703c761ca7969ec832e492559b5cad6110 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,76 @@
----
-title: Vmem
-emoji: 👁
-colorFrom: yellow
-colorTo: gray
-sdk: gradio
-sdk_version: 5.33.2
-app_file: app.py
-pinned: false
-license: apache-2.0
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+<div align="center">
+<h1>VMem: Consistent Video Scene Generation with Surfel-Indexed View Memory</h1>
+
+<a href="https://v-mem.github.io/"><img src="https://img.shields.io/badge/%F0%9F%8F%A0%20Project%20Page-gray.svg"></a>
+<a href="http://arxiv.org/abs/2503.14489"><img src="https://img.shields.io/badge/%F0%9F%93%84%20arXiv-2503.14489-B31B1B.svg"></a>
+<a href="https://huggingface.co/liguang0115/vmem"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Model_Card-Huggingface-orange"></a>
+<a href="https://huggingface.co/spaces/stabilityai/stable-virtual-camera"><img src="https://img.shields.io/badge/%F0%9F%9A%80%20Gradio%20Demo-Huggingface-orange"></a>
+
+[Runjia Li](https://runjiali-rl.github.io/), [Philip Torr](https://www.robots.ox.ac.uk/~phst/), [Andrea Vedaldi](https://www.robots.ox.ac.uk/~vedaldi/), [Tomas Jakab](https://www.robots.ox.ac.uk/~tomj/)
+<br>
+<br>
+[University of Oxford](https://www.robots.ox.ac.uk/~vgg/)
+</div>
+
+<p align="center">
+  <img src="assets/demo_teaser.gif" width="100%" alt="Teaser" style="border-radius:10px;"/>
+</p>
+
+<!-- <p align="center" border-radius="10px">
+  <img src="assets/benchmark.png" width="100%" alt="teaser_page1"/>
+</p> -->
+
+# Overview
+
+`VMem` is a plug-and-play memory mechanism for image-set models that enables consistent scene generation.
+Existing methods either rely on inpainting with explicit geometry estimation, which suffers from inaccuracies, or use limited context windows in video-based approaches, leading to poor long-term coherence. To overcome these issues, we introduce Surfel-Indexed View Memory (VMem), which anchors past views to the surface elements (surfels) they observed. This enables conditioning novel view generation on the most relevant past views rather than just the most recent ones, enhancing long-term scene consistency while reducing computational cost.
+
+
+# :wrench: Installation
+
+```bash
+conda create -n vmem python=3.10
+conda activate vmem
+pip install -r requirements.txt
+```
+
+
+# :rocket: Usage
+
+You need to authenticate with Hugging Face to download our model weights. Once that is set up, our code will download them automatically on the first run. You can authenticate by running:
+
+```bash
+# This will prompt you to enter your Hugging Face credentials.
+huggingface-cli login
+```
+
+Once authenticated, go to our model card [here](https://huggingface.co/stabilityai/stable-virtual-camera) and enter your information for access.
+
+We provide a demo for you to interact with `VMem`. Simply run
+
+```bash
+python app.py
+```
+
+
+## :heart: Acknowledgement
+This work is built on top of [CUT3R](https://github.com/CUT3R/CUT3R), [DUSt3R](https://github.com/naver/dust3r) and [Stable Virtual Camera](https://github.com/stability-ai/stable-virtual-camera). We thank the authors for their great work.
+
+
+
+
+
+# :books: Citing
+
+If you find this repository useful, please consider giving it a star :star: and citing our work.
+
+```
+@article{zhou2025stable,
+    title={Stable Virtual Camera: Generative View Synthesis with Diffusion Models},
+    author={Jensen (Jinghao) Zhou and Hang Gao and Vikram Voleti and Aaryaman Vasishta and Chun-Han Yao and Mark Boss and
+    Philip Torr and Christian Rupprecht and Varun Jampani
+    },
+    journal={arXiv preprint arXiv:2503.14489},
+    year={2025}
+}
+```
\ No newline at end of file
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..9006db68c99ca47304f71c956b9ee9eecc5329d6
--- /dev/null
+++ b/app.py
@@ -0,0 +1,933 @@
+from typing import List, Literal
+from pathlib import Path
+from functools import partial
+import spaces
+import gradio as gr
+import numpy as np
+import torch
+from torchvision.datasets.utils import download_and_extract_archive
+from einops import repeat
+from omegaconf import OmegaConf
+from modeling.pipeline import VMemPipeline
+from diffusers.utils import export_to_video, export_to_gif
+from scipy.spatial.transform import Rotation, Slerp
+from navigation import Navigator
+from PIL import Image
+from utils import tensor_to_pil, encode_vae_image, encode_image, get_default_intrinsics, load_img_and_K, transform_img_and_K
+import os
+import glob
+
+
+# Inference configuration, loaded once at import time.
+CONFIG_PATH = "configs/inference/inference.yaml"
+CONFIG = OmegaConf.load(CONFIG_PATH)
+# Use the GPU when one is visible, otherwise fall back to CPU.
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# The pipeline is constructed once at import time and handed to every Navigator.
+MODEL = VMemPipeline(CONFIG, DEVICE)
+# Holds the active Navigator at index 0 (created lazily by navigate_video and
+# reset to an empty list whenever a new start image is chosen).
+# NOTE(review): this is a module-level global, so the navigation state is
+# shared by everything running in this process rather than kept per session.
+NAVIGATORS = []
+
+
+# Frame rate passed to export_to_video for the generated clips.
+NAVIGATION_FPS = 3
+# Default size of the synthetic placeholder images created below.
+WIDTH = 576
+HEIGHT = 576
+
+
+# Example images offered in the selection gallery; gallery click indices are
+# looked up in this list, so its order must match the gallery order.
+IMAGE_PATHS = ['test_samples/changi.jpg', 'test_samples/oxford.jpeg', 'test_samples/open_door.jpg', 'test_samples/jesus.jpg', 'test_samples/friends.jpg']
+
+# for asset_dir in ASSET_DIRS:
+#     if os.path.exists(asset_dir):
+#         for ext in ["*.jpg", "*.jpeg", "*.png"]:
+#             IMAGE_PATHS.extend(glob.glob(os.path.join(asset_dir, ext)))
+
+# If no images found, create placeholders
+if not IMAGE_PATHS:
+    # Fallback used only when no example images are configured: synthesize
+    # simple gradient images so the demo still has something to work with.
+
+    def _gradient_image(blue_level, height, width):
+        """Return a (height, width, 3) uint8 gradient image.
+
+        Red ramps from 0 down the rows, green ramps from 0 across the columns,
+        and blue is the constant `blue_level`.
+        """
+        img = np.empty((height, width, 3), dtype=np.uint8)
+        # The truncating uint8 cast matches int(255 * k / size) for each k,
+        # without looping over every pixel in Python.
+        img[..., 0] = (255 * np.arange(height) / height).astype(np.uint8)[:, None]
+        img[..., 1] = (255 * np.arange(width) / width).astype(np.uint8)[None, :]
+        img[..., 2] = blue_level
+        return img
+
+    def create_placeholder_images(num_samples=5, height=HEIGHT, width=WIDTH):
+        """Create `num_samples` placeholder images for the demo.
+
+        Returns a list of (height, width, 3) uint8 arrays; the blue channel
+        increases with the sample index so the images are distinguishable.
+        """
+        return [
+            _gradient_image(int(255 * (i + 1) / num_samples), height, width)
+            for i in range(num_samples)
+        ]
+
+    def create_placeholder_video_and_poses(num_samples=5, num_frames=1, height=HEIGHT, width=WIDTH):
+        """Create placeholder videos and poses for the demo.
+
+        Returns `(videos, poses)`: `videos[i]` is a float tensor of shape
+        [num_frames, 3, height, width] with values in [0, 1], and `poses[i]`
+        is a [num_frames, 4, 4] stack of identity matrices.
+        """
+        videos = []
+        poses = []
+        for i in range(num_samples):
+            img = _gradient_image(int(255 * (i + 1) / num_samples), height, width)
+            # HWC uint8 -> CHW float in [0, 1].
+            frame = torch.from_numpy(img.transpose(2, 0, 1)).float() / 255.0
+            # Every frame of a placeholder video is the same image; stack copies
+            # the data, so the frames do not alias each other.
+            videos.append(torch.stack([frame] * num_frames))
+            poses.append(torch.eye(4).unsqueeze(0).repeat(num_frames, 1, 1))
+        return videos, poses
+
+    first_frame_list = create_placeholder_images(num_samples=5)
+    video_list, poses_list = create_placeholder_video_and_poses(num_samples=5)
+
+# Function to load image from path
+def load_image_for_navigation(image_path):
+    """Turn the image at `image_path` into the starting state for navigation.
+
+    The image is loaded onto DEVICE, cropped to the model's configured
+    (height, width), and paired with an identity camera pose.
+
+    Returns a dict with:
+      - "image": the prepared frame converted with `tensor_to_pil`
+      - "video": the prepared image tensor itself (the one-frame video;
+        presumably already carries a leading frame axis — TODO confirm)
+      - "pose":  a [1, 4, 4] identity pose tensor
+    """
+    # Intrinsics returned by the helpers are not needed here.
+    loaded, _ = load_img_and_K(image_path, None, K=None, device=DEVICE)
+
+    # Target resolution comes from the inference config on disk.
+    model_cfg = OmegaConf.load(CONFIG_PATH).model
+    target_size = (model_cfg.height, model_cfg.width)
+    frame, _ = transform_img_and_K(loaded, target_size, mode="crop", K=None)
+
+    identity_pose = torch.eye(4)[None]  # [1, 4, 4]
+    return dict(image=tensor_to_pil(frame), video=frame, pose=identity_pose)
+
+
+class CustomProgressBar:
+    """Proxy around a progress bar object that silences `set_postfix`.
+
+    Any attribute this wrapper does not define itself is looked up on the
+    wrapped object.
+    """
+
+    def __init__(self, pbar):
+        # Stored as a regular instance attribute, so __getattr__ is not
+        # consulted when reading it.
+        self.pbar = pbar
+
+    def set_postfix(self, **_ignored):
+        """Accept any keyword arguments and do nothing."""
+        return None
+
+    def __getattr__(self, name):
+        # Only invoked when normal lookup fails: forward to the wrapped bar.
+        wrapped = self.pbar
+        return getattr(wrapped, name)
+
+def get_duration_navigate_video(video: torch.Tensor,
+    poses: torch.Tensor,
+    x_angle: float,
+    y_angle: float,
+    distance: float
+):
+    """Estimate, in seconds, how long one `navigate_video` call will take.
+
+    Used as the `duration` callback of the `spaces.GPU` decorator on
+    `navigate_video`, so it takes the same arguments. `poses` is unused.
+
+    The estimate is a 15 s base, plus 10 s for a sharp turn (|x_angle| > 20 or
+    |y_angle| > 30), plus 10 s for a long move (distance > 100), plus one
+    second per existing frame capped at 10.
+    """
+    sharp_turn_extra = 10 if (abs(x_angle) > 20 or abs(y_angle) > 30) else 0
+    long_move_extra = 10 if distance > 100 else 0
+    # More existing frames means more processing, but cap the contribution.
+    history_extra = min(10, len(video))
+    return 15 + sharp_turn_extra + long_move_extra + history_extra
+
+def _navigation_outputs(video):
+    """Build the (current view, video file, gallery) UI outputs for `video`.
+
+    Each frame is converted to PIL exactly once and reused for all three
+    outputs. An empty video yields (None, None, []).
+    """
+    frames = [tensor_to_pil(frame) for frame in video]
+    if not frames:
+        return None, None, []
+    gallery = [(frame, f"t={i}") for i, frame in enumerate(frames)]
+    return frames[-1], export_to_video(frames, fps=NAVIGATION_FPS), gallery
+
+
+@spaces.GPU(duration=get_duration_navigate_video)
+@torch.autocast("cuda")
+@torch.no_grad()
+def navigate_video(
+    video: torch.Tensor,
+    poses: torch.Tensor,
+    x_angle: float,
+    y_angle: float,
+    distance: float,
+):
+    """
+    Generate new video frames by navigating in the 3D scene.
+
+    Movement is delegated to the Navigator stored in NAVIGATORS[0], which is
+    created and initialized from the first frame/pose on the first call:
+    - y_angle: turn left (positive) or right (negative); the Navigator is
+      asked to turn by half the magnitude of the angle.
+    - distance: move forward (positive) or backward (negative) by
+      max(1, |distance| / 10) steps. Only used when y_angle is zero.
+    - x_angle: accepted for interface compatibility but currently ignored
+      (the Navigator has no vertical-rotation call used here).
+
+    Returns a 5-tuple matching the Gradio outputs:
+    (video tensor, pose tensor, current view image, exported video, gallery).
+    On any error a warning is shown and the unchanged inputs are returned.
+    """
+    try:
+        if len(NAVIGATORS) == 0:
+            navigator = Navigator(MODEL, step_size=0.1, num_interpolation_frames=4)
+            navigator.initialize(
+                tensor_to_pil(video[0]),
+                poses[0].cpu().numpy().reshape(4, 4),
+                # Default camera intrinsics, since none are tracked for the image.
+                np.array(get_default_intrinsics()[0]),
+            )
+            # Register only after a successful initialize, so a failed setup
+            # does not leave a half-initialized navigator for later calls.
+            NAVIGATORS.append(navigator)
+
+        navigator = NAVIGATORS[0]
+
+        new_frames = []
+        if abs(y_angle) > 0:
+            # Take the magnitude before halving: floor division of a negative
+            # angle would otherwise round away from zero and make right turns
+            # larger than the equivalent left turns for odd angles.
+            half_turn = abs(y_angle) // 2
+            if y_angle > 0:
+                new_frames = navigator.turn_left(half_turn)
+            else:
+                new_frames = navigator.turn_right(half_turn)
+        elif distance > 0:
+            new_frames = navigator.move_forward(max(1, int(distance / 10)))
+        elif distance < 0:
+            new_frames = navigator.move_backward(max(1, int(abs(distance) / 10)))
+
+        if not new_frames:
+            # Nothing was generated: report the current state unchanged.
+            return (video, poses, *_navigation_outputs(video))
+
+        # PIL (H, W, C) in [0, 255] -> float tensor (C, H, W) in [-1, 1].
+        new_frames_tensor = torch.stack([
+            torch.from_numpy((np.array(frame) / 255.0).transpose(2, 0, 1)).float() * 2.0 - 1.0
+            for frame in new_frames
+        ])
+
+        # Every new frame is tagged with the navigator's final pose for this move.
+        current_pose = torch.from_numpy(navigator.current_pose).float().reshape(1, 4, 4)
+        new_poses = current_pose.repeat(len(new_frames), 1, 1)
+
+        updated_video = torch.cat([video.cpu(), new_frames_tensor], dim=0)
+        updated_poses = torch.cat([poses.cpu(), new_poses], dim=0)
+
+        return (updated_video, updated_poses, *_navigation_outputs(updated_video))
+    except Exception as e:
+        # UI boundary: surface the problem to the user and keep the session
+        # alive by returning the original inputs.
+        print(f"Error in navigate_video: {e}")
+        gr.Warning(f"Navigation error: {e}")
+        return (video, poses, *_navigation_outputs(video))
+
+
+def undo_navigation(
+    video: torch.Tensor,
+    poses: torch.Tensor,
+):
+    """
+    Undo the last navigation step by removing the last set of frames.
+
+    Calls `undo()` on the Navigator in NAVIGATORS[0]. If it reports success,
+    `video` and `poses` are truncated to `len(navigator.frames)` entries
+    (presumably the navigator's frame list includes the initial frame —
+    TODO confirm). Otherwise a warning is shown and the state is unchanged.
+
+    Returns the same 5-tuple as `navigate_video`:
+    (video tensor, pose tensor, current view image, exported video, gallery).
+    """
+    if len(NAVIGATORS) == 0:
+        gr.Warning("No navigation session available!")
+    else:
+        navigator = NAVIGATORS[0]
+        if navigator.undo():
+            # The navigator already dropped the frames internally; trim our
+            # tensors to match.
+            kept = len(navigator.frames)
+            video = video[:kept]
+            poses = poses[:kept]
+        else:
+            gr.Warning("You have no moves left to undo!")
+
+    # Convert each frame once and reuse it for the view, the video and the gallery.
+    frames = [tensor_to_pil(frame) for frame in video]
+    if not frames:
+        # Same empty-state placeholders as navigate_video's error path.
+        return video, poses, None, None, []
+
+    return (
+        video,
+        poses,
+        frames[-1],
+        export_to_video(frames, fps=NAVIGATION_FPS),
+        [(frame, f"t={i}") for i, frame in enumerate(frames)],
+    )
+
+
+
+
+
+def render_demo3(
+    s: Literal["Selection", "Generation"],
+    idx: int,
+    demo3_stage: gr.State,
+    demo3_selected_index: gr.State,
+    demo3_current_video: gr.State,
+    demo3_current_poses: gr.State
+):
+    gr.Markdown(
+        """
+        ## Single Image → Consistent Scene Navigation
+        > #### _Select an image and navigate through the scene by controlling camera movements._
+    """,
+    elem_classes=["task-title"]
+    )
+    match s:
+        case "Selection":
+            with gr.Group():
+                # Add upload functionality
+                with gr.Group(elem_classes=["gradio-box"]):
+                    gr.Markdown("### Upload Your Own Image")
+                    gr.Markdown("_Upload an image to navigate through its 3D scene_")
+                    with gr.Row():
+                        with gr.Column(scale=3):
+                            upload_image = gr.Image(
+                                label="Upload an image",
+                                type="filepath",
+                                height=300,
+                                elem_id="upload-image"
+                            )
+                        with gr.Column(scale=1):
+                            gr.Markdown("#### Instructions:")
+                            gr.Markdown("1. Upload a clear, high-quality image")
+                            gr.Markdown("2. Images with distinct visual features work best")
+                            gr.Markdown("3. Landscape or architectural scenes are ideal")
+                            upload_btn = gr.Button("Start Navigation", variant="primary", size="lg")
+                    
+                    def process_uploaded_image(image_path):
+                        if image_path is None:
+                            gr.Warning("Please upload an image first")
+                            return "Selection", None, None, None
+                        try:
+                            # Load image and prepare for navigation
+                            result = load_image_for_navigation(image_path)
+                            
+                            # Clear any existing navigators
+                            global NAVIGATORS
+                            NAVIGATORS = []
+                            
+                            return (
+                                "Generation",
+                                None,  # No predefined index for uploaded images
+                                result["video"],
+                                result["pose"],
+                            )
+                        except Exception as e:
+                            print(f"Error in process_uploaded_image: {e}")
+                            gr.Warning(f"Error processing uploaded image: {e}")
+                            return "Selection", None, None, None
+                    
+                    upload_btn.click(
+                        fn=process_uploaded_image,
+                        inputs=[upload_image],
+                        outputs=[demo3_stage, demo3_selected_index, demo3_current_video, demo3_current_poses]
+                    )
+                
+                gr.Markdown("### Or Choose From Our Examples")
+                # Define image captions
+                image_captions = {
+                    'test_samples/changi.jpg': 'Changi Airport',
+                    'test_samples/oxford.jpeg': 'Oxford University',
+                    'test_samples/open_door.jpg': 'Bedroom Interior',
+                    'test_samples/jesus.jpg': 'Jesus College',
+                    'test_samples/friends.jpg': 'Friends Café'
+                }
+                
+                # Load all images for the gallery with captions
+                gallery_images = []
+                for img_path in IMAGE_PATHS:
+                    try:
+                        # Get caption or default to basename
+                        caption = image_captions.get(img_path, os.path.basename(img_path))
+                        gallery_images.append((img_path, caption))
+                    except Exception as e:
+                        print(f"Error loading image {img_path}: {e}")
+                
+                # Show image gallery for selection
+                demo3_image_gallery = gr.Gallery(
+                    value=gallery_images,
+                    label="Select an Image to Start Navigation",
+                    columns=len(gallery_images),
+                    height=400,
+                    allow_preview=True,
+                    preview=False,
+                    elem_id="navigation-gallery"
+                )
+                
+                gr.Markdown("_Click on an image to begin navigation_")
+                
+                def start_navigation(evt: gr.SelectData):
+                    try:
+                        # Get the selected image path
+                        selected_path = IMAGE_PATHS[evt.index]
+                        
+                        # Load image and prepare for navigation
+                        result = load_image_for_navigation(selected_path)
+                        
+                        # Clear any existing navigators
+                        global NAVIGATORS
+                        NAVIGATORS = []
+                        
+                        return (
+                            "Generation",
+                            evt.index,
+                            result["video"],
+                            result["pose"],
+                        )
+                    except Exception as e:
+                        print(f"Error in start_navigation: {e}")
+                        gr.Warning(f"Error starting navigation: {e}")
+                        return "Selection", None, None, None
+                
+                demo3_image_gallery.select(
+                    fn=start_navigation,
+                    inputs=None, 
+                    outputs=[demo3_stage, demo3_selected_index, demo3_current_video, demo3_current_poses]
+                )
+
+        case "Generation":
+            with gr.Row():
+                with gr.Column(scale=3):
+                    with gr.Row():
+                        demo3_current_view = gr.Image(
+                            label="Current View",
+                            width=256,
+                            height=256,
+                        )
+                        demo3_video = gr.Video(
+                            label="Generated Video",
+                            width=256,
+                            height=256,
+                            autoplay=True,
+                            loop=True,
+                            show_share_button=True,
+                            show_download_button=True,
+                        )
+
+                    demo3_generated_gallery = gr.Gallery(
+                        value=[],
+                        label="Generated Frames",
+                        columns=[6],
+                    )
+                    
+                    # Initialize the current view with the selected image if available
+                    if idx is not None:
+                        try:
+                            selected_path = IMAGE_PATHS[idx]
+                            result = load_image_for_navigation(selected_path)
+                            demo3_current_view.value = result["image"]
+                        except Exception as e:
+                            print(f"Error initializing current view: {e}")
+
+                with gr.Column():
+                    gr.Markdown("### Navigation Controls ↓")
+                    with gr.Accordion("Instructions", open=False):
+                        gr.Markdown("""
+                            - **The model will predict the next few frames based on your camera movements. Repeat the process to continue navigating through the scene.**
+                            - **Use the navigation controls to move forward/backward and turn left/right.**
+                            - **At the end of your navigation, you can save your camera path for later use.**
+                           
+                        """)
+                    # with gr.Tab("Basic", elem_id="basic-controls-tab"):
+                    with gr.Group():
+                        gr.Markdown("_**Select a direction to move:**_")
+                        # First row: Turn left/right
+                        with gr.Row(elem_id="basic-controls"):
+                            gr.Button(
+                                "↰20°\nVeer",
+                                size="sm",
+                                min_width=0,
+                                variant="primary",
+                            ).click(
+                                fn=partial(
+                                    navigate_video,
+                                    x_angle=0,
+                                    y_angle=20,
+                                    distance=0,
+                                ),
+                                inputs=[
+                                    demo3_current_video,
+                                    demo3_current_poses,
+                                ],
+                                outputs=[
+                                    demo3_current_video,
+                                    demo3_current_poses,
+                                    demo3_current_view,
+                                    demo3_video,
+                                    demo3_generated_gallery,
+                                ],
+                            )
+
+                            gr.Button(
+                                "↖10°\nTurn",
+                                size="sm",
+                                min_width=0,
+                                variant="primary",
+                            ).click(
+                                fn=partial(
+                                    navigate_video,
+                                    x_angle=0,
+                                    y_angle=10,
+                                    distance=0,
+                                ),
+                                inputs=[
+                                    demo3_current_video,
+                                    demo3_current_poses,
+                                ],
+                                outputs=[
+                                    demo3_current_video,
+                                    demo3_current_poses,
+                                    demo3_current_view,
+                                    demo3_video,
+                                    demo3_generated_gallery,
+                                ],
+                            )
+
+                            # gr.Button(
+                            #     "↑0°\nAhead",
+                            #     size="sm",
+                            #     min_width=0,
+                            #     variant="primary",
+                            # ).click(
+                            #     fn=partial(
+                            #         navigate_video,
+                            #         x_angle=0,
+                            #         y_angle=0,
+                            #         distance=10,
+                            #     ),
+                            #     inputs=[
+                            #         demo3_current_video,
+                            #         demo3_current_poses,
+                            #     ],
+                            #     outputs=[
+                            #         demo3_current_video,
+                            #         demo3_current_poses,
+                            #         demo3_current_view,
+                            #         demo3_video,
+                            #         demo3_generated_gallery,
+                            #     ],
+                            # )
+                            gr.Button(
+                                "↗10°\nTurn",
+                                size="sm",
+                                min_width=0,
+                                variant="primary",
+                            ).click(
+                                fn=partial(
+                                    navigate_video,
+                                    x_angle=0,
+                                    y_angle=-10,
+                                    distance=0,
+                                ),
+                                inputs=[
+                                    demo3_current_video,
+                                    demo3_current_poses,
+                                ],
+                                outputs=[
+                                    demo3_current_video,
+                                    demo3_current_poses,
+                                    demo3_current_view,
+                                    demo3_video,
+                                    demo3_generated_gallery,
+                                ],
+                            )
+                            gr.Button(
+                                "↱\n20° Veer",
+                                size="sm",
+                                min_width=0,
+                                variant="primary",
+                            ).click(
+                                fn=partial(
+                                    navigate_video,
+                                    x_angle=0,
+                                    y_angle=-20,
+                                    distance=0,
+                                ),
+                                inputs=[
+                                    demo3_current_video,
+                                    demo3_current_poses,
+                                ],
+                                outputs=[
+                                    demo3_current_video,
+                                    demo3_current_poses,
+                                    demo3_current_view,
+                                    demo3_video,
+                                    demo3_generated_gallery,
+                                ],
+                            )
+                        
+                        # Second row: Forward/Backward movement
+                        with gr.Row(elem_id="forward-backward-controls"):
+                            gr.Button(
+                                "↓\nBackward",
+                                size="sm",
+                                min_width=0,
+                                variant="secondary",
+                            ).click(
+                                fn=partial(
+                                    navigate_video,
+                                    x_angle=0,
+                                    y_angle=0,
+                                    distance=-10,
+                                ),
+                                inputs=[
+                                    demo3_current_video,
+                                    demo3_current_poses,
+                                ],
+                                outputs=[
+                                    demo3_current_video,
+                                    demo3_current_poses,
+                                    demo3_current_view,
+                                    demo3_video,
+                                    demo3_generated_gallery,
+                                ],
+                            )
+                            
+                            gr.Button(
+                                "↑\nForward",
+                                size="sm",
+                                min_width=0,
+                                variant="secondary",
+                            ).click(
+                                fn=partial(
+                                    navigate_video,
+                                    x_angle=0,
+                                    y_angle=0,
+                                    distance=10,
+                                ),
+                                inputs=[
+                                    demo3_current_video,
+                                    demo3_current_poses,
+                                ],
+                                outputs=[
+                                    demo3_current_video,
+                                    demo3_current_poses,
+                                    demo3_current_view,
+                                    demo3_video,
+                                    demo3_generated_gallery,
+                                ],
+                            )
+                    # with gr.Tab("Advanced", elem_id="advanced-controls-tab"):
+                    #     with gr.Group():
+                    #         gr.Markdown("_**Select angles and distance:**_")
+
+                    #         demo3_y_angle = gr.Slider(
+                    #             minimum=-90,
+                    #             maximum=90,
+                    #             value=0,
+                    #             step=10,
+                    #             label="Horizontal Angle",
+                    #             interactive=True,
+                    #         )
+                    #         demo3_x_angle = gr.Slider(
+                    #             minimum=-40,
+                    #             maximum=40,
+                    #             value=0,
+                    #             step=10,
+                    #             label="Vertical Angle",
+                    #             interactive=True,
+                    #         )
+                    #         demo3_distance = gr.Slider(
+                    #             minimum=-200,
+                    #             maximum=200,
+                    #             value=100,
+                    #             step=10,
+                    #             label="Distance (negative = backward)",
+                    #             interactive=True,
+                    #         )
+
+                    #         gr.Button(
+                    #             "Generate Next Move", variant="primary"
+                    #         ).click(
+                    #             fn=navigate_video,
+                    #             inputs=[
+                    #                 demo3_current_video,
+                    #                 demo3_current_poses,
+                    #                 demo3_x_angle,
+                    #                 demo3_y_angle,
+                    #                 demo3_distance,
+                    #             ],
+                    #             outputs=[
+                    #                 demo3_current_video,
+                    #                 demo3_current_poses,
+                    #                 demo3_current_view,
+                    #                 demo3_video,
+                    #                 demo3_generated_gallery,
+                    #             ],
+                    #         )
+                    gr.Markdown("---")
+                    with gr.Group():
+                        gr.Markdown("_**Navigation controls:**_")
+                        with gr.Row():
+                            gr.Button("Undo Last Move", variant="huggingface").click(
+                                fn=undo_navigation,
+                                inputs=[demo3_current_video, demo3_current_poses],
+                                outputs=[
+                                    demo3_current_video,
+                                    demo3_current_poses,
+                                    demo3_current_view,
+                                    demo3_video,
+                                    demo3_generated_gallery,
+                                ],
+                            )
+                            
+                            # Add a function to save camera poses
+                            def save_camera_poses(video, poses):
+                                if len(NAVIGATORS) > 0:
+                                    navigator = NAVIGATORS[0]
+                                    # Create a directory for saved poses
+                                    os.makedirs("./visualization", exist_ok=True)
+                                    save_path = f"./visualization/transforms_{len(navigator.frames)}_frames.json"
+                                    navigator.save_camera_poses(save_path)
+                                    return gr.Info(f"Camera poses saved to {save_path}")
+                                return gr.Warning("No navigation instance found")
+                            
+                            gr.Button("Save Camera", variant="huggingface").click(
+                                fn=save_camera_poses,
+                                inputs=[demo3_current_video, demo3_current_poses],
+                                outputs=[]
+                            )
+                            
+                            # Add a button to return to image selection
+                            def reset_navigation():
+                                """Rebind NAVIGATORS to an empty list; return the "Selection" stage and three Nones."""
+                                global NAVIGATORS
+                                NAVIGATORS = []
+                                return "Selection", None, None, None
+                            
+                            gr.Button("Choose New Image", variant="secondary").click(
+                                fn=reset_navigation,
+                                inputs=[],
+                                outputs=[demo3_stage, demo3_selected_index, demo3_current_video, demo3_current_poses]
+                            )
+
+
+# Create the Gradio Blocks
+with gr.Blocks(theme=gr.themes.Base(primary_hue="teal")) as demo:
+    gr.HTML(
+        """
+    <style>
+    [data-tab-id="task-1"], [data-tab-id="task-2"], [data-tab-id="task-3"] {
+        font-size: 16px !important;
+        font-weight: bold;
+    }
+    #page-title h1 {
+        color: #0D9488 !important;
+    }
+    .task-title h2 {
+        color: #F59E0C !important;
+    }
+    .header-button-row {
+        gap: 4px !important;
+    }
+    .header-button-row div {
+        width: 131.0px !important;
+    }
+    .header-button-column {
+        width: 131.0px !important;
+        gap: 5px !important;
+    }
+    .header-button a {
+        border: 1px solid #e4e4e7;
+    }
+    .header-button .button-icon {
+        margin-right: 8px;
+    }
+    .demo-button-column .gap {
+        gap: 5px !important;
+    }
+    #basic-controls {
+        column-gap: 0px;
+    }
+    #basic-controls-tab {
+        padding: 0px;
+    }
+    #advanced-controls-tab {
+        padding: 0px;
+    }
+    #forward-backward-controls {
+        column-gap: 0px;
+        justify-content: center;
+        margin-top: 8px;
+    }
+    #selected-demo-button {
+        color: #F59E0C;
+        text-decoration: underline;
+    }
+    .demo-button {
+        text-align: left !important;
+        display: block !important;
+    }
+    #navigation-gallery {
+        margin-bottom: 15px;
+    }
+    #navigation-gallery .gallery-item {
+        cursor: pointer;
+        border-radius: 6px;
+        transition: transform 0.2s, box-shadow 0.2s;
+    }
+    #navigation-gallery .gallery-item:hover {
+        transform: scale(1.02);
+        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
+    }
+    #navigation-gallery .gallery-item.selected {
+        border: 3px solid #0D9488;
+    }
+    /* Upload image styling */
+    #upload-image {
+        border-radius: 8px;
+        border: 2px dashed #0D9488;
+        padding: 10px;
+        transition: all 0.3s ease;
+    }
+    #upload-image:hover {
+        border-color: #F59E0C;
+        box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
+    }
+    /* Box styling */
+    .gradio-box {
+        border-radius: 10px;
+        margin-bottom: 20px;
+        padding: 15px;
+        background-color: #f8f9fa;
+        border: 1px solid #e9ecef;
+    }
+    </style>
+    """
+    )
+
+    demo_idx = gr.State(value=3)
+
+    with gr.Sidebar():
+        gr.Markdown("# VMem: Consistent Scene Generation with Surfel Memory of Views", elem_id="page-title")
+        gr.Markdown(
+            "### Official Interactive Demo for [_VMem_](https://arxiv.org/abs/2502.06764)"
+        )
+        gr.Markdown("---")
+        gr.Markdown("#### Links ↓")
+        with gr.Row(elem_classes=["header-button-row"]):
+            with gr.Column(elem_classes=["header-button-column"], min_width=0):
+                gr.Button(
+                    value="Website",
+                    link="https://v-mem.github.io/",
+                    icon="https://simpleicons.org/icons/googlechrome.svg",
+                    elem_classes=["header-button"],
+                    size="md",
+                    min_width=0,
+                )
+                gr.Button(
+                    value="Paper",
+                    link="https://arxiv.org/abs/2502.06764",
+                    icon="https://simpleicons.org/icons/arxiv.svg",
+                    elem_classes=["header-button"],
+                    size="md",
+                    min_width=0,
+                )
+            with gr.Column(elem_classes=["header-button-column"], min_width=0):
+                gr.Button(
+                    value="Code",
+                    link="https://github.com/kwsong0113/diffusion-forcing-transformer",
+                    icon="https://simpleicons.org/icons/github.svg",
+                    elem_classes=["header-button"],
+                    size="md",
+                    min_width=0,
+                )
+                gr.Button(
+                    value="Weights",
+                    link="https://huggingface.co/liguang0115/vmem",
+                    icon="https://simpleicons.org/icons/huggingface.svg",
+                    elem_classes=["header-button"],
+                    size="md",
+                    min_width=0,
+                )
+        gr.Markdown("---")
+        gr.Markdown("#### Choose a Demo ↓")
+        with gr.Column(elem_classes=["demo-button-column"]):
+            @gr.render(inputs=[demo_idx])
+            def render_demo_tabs(idx):
+                demo_tab_button3 = gr.Button(
+                    "Navigate Image",
+                    size="md", elem_classes=["demo-button"],  **{"elem_id": "selected-demo-button"} if idx == 3 else {}
+                ).click(
+                    fn=lambda: 3,
+                    outputs=demo_idx
+                )
+        gr.Markdown("---")
+        gr.Markdown("#### Troubleshooting ↓")
+        with gr.Group():
+            with gr.Accordion("Error or Unexpected Results?", open=False):
+                gr.Markdown("Please try again after refreshing the page and ensure you do not click the same button multiple times.")
+            with gr.Accordion("Too Slow or No GPU Allocation?", open=False):
+                gr.Markdown(
+                    "Consider running the demo locally (click the dots in the top-right corner). Alternatively, you can subscribe to Hugging Face Pro for an increased GPU quota."
+                )
+
+
+    demo3_stage = gr.State(value="Selection")
+    demo3_selected_index = gr.State(value=None)
+    demo3_current_video = gr.State(value=None)
+    demo3_current_poses = gr.State(value=None)
+
+    @gr.render(inputs=[demo_idx, demo3_stage, demo3_selected_index])
+    def render_demo(
+        _demo_idx, _demo3_stage, _demo3_selected_index
+    ):
+        match _demo_idx:
+            case 3:
+                render_demo3(_demo3_stage, _demo3_selected_index, demo3_stage, demo3_selected_index, demo3_current_video, demo3_current_poses)
+                
+
+if __name__ == "__main__":
+    demo.launch(debug=True,
+                share=True,
+                max_threads=1,  # Limit concurrent processing
+                show_error=True,  # Show detailed error messages
+                )
diff --git a/configs/inference/inference.yaml b/configs/inference/inference.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..4e603e2f341832779730789ff0310f14501ab3ac
--- /dev/null
+++ b/configs/inference/inference.yaml
@@ -0,0 +1,69 @@
+
+model:
+    height: 576
+    width: 576
+    original_height: 288
+    original_width: 512
+    cache_dir: "/homes/55/runjia/storage/svd_weights"
+    # pretrained_model_path: "stabilityai/stable-diffusion-2-1"
+    # pretrained_video_model_path: "stabilityai/stable-video-diffusion-img2vid"
+    
+    context_num_frames: 4
+    target_num_frames: 4
+    num_frames: 8
+    vae_spatial_scale: 8
+    latent_channels: 4
+    # num_ray_blocks: 2
+    vae_scale_factor: 8
+    inference_mode: false
+
+    temporal_only: false
+    use_non_maximum_suppression: true
+    translation_distance_weight: 0.1
+
+    camera_scale: 2.0
+    inference_num_steps: 50
+    cfg_min: 1.2
+    cfg: 3.0
+    guider_types: 1
+    
+    samples_dir: "./visualization"
+    save_flag: false
+    use_wandb: false
+
+    
+
+    # model_path: "/homes/55/runjia/storage/simview_weights/2025-04-30_12-08-55/checkpoint_230000.pth"
+    model_path: "liguang0115/vmem"
+
+
+surfel:
+    use_surfel: true
+    shrink_factor: 0.05
+    radius_scale: 0.5
+    conf_thresh: 1
+    merge_position_threshold: 0.2
+    merge_normal_threshold: 0.6
+    lr: 0.01
+    niter: 1000
+    model_path: "./extern/CUT3R/src/cut3r_512_dpt_4_64.pth"
+    width: 512
+    height: 288
+
+inference:
+    visualize: true
+    visualize_pointcloud: false
+    visualize_surfel: false
+    save_surfels: false
+    image_dir: "/homes/55/runjia/storage/realestate10k/video_data/test"
+    meta_info_dir: "/homes/55/runjia/storage/realestate10k/RealEstate10K/test"
+
+
+
+    
+
+
+
+
+visualization_dir: "./visualization"
+seed: 42
\ No newline at end of file
diff --git a/extern/CUT3R/.gitignore b/extern/CUT3R/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..0427a6e035828ae7266caf30e87b845857a17647
--- /dev/null
+++ b/extern/CUT3R/.gitignore
@@ -0,0 +1,55 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+
+# C extensions
+*.so
+
+# Distribution / packaging
+bin/
+build/
+develop-eggs/
+dist/
+eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+.tox/
+.coverage
+.cache
+nosetests.xml
+coverage.xml
+
+# Translations
+*.mo
+
+# Mr Developer
+.mr.developer.cfg
+.project
+.pydevproject
+
+# Rope
+.ropeproject
+
+# Django stuff:
+*.log
+*.pot
+
+# Sphinx documentation
+docs/_build/
+
+# Ignore data and ckpts
+*.pth
+data
+src/checkpoints
\ No newline at end of file
diff --git a/extern/CUT3R/LICENSE b/extern/CUT3R/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..9dbb7ace0da6c2662916e038eee55f4f218f2f95
--- /dev/null
+++ b/extern/CUT3R/LICENSE
@@ -0,0 +1,6 @@
+Copyright [2025–present] 
+
+CUT3R is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 License.  
+
+To view a copy of the CC BY-NC-SA 4.0, visit:  
+https://creativecommons.org/licenses/by-nc-sa/4.0/  
\ No newline at end of file
diff --git a/extern/CUT3R/README.md b/extern/CUT3R/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2d06fbcfdde95d7e25173f41efc1efedb82e7307
--- /dev/null
+++ b/extern/CUT3R/README.md
@@ -0,0 +1,208 @@
+# Continuous 3D Perception Model with Persistent State
+<div align="center">
+  <img src="./assets/factory-ezgif.com-video-speed.gif"  alt="CUT3R" />
+</div>
+
+<hr>
+
+<br>
+Official implementation of <strong>Continuous 3D Perception Model with Persistent State</strong>, CVPR 2025 (Oral)
+
+[*Qianqian Wang**](https://qianqianwang68.github.io/),
+[*Yifei Zhang**](https://forrest-110.github.io/),
+[*Aleksander Holynski*](https://holynski.org/),
+[*Alexei A Efros*](https://people.eecs.berkeley.edu/~efros/),
+[*Angjoo Kanazawa*](https://people.eecs.berkeley.edu/~kanazawa/)
+
+
+(*: equal contribution)
+
+<div style="line-height: 1;">
+  <a href="https://cut3r.github.io/" target="_blank" style="margin: 2px;">
+    <img alt="Website" src="https://img.shields.io/badge/Website-CUT3R-536af5?color=536af5&logoColor=white" style="display: inline-block; vertical-align: middle;"/>
+  </a>
+  <a href="https://arxiv.org/pdf/2501.12387" target="_blank" style="margin: 2px;">
+    <img alt="Arxiv" src="https://img.shields.io/badge/Arxiv-CUT3R-red?logo=%23B31B1B" style="display: inline-block; vertical-align: middle;"/>
+  </a>
+</div>
+
+
+![Example of capabilities](assets/ezgif.com-video-to-gif-converter.gif)
+
+## Table of Contents
+- [TODO](#todo)
+- [Getting Started](#getting-started)
+  - [Installation](#installation)
+  - [Checkpoints](#download-checkpoints)
+  - [Inference](#inference)
+- [Datasets](#datasets)
+- [Evaluation](#evaluation)
+  - [Datasets](#datasets-1)
+  - [Evaluation Scripts](#evaluation-scripts)
+- [Training and Fine-tuning](#training-and-fine-tuning)
+- [Acknowledgements](#acknowledgements)
+- [Citation](#citation)
+
+## TODO
+- [x] Release multi-view stereo results of DL3DV dataset.
+- [ ] Online demo integrated with WebCam
+
+## Getting Started
+
+### Installation
+
+1. Clone CUT3R.
+```bash
+git clone https://github.com/CUT3R/CUT3R.git
+cd CUT3R
+```
+
+2. Create the environment.
+```bash
+conda create -n cut3r python=3.11 cmake=3.14.0
+conda activate cut3r
+conda install pytorch torchvision pytorch-cuda=12.1 -c pytorch -c nvidia  # use the correct version of cuda for your system
+pip install -r requirements.txt
+# issues with pytorch dataloader, see https://github.com/pytorch/pytorch/issues/99625
+conda install 'llvm-openmp<16'
+# for training logging
+pip install git+https://github.com/nerfstudio-project/gsplat.git
+# for evaluation
+pip install evo
+pip install open3d
+```
+
+3. Compile the cuda kernels for RoPE (as in CroCo v2).
+```bash
+cd src/croco/models/curope/
+python setup.py build_ext --inplace
+cd ../../../../
+```
+
+### Download Checkpoints
+
+We currently provide checkpoints on Google Drive:
+
+| Modelname   | Training resolutions | #Views| Head |
+|-------------|----------------------|-------|------|
+| [`cut3r_224_linear_4.pth`](https://drive.google.com/file/d/11dAgFkWHpaOHsR6iuitlB_v4NFFBrWjy/view?usp=drive_link) | 224x224 | 16 | Linear |
+| [`cut3r_512_dpt_4_64.pth`](https://drive.google.com/file/d/1Asz-ZB3FfpzZYwunhQvNPZEUA8XUNAYD/view?usp=drive_link) | 512x384, 512x336, 512x288, 512x256, 512x160, 384x512, 336x512, 288x512, 256x512, 160x512 | 4-64 | DPT |
+
+> `cut3r_224_linear_4.pth` is our intermediate checkpoint and `cut3r_512_dpt_4_64.pth` is our final checkpoint.
+
+To download the weights, run the following commands:
+```bash
+cd src
+# for 224 linear ckpt
+gdown --fuzzy https://drive.google.com/file/d/11dAgFkWHpaOHsR6iuitlB_v4NFFBrWjy/view?usp=drive_link 
+# for 512 dpt ckpt
+gdown --fuzzy https://drive.google.com/file/d/1Asz-ZB3FfpzZYwunhQvNPZEUA8XUNAYD/view?usp=drive_link
+cd ..
+```
+
+### Inference
+
+To run the inference code, you can use the following command:
+```bash
+# the following script will run inference offline and visualize the output with viser on port 8080
+python demo.py --model_path MODEL_PATH --seq_path SEQ_PATH --size SIZE --vis_threshold VIS_THRESHOLD --output_dir OUT_DIR  # input can be a folder or a video
+# Example:
+#     python demo.py --model_path src/cut3r_512_dpt_4_64.pth --size 512 \
+#         --seq_path examples/001 --vis_threshold 1.5 --output_dir tmp
+#
+#     python demo.py --model_path src/cut3r_224_linear_4.pth --size 224 \
+#         --seq_path examples/001 --vis_threshold 1.5 --output_dir tmp
+
+# the following script will run inference with global alignment and visualize the output with viser on port 8080
+python demo_ga.py --model_path MODEL_PATH --seq_path SEQ_PATH --size SIZE --vis_threshold VIS_THRESHOLD --output_dir OUT_DIR
+```
+Output results will be saved to `output_dir`.
+
+> Currently, we accelerate the feedforward process by processing inputs in parallel within the encoder, which results in linear memory consumption as the number of frames increases.
+
+## Datasets
+Our training data includes 32 datasets listed below. We provide processing scripts for all of them. Please download the datasets from their official sources, and refer to [preprocess.md](docs/preprocess.md) for processing scripts and more information about the datasets.
+
+  - [ARKitScenes](https://github.com/apple/ARKitScenes) 
+  - [BlendedMVS](https://github.com/YoYo000/BlendedMVS)
+  - [CO3Dv2](https://github.com/facebookresearch/co3d)
+  - [MegaDepth](https://www.cs.cornell.edu/projects/megadepth/)
+  - [ScanNet++](https://kaldir.vc.in.tum.de/scannetpp/) 
+  - [ScanNet](http://www.scan-net.org/ScanNet/)
+  - [Waymo Open Dataset](https://github.com/waymo-research/waymo-open-dataset)
+  - [WildRGB-D](https://github.com/wildrgbd/wildrgbd/)
+  - [Map-free](https://research.nianticlabs.com/mapfree-reloc-benchmark/dataset)
+  - [TartanAir](https://theairlab.org/tartanair-dataset/)
+  - [UnrealStereo4K](https://github.com/fabiotosi92/SMD-Nets) 
+  - [Virtual KITTI 2](https://europe.naverlabs.com/research/computer-vision/proxy-virtual-worlds-vkitti-2/)
+  - [3D Ken Burns](https://github.com/sniklaus/3d-ken-burns.git)
+  - [BEDLAM](https://bedlam.is.tue.mpg.de/)
+  - [COP3D](https://github.com/facebookresearch/cop3d)
+  - [DL3DV](https://github.com/DL3DV-10K/Dataset)
+  - [Dynamic Replica](https://github.com/facebookresearch/dynamic_stereo)
+  - [EDEN](https://lhoangan.github.io/eden/)
+  - [Hypersim](https://github.com/apple/ml-hypersim)
+  - [IRS](https://github.com/HKBU-HPML/IRS)
+  - [Matterport3D](https://niessner.github.io/Matterport/)
+  - [MVImgNet](https://github.com/GAP-LAB-CUHK-SZ/MVImgNet)
+  - [MVS-Synth](https://phuang17.github.io/DeepMVS/mvs-synth.html)
+  - [OmniObject3D](https://omniobject3d.github.io/)
+  - [PointOdyssey](https://pointodyssey.com/)
+  - [RealEstate10K](https://google.github.io/realestate10k/)
+  - [SmartPortraits](https://mobileroboticsskoltech.github.io/SmartPortraits/)
+  - [Spring](https://spring-benchmark.org/)
+  - [Synscapes](https://synscapes.on.liu.se/)
+  - [UASOL](https://osf.io/64532/)
+  - [UrbanSyn](https://www.urbansyn.org/)
+  - [HOI4D](https://hoi4d.github.io/)
+
+
+## Evaluation
+
+### Datasets
+Please follow [MonST3R](https://github.com/Junyi42/monst3r/blob/main/data/evaluation_script.md) and [Spann3R](https://github.com/HengyiWang/spann3r/blob/main/docs/data_preprocess.md) to prepare **Sintel**, **Bonn**, **KITTI**, **NYU-v2**, **TUM-dynamics**, **ScanNet**, **7scenes** and **Neural-RGBD** datasets.
+
+The datasets should be organized as follows:
+```
+data/
+├── 7scenes
+├── bonn
+├── kitti
+├── neural_rgbd
+├── nyu-v2
+├── scannetv2
+├── sintel
+└── tum
+```
+
+### Evaluation Scripts
+Please refer to the [eval.md](docs/eval.md) for more details.
+
+## Training and Fine-tuning
+Please refer to the [train.md](docs/train.md) for more details.
+
+## Acknowledgements
+Our code is based on the following awesome repositories:
+
+- [DUSt3R](https://github.com/naver/dust3r)
+- [MonST3R](https://github.com/Junyi42/monst3r.git)
+- [Spann3R](https://github.com/HengyiWang/spann3r.git)
+- [Viser](https://github.com/nerfstudio-project/viser)
+
+We thank the authors for releasing their code!
+
+
+
+## Citation
+
+If you find our work useful, please cite:
+
+```bibtex
+@article{wang2025continuous,
+  title={Continuous 3D Perception Model with Persistent State},
+  author={Wang, Qianqian and Zhang, Yifei and Holynski, Aleksander and Efros, Alexei A and Kanazawa, Angjoo},
+  journal={arXiv preprint arXiv:2501.12387},
+  year={2025}
+}
+```
+
diff --git a/extern/CUT3R/add_ckpt_path.py b/extern/CUT3R/add_ckpt_path.py
new file mode 100644
index 0000000000000000000000000000000000000000..e03e0e9e4f67499185f6d7a36cacaa04256b74af
--- /dev/null
+++ b/extern/CUT3R/add_ckpt_path.py
@@ -0,0 +1,9 @@
+import sys
+import os
+import os.path as path
+
+
+def add_path_to_dust3r(ckpt):
+    HERE_PATH = os.path.dirname(os.path.abspath(ckpt))
+    # workaround for sibling import
+    sys.path.insert(0, HERE_PATH)
diff --git a/extern/CUT3R/cloud_opt/base_opt.py b/extern/CUT3R/cloud_opt/base_opt.py
new file mode 100644
index 0000000000000000000000000000000000000000..8fce7c57ed32ff0863dd58b97331c03b6e667c80
--- /dev/null
+++ b/extern/CUT3R/cloud_opt/base_opt.py
@@ -0,0 +1,301 @@
+from copy import deepcopy
+import cv2
+import numpy as np
+import torch
+import torch.nn as nn
+import roma
+from copy import deepcopy
+import tqdm
+import os
+import matplotlib.pyplot as plt
+
+from cloud_opt.utils import *
+from cloud_opt.utils import _check_edges, _compute_img_conf
+import cloud_opt.init_all as init_fun
+
+
+class BaseOptimizer(nn.Module):
+    """Optimize a global scene, given a graph-organized observations.
+    Graph node: images
+    Graph edges: observations = (pred1, pred2), pred2 is in pred1's coordinate
+    """
+
+    def __init__(self, *args, **kwargs):
+        pass
+
+    def _init_from_views(
+        self,
+        view1s,
+        view2s,
+        pred1s,
+        pred2s,  # whatever predictions, they should be organized into pairwise for graph optimization
+        dist="l1",
+        conf="log",
+        min_conf_thr=3,
+        thr_for_init_conf=False,
+        base_scale=0.5,
+        allow_pw_adaptors=False,
+        pw_break=20,
+        rand_pose=torch.randn,
+        empty_cache=False,
+        verbose=True,
+    ):
+        super().__init__()
+        self.edges = [
+            (int(view1["idx"]), int(view2["idx"]))
+            for view1, view2 in zip(view1s, view2s)
+        ]
+        self.dist = ALL_DISTS[dist]
+        self.n_imgs = _check_edges(self.edges)
+
+        self.edge2pts_i = NoGradParamDict(
+            {ij: pred1s[n]["pts3d_is_self_view"] for n, ij in enumerate(self.str_edges)}
+        )  # ij: the name of the edge
+        self.edge2pts_j = NoGradParamDict(
+            {
+                ij: pred2s[n]["pts3d_in_other_view"]
+                for n, ij in enumerate(self.str_edges)
+            }
+        )
+        self.edge2conf_i = NoGradParamDict(
+            {ij: pred1s[n]["conf_self"] for n, ij in enumerate(self.str_edges)}
+        )
+        self.edge2conf_j = NoGradParamDict(
+            {ij: pred2s[n]["conf"] for n, ij in enumerate(self.str_edges)}
+        )
+
+        self.imshapes = get_imshapes(self.edges, pred1s, pred2s)
+        self.min_conf_thr = min_conf_thr
+        self.thr_for_init_conf = thr_for_init_conf
+        self.conf_trf = get_conf_trf(conf)
+
+        self.im_conf = _compute_img_conf(
+            self.imshapes, self.device, self.edges, self.edge2conf_i, self.edge2conf_j
+        )
+        for i in range(len(self.im_conf)):
+            self.im_conf[i].requires_grad = False
+
+        self.init_conf_maps = [c.clone() for c in self.im_conf]
+
+        self.base_scale = base_scale
+        self.norm_pw_scale = True
+        self.pw_break = pw_break
+        self.POSE_DIM = 7
+        self.pw_poses = nn.Parameter(
+            rand_pose((self.n_edges, 1 + self.POSE_DIM))
+        )  # pairwise poses
+        self.pw_adaptors = nn.Parameter(
+            torch.zeros((self.n_edges, 2))
+        )  # slight xy/z adaptation
+        self.pw_adaptors.requires_grad_(allow_pw_adaptors)
+        self.has_im_poses = False
+        self.rand_pose = rand_pose
+
+    def get_known_poses(self):
+        if self.has_im_poses:
+            known_poses_msk = torch.tensor(
+                [not (p.requires_grad) for p in self.im_poses]
+            )
+            known_poses = self.get_im_poses()
+            return known_poses_msk.sum(), known_poses_msk, known_poses
+        else:
+            return 0, None, None
+
+    def get_pw_norm_scale_factor(self):
+        if self.norm_pw_scale:
+            # normalize scales so that things cannot go south
+            # we want that exp(scale) ~= self.base_scale
+            return (np.log(self.base_scale) - self.pw_poses[:, -1].mean()).exp()
+        else:
+            return 1  # don't norm scale for known poses
+
+    def _set_pose(self, poses, idx, R, T=None, scale=None, force=False):
+        # all poses == cam-to-world
+        pose = poses[idx]
+        if not (pose.requires_grad or force):
+            return pose
+
+        if R.shape == (4, 4):
+            assert T is None
+            T = R[:3, 3]
+            R = R[:3, :3]
+
+        if R is not None:
+            pose.data[0:4] = roma.rotmat_to_unitquat(R)
+        if T is not None:
+            pose.data[4:7] = signed_log1p(
+                T / (scale or 1)
+            )  # translation is function of scale
+
+        if scale is not None:
+            assert poses.shape[-1] in (8, 13)
+            pose.data[-1] = np.log(float(scale))
+        return pose
+
+    def forward(self, ret_details=False):
+        pw_poses = self.get_pw_poses()  # cam-to-world
+        pw_adapt = self.get_adaptors()
+        proj_pts3d = self.get_pts3d()
+        # pre-compute pixel weights
+        weight_i = {i_j: self.conf_trf(c) for i_j, c in self.conf_i.items()}
+        weight_j = {i_j: self.conf_trf(c) for i_j, c in self.conf_j.items()}
+
+        loss = 0
+        if ret_details:
+            details = -torch.ones((self.n_imgs, self.n_imgs))
+
+        for e, (i, j) in enumerate(self.edges):
+            i_j = edge_str(i, j)
+            # distance in image i and j
+            aligned_pred_i = geotrf(pw_poses[e], pw_adapt[e] * self.pred_i[i_j])
+            aligned_pred_j = geotrf(pw_poses[e], pw_adapt[e] * self.pred_j[i_j])
+            li = self.dist(proj_pts3d[i], aligned_pred_i, weight=weight_i[i_j]).mean()
+            lj = self.dist(proj_pts3d[j], aligned_pred_j, weight=weight_j[i_j]).mean()
+            loss = loss + li + lj
+
+            if ret_details:
+                details[i, j] = li + lj
+        loss /= self.n_edges  # average over all pairs
+
+        if ret_details:
+            return loss, details
+        return loss
+
+    @torch.cuda.amp.autocast(enabled=False)
+    def compute_global_alignment(self, init=None, niter_PnP=10, **kw):
+        if init is None:
+            pass
+        elif init == "msp" or init == "mst":
+            init_fun.init_minimum_spanning_tree(self, niter_PnP=niter_PnP)
+        elif init == "known_poses":
+            raise NotImplementedError
+            self.preset_pose(known_poses=self.camera_poses, requires_grad=True)
+            init_fun.init_from_known_poses(
+                self, min_conf_thr=self.min_conf_thr, niter_PnP=niter_PnP
+            )
+        else:
+            raise ValueError(f"bad value for {init=}")
+
+        return global_alignment_loop(self, **kw)
+
+    @property
+    def str_edges(self):
+        return [edge_str(i, j) for i, j in self.edges]
+
+    @property
+    def n_edges(self):
+        return len(self.edges)
+
+
+def global_alignment_loop(
+    net,
+    lr=0.01,
+    niter=300,
+    schedule="cosine",
+    lr_min=1e-3,
+    temporal_smoothing_weight=0,
+    depth_map_save_dir=None,
+):
+    params = [p for p in net.parameters() if p.requires_grad]
+    if not params:
+        return net
+
+    verbose = net.verbose
+    if verbose:
+        print("Global alignement - optimizing for:")
+        print([name for name, value in net.named_parameters() if value.requires_grad])
+
+    lr_base = lr
+    optimizer = torch.optim.Adam(params, lr=lr, betas=(0.9, 0.9))
+
+    loss = float("inf")
+    if verbose:
+        with tqdm.tqdm(total=niter) as bar:
+            while bar.n < bar.total:
+                if bar.n % 500 == 0 and depth_map_save_dir is not None:
+                    if not os.path.exists(depth_map_save_dir):
+                        os.makedirs(depth_map_save_dir)
+                    # visualize the depthmaps
+                    depth_maps = net.get_depthmaps()
+                    for i, depth_map in enumerate(depth_maps):
+                        depth_map_save_path = os.path.join(
+                            depth_map_save_dir, f"depthmaps_{i}_iter_{bar.n}.png"
+                        )
+                        plt.imsave(
+                            depth_map_save_path,
+                            depth_map.detach().cpu().numpy(),
+                            cmap="jet",
+                        )
+                    print(
+                        f"Saved depthmaps at iteration {bar.n} to {depth_map_save_dir}"
+                    )
+                loss, lr = global_alignment_iter(
+                    net,
+                    bar.n,
+                    niter,
+                    lr_base,
+                    lr_min,
+                    optimizer,
+                    schedule,
+                    temporal_smoothing_weight=temporal_smoothing_weight,
+                )
+                bar.set_postfix_str(f"{lr=:g} loss={loss:g}")
+                bar.update()
+    else:
+        for n in range(niter):
+            loss, _ = global_alignment_iter(
+                net,
+                n,
+                niter,
+                lr_base,
+                lr_min,
+                optimizer,
+                schedule,
+                temporal_smoothing_weight=temporal_smoothing_weight,
+            )
+    return loss
+
+
+def global_alignment_iter(
+    net,
+    cur_iter,
+    niter,
+    lr_base,
+    lr_min,
+    optimizer,
+    schedule,
+    temporal_smoothing_weight=0,
+):
+    t = cur_iter / niter
+    if schedule == "cosine":
+        lr = cosine_schedule(t, lr_base, lr_min)
+    elif schedule == "linear":
+        lr = linear_schedule(t, lr_base, lr_min)
+    elif schedule.startswith("cycle"):
+        try:
+            num_cycles = int(schedule[5:])
+        except ValueError:
+            num_cycles = 2
+        lr = cycled_linear_schedule(t, lr_base, lr_min, num_cycles=num_cycles)
+    else:
+        raise ValueError(f"bad lr {schedule=}")
+
+    adjust_learning_rate_by_lr(optimizer, lr)
+    optimizer.zero_grad()
+
+    if net.empty_cache:
+        torch.cuda.empty_cache()
+
+    loss = net(epoch=cur_iter)
+
+    if net.empty_cache:
+        torch.cuda.empty_cache()
+
+    loss.backward()
+
+    if net.empty_cache:
+        torch.cuda.empty_cache()
+
+    optimizer.step()
+
+    return float(loss), lr
diff --git a/extern/CUT3R/cloud_opt/commons.py b/extern/CUT3R/cloud_opt/commons.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c1fb51aab60eea36b059f191efa1c0c048c14b7
--- /dev/null
+++ b/extern/CUT3R/cloud_opt/commons.py
@@ -0,0 +1,102 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# utility functions for global alignment
+# --------------------------------------------------------
+import torch
+import torch.nn as nn
+import numpy as np
+
+
+def edge_str(i, j):
+    """Return the string key ``"<i>_<j>"`` that identifies the edge (i, j)."""
+    return "_".join((f"{i}", f"{j}"))
+
+
+def i_j_ij(ij):
+    """Pair an edge with its string key: returns ``(edge_str(*ij), ij)``."""
+    key = edge_str(*ij)
+    return key, ij
+
+
+def edge_conf(conf_i, conf_j, edge):
+    """Score ``edge`` as the product of its two mean confidences, as a float."""
+    mean_i = conf_i[edge].mean()
+    mean_j = conf_j[edge].mean()
+    return float(mean_i * mean_j)
+
+
+def compute_edge_scores(edges, conf_i, conf_j):
+    """Map every edge ``(i, j)`` to its ``edge_conf`` score.
+
+    ``edges`` must yield ``(key, (i, j))`` pairs, where ``key`` is the entry
+    looked up in ``conf_i`` and ``conf_j``.
+    """
+    scores = {}
+    for key, (i, j) in edges:
+        scores[(i, j)] = edge_conf(conf_i, conf_j, key)
+    return scores
+
+
+def NoGradParamDict(x):
+    """Wrap the dict ``x`` in an ``nn.ParameterDict`` with gradients disabled."""
+    assert isinstance(x, dict)
+    frozen = nn.ParameterDict(x)
+    # requires_grad_ returns the module itself
+    return frozen.requires_grad_(False)
+
+
+def get_imshapes(edges, pred_i, pred_j):
+    """Collect one shape per image from the per-edge predictions.
+
+    For edge number ``e`` = (i, j), the first two dimensions of ``pred_i[e]``
+    give the shape of image i and those of ``pred_j[e]`` the shape of image j.
+    A shape already recorded for an image must agree with the new one.
+
+    Returns:
+        A list indexed by image id; images that appear in no edge stay ``None``.
+    """
+    n_imgs = 1 + max(max(pair) for pair in edges)
+    imshapes = [None] * n_imgs
+    for e, (i, j) in enumerate(edges):
+        shape_i, shape_j = (tuple(pred[e].shape[:2]) for pred in (pred_i, pred_j))
+        known_i, known_j = imshapes[i], imshapes[j]
+        if known_i:
+            assert known_i == shape_i, f"incorrect shape for image {i}"
+        if known_j:
+            assert known_j == shape_j, f"incorrect shape for image {j}"
+        imshapes[i], imshapes[j] = shape_i, shape_j
+    return imshapes
+
+
+def get_conf_trf(mode):
+    """Return the confidence transform selected by ``mode``.
+
+    "log" -> ``x.log()``, "sqrt" -> ``x.sqrt()``, "m1" -> ``x - 1``,
+    "id" or "none" -> ``x`` unchanged.
+
+    Raises:
+        ValueError: for any other ``mode``.
+    """
+    if mode == "log":
+        return lambda x: x.log()
+    if mode == "sqrt":
+        return lambda x: x.sqrt()
+    if mode == "m1":
+        return lambda x: x - 1
+    if mode in ("id", "none"):
+        return lambda x: x
+    raise ValueError(f"bad mode for {mode=}")
+
+
+def l2_dist(a, b, weight):
+    """Weighted squared distance.
+
+    The squared differences of ``a`` and ``b`` are summed over the last
+    dimension and the result is multiplied by ``weight``.
+    """
+    residual = a - b
+    squared = residual.square().sum(dim=-1)
+    return squared * weight
+
+
+def l1_dist(a, b, weight):
+    """Weighted norm of ``a - b`` taken over the last dimension.
+
+    ``.norm()`` is called with its default order, so no square is applied to
+    the per-point distance (unlike ``l2_dist``).
+    """
+    residual = a - b
+    length = residual.norm(dim=-1)
+    return length * weight
+
+
+# distance functions selectable by name
+ALL_DISTS = {"l1": l1_dist, "l2": l2_dist}
+
+
+def signed_log1p(x):
+    """Return ``sign(x) * log1p(|x|)``, computed element-wise with torch."""
+    magnitude = torch.log1p(torch.abs(x))
+    return torch.sign(x) * magnitude
+
+
+def signed_expm1(x):
+    """Return ``sign(x) * expm1(|x|)``, computed element-wise with torch."""
+    magnitude = torch.expm1(torch.abs(x))
+    return torch.sign(x) * magnitude
+
+
+def cosine_schedule(t, lr_start, lr_end):
+    """Cosine interpolation: ``lr_start`` at t=0, ``lr_end`` at t=1.
+
+    ``t`` must lie in [0, 1] (checked with an assert).
+    """
+    assert 0 <= t <= 1
+    delta = lr_start - lr_end
+    cos_term = 1 + np.cos(t * np.pi)
+    return lr_end + delta * cos_term / 2
+
+
+def linear_schedule(t, lr_start, lr_end):
+    """Linear interpolation: ``lr_start`` at t=0, ``lr_end`` at t=1.
+
+    ``t`` must lie in [0, 1] (checked with an assert).
+    """
+    assert 0 <= t <= 1
+    span = lr_end - lr_start
+    return lr_start + span * t
diff --git a/extern/CUT3R/cloud_opt/dust3r_opt/__init__.py b/extern/CUT3R/cloud_opt/dust3r_opt/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2b5dfafd9e75c96c58367352d78d35f91d1fc0b
--- /dev/null
+++ b/extern/CUT3R/cloud_opt/dust3r_opt/__init__.py
@@ -0,0 +1,31 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# global alignment optimization wrapper function
+# --------------------------------------------------------
+from enum import Enum
+
+from .optimizer import PointCloudOptimizer
+
+
+class GlobalAlignerMode(Enum):
+    """Names of the global-alignment modes a caller can request.
+
+    Only ``PointCloudOptimizer`` is handled by ``global_aligner`` in this
+    module; the other two members make it raise ``NotImplementedError``.
+    """
+
+    PointCloudOptimizer = "PointCloudOptimizer"
+    ModularPointCloudOptimizer = "ModularPointCloudOptimizer"
+    PairViewer = "PairViewer"
+
+
+def global_aligner(
+    dust3r_output, device, mode=GlobalAlignerMode.PointCloudOptimizer, **optim_kw
+):
+    """Build the global-alignment optimizer for a set of pairwise predictions.
+
+    Args:
+        dust3r_output: mapping with the entries "view1", "view2", "pred1"
+            and "pred2".
+        device: target passed to ``.to(...)`` on the created optimizer.
+        mode: requested ``GlobalAlignerMode``; only ``PointCloudOptimizer``
+            is supported.
+        **optim_kw: forwarded to the optimizer constructor.
+
+    Raises:
+        NotImplementedError: for any other ``mode``.
+    """
+    # extract all inputs
+    view1 = dust3r_output["view1"]
+    view2 = dust3r_output["view2"]
+    pred1 = dust3r_output["pred1"]
+    pred2 = dust3r_output["pred2"]
+
+    if mode != GlobalAlignerMode.PointCloudOptimizer:
+        raise NotImplementedError(f"Unknown mode {mode}")
+
+    # build the optimizer
+    optimizer = PointCloudOptimizer(view1, view2, pred1, pred2, **optim_kw)
+    return optimizer.to(device)
diff --git a/extern/CUT3R/cloud_opt/dust3r_opt/base_opt.py b/extern/CUT3R/cloud_opt/dust3r_opt/base_opt.py
new file mode 100644
index 0000000000000000000000000000000000000000..1395a87c1bc7c456ce158a34705181af6ccbdc46
--- /dev/null
+++ b/extern/CUT3R/cloud_opt/dust3r_opt/base_opt.py
@@ -0,0 +1,620 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# Base class for the global alignment procedure
+# --------------------------------------------------------
+from copy import deepcopy
+
+import numpy as np
+import torch
+import torch.nn as nn
+import roma
+from copy import deepcopy
+import tqdm
+import cv2
+from PIL import Image
+from dust3r.utils.geometry import inv, geotrf
+from dust3r.utils.device import to_numpy
+from dust3r.utils.image import rgb
+from dust3r.viz import SceneViz, segment_sky, auto_cam_size
+
+from cloud_opt.dust3r_opt.commons import (
+    edge_str,
+    ALL_DISTS,
+    NoGradParamDict,
+    get_imshapes,
+    signed_expm1,
+    signed_log1p,
+    cosine_schedule,
+    linear_schedule,
+    get_conf_trf,
+)
+import cloud_opt.dust3r_opt.init_im_poses as init_fun
+from pathlib import Path
+from scipy.spatial.transform import Rotation
+from evo.core.trajectory import PosePath3D, PoseTrajectory3D
+
+
+def adjust_learning_rate_by_lr(optimizer, lr):
+    """Write ``lr`` into every parameter group of ``optimizer``.
+
+    Groups that carry an "lr_scale" entry receive ``lr * lr_scale``; all
+    other groups receive ``lr`` itself.
+    """
+    for group in optimizer.param_groups:
+        group["lr"] = lr * group["lr_scale"] if "lr_scale" in group else lr
+
+
+def make_traj(args) -> PoseTrajectory3D:
+    """Return a ``PoseTrajectory3D`` built from, or copied from, ``args``.
+
+    A tuple or list is unpacked as ``(traj, tstamps)``: columns 0-2 of
+    ``traj`` are passed as positions, the remaining columns as wxyz
+    quaternions, and ``tstamps`` as timestamps. Any other argument must
+    already be a ``PoseTrajectory3D`` and is returned as a deep copy.
+    """
+    if not isinstance(args, (tuple, list)):
+        assert isinstance(args, PoseTrajectory3D), type(args)
+        return deepcopy(args)
+
+    poses, stamps = args
+    return PoseTrajectory3D(
+        positions_xyz=poses[:, :3],
+        orientations_quat_wxyz=poses[:, 3:],
+        timestamps=stamps,
+    )
+
+
+def save_trajectory_tum_format(traj, filename):
+    """Write a trajectory to ``filename``, one pose per line.
+
+    Each line holds the timestamp, the position and the quaternion, separated
+    by spaces; the quaternion components are written in the order in which
+    ``orientations_quat_wxyz`` stores them. A confirmation is printed once
+    the file is written.
+    """
+    traj = make_traj(traj)
+
+    def join_values(values):
+        return " ".join(map(str, values))
+
+    with Path(filename).open("w") as f:
+        for i in range(traj.num_poses):
+            stamp = traj.timestamps[i]
+            position = join_values(traj.positions_xyz[i])
+            quat = join_values(traj.orientations_quat_wxyz[i][[0, 1, 2, 3]])
+            f.write(f"{stamp} {position} {quat}\n")
+    print(f"Saved trajectory to {filename}")
+
+
+def c2w_to_tumpose(c2w):
+    """
+    Convert a camera-to-world matrix into a flat pose vector.
+
+    input: c2w: 4x4 matrix
+    output: array of translation followed by rotation (x y z qw qx qy qz)
+    """
+    pose = to_numpy(c2w)
+    translation = pose[:3, -1]
+    # as_quat() is unpacked as (qx, qy, qz, qw); the output puts qw first
+    qx, qy, qz, qw = Rotation.from_matrix(pose[:3, :3]).as_quat()
+    return np.concatenate([translation, [qw, qx, qy, qz]])
+
+
+class BasePCOptimizer(nn.Module):
+    """Optimize a global scene, given a list of pairwise observations.
+    Graph node: images
+    Graph edges: observations = (pred1, pred2)
+
+    Depth maps, focals and per-image poses are not implemented here: the
+    corresponding accessors raise ``NotImplementedError`` and have to be
+    provided by subclasses.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """Either copy the state of another object or initialise from views.
+
+        A single positional argument (and no keyword) selects the copy path;
+        any other call is forwarded to ``_init_from_views``.
+        """
+        if len(args) == 1 and len(kwargs) == 0:
+            # NOTE(review): the copy is indexed with ``other[k]``, so the
+            # argument has to support item access by attribute name --
+            # confirm what callers actually pass on this path.
+            other = deepcopy(args[0])
+            attrs = """edges is_symmetrized dist n_imgs pred_i pred_j imshapes
+                        min_conf_thr conf_thr conf_i conf_j im_conf
+                        base_scale norm_pw_scale POSE_DIM pw_poses
+                        pw_adaptors pw_adaptors has_im_poses rand_pose imgs verbose""".split()
+            self.__dict__.update({k: other[k] for k in attrs})
+        else:
+            self._init_from_views(*args, **kwargs)
+
+    def _init_from_views(
+        self,
+        view1,
+        view2,
+        pred1,
+        pred2,
+        dist="l1",
+        conf="log",
+        min_conf_thr=3,
+        base_scale=0.5,
+        allow_pw_adaptors=False,
+        pw_break=20,
+        rand_pose=torch.randn,
+        iterationsCount=None,
+        verbose=True,
+    ):
+        """Build the optimizer state from the pairwise network output.
+
+        Args:
+            view1, view2: dicts with an "idx" entry (one image index per edge;
+                converted to a list in place when it is not one already) and
+                optionally an "img" entry.
+            pred1: dict providing "pts3d_in_self_view" and "conf_self".
+            pred2: dict providing "pts3d_in_other_view" and "conf".
+            dist: key into ``ALL_DISTS``.
+            conf: mode passed to ``get_conf_trf``.
+            min_conf_thr: threshold used by ``get_masks``.
+            base_scale: target scale used by ``get_pw_norm_scale_factor``.
+            allow_pw_adaptors: whether ``pw_adaptors`` requires grad.
+            pw_break: divisor applied in ``get_adaptors``.
+            rand_pose: callable used to initialise ``pw_poses``.
+            iterationsCount: not used in this method.
+            verbose: stored as ``self.verbose``.
+        """
+        super().__init__()
+        if not isinstance(view1["idx"], list):
+            view1["idx"] = view1["idx"].tolist()
+        if not isinstance(view2["idx"], list):
+            view2["idx"] = view2["idx"].tolist()
+        self.edges = [(int(i), int(j)) for i, j in zip(view1["idx"], view2["idx"])]
+        self.is_symmetrized = set(self.edges) == {(j, i) for i, j in self.edges}
+        self.dist = ALL_DISTS[dist]
+        self.verbose = verbose
+
+        self.n_imgs = self._check_edges()
+
+        # input data
+        pred1_pts = pred1["pts3d_in_self_view"]
+        pred2_pts = pred2["pts3d_in_other_view"]
+        self.pred_i = NoGradParamDict(
+            {ij: pred1_pts[n] for n, ij in enumerate(self.str_edges)}
+        )
+        self.pred_j = NoGradParamDict(
+            {ij: pred2_pts[n] for n, ij in enumerate(self.str_edges)}
+        )
+        self.imshapes = get_imshapes(self.edges, pred1_pts, pred2_pts)
+
+        # work in log-scale with conf
+        pred1_conf = pred1["conf_self"]
+        pred2_conf = pred2["conf"]
+        self.min_conf_thr = min_conf_thr
+        self.conf_trf = get_conf_trf(conf)
+
+        self.conf_i = NoGradParamDict(
+            {ij: pred1_conf[n] for n, ij in enumerate(self.str_edges)}
+        )
+        self.conf_j = NoGradParamDict(
+            {ij: pred2_conf[n] for n, ij in enumerate(self.str_edges)}
+        )
+        self.im_conf = self._compute_img_conf(pred1_conf, pred2_conf)
+        for conf_map in self.im_conf:
+            conf_map.requires_grad = False
+
+        # pairwise pose parameters
+        self.base_scale = base_scale
+        self.norm_pw_scale = True
+        self.pw_break = pw_break
+        self.POSE_DIM = 7
+        self.pw_poses = nn.Parameter(
+            rand_pose((self.n_edges, 1 + self.POSE_DIM))
+        )  # pairwise poses
+        self.pw_adaptors = nn.Parameter(
+            torch.zeros((self.n_edges, 2))
+        )  # slight xy/z adaptation
+        self.pw_adaptors.requires_grad_(allow_pw_adaptors)
+        self.has_im_poses = False
+        self.rand_pose = rand_pose
+
+        # possibly store images for show_pointcloud
+        self.imgs = None
+        if "img" in view1 and "img" in view2:
+            imgs = [torch.zeros((3,) + hw) for hw in self.imshapes]
+            for v in range(len(self.edges)):
+                idx = view1["idx"][v]
+                imgs[idx] = view1["img"][v]
+                idx = view2["idx"][v]
+                imgs[idx] = view2["img"][v]
+            self.imgs = rgb(imgs)
+
+    @property
+    def n_edges(self):
+        """Number of edges (pairwise observations)."""
+        return len(self.edges)
+
+    @property
+    def str_edges(self):
+        """String keys of all edges, in edge order."""
+        return [edge_str(i, j) for i, j in self.edges]
+
+    @property
+    def imsizes(self):
+        """Image sizes as (w, h), i.e. ``imshapes`` with both entries swapped."""
+        return [(w, h) for h, w in self.imshapes]
+
+    @property
+    def device(self):
+        """Device of the first parameter of the module."""
+        return next(iter(self.parameters())).device
+
+    def state_dict(self, trainable=True):
+        """Return the trainable (default) or the non-trainable part of the state.
+
+        Entries whose key starts with "_", "pred_i.", "pred_j.", "conf_i." or
+        "conf_j." count as non-trainable.
+        """
+        all_params = super().state_dict()
+        return {
+            k: v
+            for k, v in all_params.items()
+            if k.startswith(("_", "pred_i.", "pred_j.", "conf_i.", "conf_j."))
+            != trainable
+        }
+
+    def load_state_dict(self, data):
+        """Load ``data`` merged on top of the current non-trainable entries."""
+        return super().load_state_dict(self.state_dict(trainable=False) | data)
+
+    def _check_edges(self):
+        """Check that the edge indices are exactly 0..n-1 and return n."""
+        indices = sorted({i for edge in self.edges for i in edge})
+        assert indices == list(range(len(indices))), "bad pair indices: missing values "
+        return len(indices)
+
+    @torch.no_grad()
+    def _compute_img_conf(self, pred1_conf, pred2_conf):
+        """Per image, element-wise maximum of its confidences over all edges."""
+        im_conf = nn.ParameterList(
+            [torch.zeros(hw, device=self.device) for hw in self.imshapes]
+        )
+        for e, (i, j) in enumerate(self.edges):
+            im_conf[i] = torch.maximum(im_conf[i], pred1_conf[e])
+            im_conf[j] = torch.maximum(im_conf[j], pred2_conf[e])
+        return im_conf
+
+    def get_adaptors(self):
+        """Return the per-edge (scale_xy, scale_xy, scale_z) adaptation factors."""
+        adapt = self.pw_adaptors
+        adapt = torch.cat(
+            (adapt[:, 0:1], adapt), dim=-1
+        )  # (scale_xy, scale_xy, scale_z)
+        if self.norm_pw_scale:  # normalize so that the product == 1
+            adapt = adapt - adapt.mean(dim=1, keepdim=True)
+        return (adapt / self.pw_break).exp()
+
+    def _get_poses(self, poses):
+        """Turn raw pose rows into homogeneous rigid transforms.
+
+        Columns 0-3 are read as a quaternion (normalized by roma) and columns
+        4-6 as a translation encoded with ``signed_log1p``.
+        """
+        # normalize rotation
+        Q = poses[:, :4]
+        T = signed_expm1(poses[:, 4:7])
+        RT = roma.RigidUnitQuat(Q, T).normalize().to_homogeneous()
+        return RT
+
+    def _set_pose(self, poses, idx, R, T=None, scale=None, force=False):
+        """Write a rotation / translation / scale into ``poses[idx]`` in place.
+
+        Nothing is written unless the pose row requires grad or ``force`` is
+        set. ``R`` may be a 4x4 matrix (then ``T`` must be ``None`` and is
+        taken from the matrix), a rotation matrix, or ``None`` to leave the
+        rotation untouched.
+
+        Returns:
+            The pose row ``poses[idx]``.
+        """
+        # all poses == cam-to-world
+        pose = poses[idx]
+        if not (pose.requires_grad or force):
+            return pose
+
+        # R may legitimately be None (see the check below), so guard .shape
+        if R is not None and R.shape == (4, 4):
+            assert T is None
+            T = R[:3, 3]
+            R = R[:3, :3]
+
+        if R is not None:
+            pose.data[0:4] = roma.rotmat_to_unitquat(R)
+        if T is not None:
+            pose.data[4:7] = signed_log1p(
+                T / (scale or 1)
+            )  # translation is function of scale
+
+        if scale is not None:
+            assert poses.shape[-1] in (8, 13)
+            pose.data[-1] = np.log(float(scale))
+        return pose
+
+    def get_pw_norm_scale_factor(self):
+        """Factor that brings the mean pairwise log-scale to log(base_scale)."""
+        if self.norm_pw_scale:
+            # normalize scales so that things cannot go south
+            # we want that exp(scale) ~= self.base_scale
+            return (np.log(self.base_scale) - self.pw_poses[:, -1].mean()).exp()
+        else:
+            return 1  # don't norm scale for known poses
+
+    def get_pw_scale(self):
+        """Per-edge scale: exp of the last pose column times the norm factor."""
+        scale = self.pw_poses[:, -1].exp()  # (n_edges,)
+        scale = scale * self.get_pw_norm_scale_factor()
+        return scale
+
+    def get_pw_poses(self):  # cam to world
+        """Pairwise poses with their first three rows multiplied by the scale."""
+        RT = self._get_poses(self.pw_poses)
+        scaled_RT = RT.clone()
+        scaled_RT[:, :3] *= self.get_pw_scale().view(
+            -1, 1, 1
+        )  # scale the rotation AND translation
+        return scaled_RT
+
+    def get_masks(self):
+        """Per image, the boolean mask ``im_conf > min_conf_thr``."""
+        return [(conf > self.min_conf_thr) for conf in self.im_conf]
+
+    def depth_to_pts3d(self):
+        """To be implemented by subclasses."""
+        raise NotImplementedError()
+
+    def get_pts3d(self, raw=False):
+        """Return ``depth_to_pts3d()``, reshaped to (h, w, 3) per image unless ``raw``."""
+        res = self.depth_to_pts3d()
+        if not raw:
+            res = [dm[: h * w].view(h, w, 3) for dm, (h, w) in zip(res, self.imshapes)]
+        return res
+
+    def _set_focal(self, idx, focal, force=False):
+        """To be implemented by subclasses."""
+        raise NotImplementedError()
+
+    def get_focals(self):
+        """To be implemented by subclasses."""
+        raise NotImplementedError()
+
+    def get_known_focal_mask(self):
+        """To be implemented by subclasses."""
+        raise NotImplementedError()
+
+    def get_principal_points(self):
+        """To be implemented by subclasses."""
+        raise NotImplementedError()
+
+    def get_conf(self, mode=None):
+        """Per-image confidences after the stored (or the given ``mode``) transform."""
+        trf = self.conf_trf if mode is None else get_conf_trf(mode)
+        return [trf(c) for c in self.im_conf]
+
+    def get_im_poses(self):
+        """To be implemented by subclasses."""
+        raise NotImplementedError()
+
+    def _set_depthmap(self, idx, depth, force=False):
+        """To be implemented by subclasses."""
+        raise NotImplementedError()
+
+    def get_depthmaps(self, raw=False):
+        """To be implemented by subclasses."""
+        raise NotImplementedError()
+
+    def save_depth_maps(self, path):
+        """Save every depth map under ``path`` and return the depth maps.
+
+        Per frame a JET-colored ``frame_XXXX.png`` and the raw values as
+        ``frame_XXXX.npy`` are written; all frames are also combined into
+        ``_depth_maps.gif`` (skipped when there is no depth map).
+        """
+        depth_maps = self.get_depthmaps()
+        images = []
+
+        for i, depth_map in enumerate(depth_maps):
+            # Apply color map to depth map
+            depth_map_colored = cv2.applyColorMap(
+                (depth_map * 255).detach().cpu().numpy().astype(np.uint8),
+                cv2.COLORMAP_JET,
+            )
+            img_path = f"{path}/frame_{(i):04d}.png"
+            cv2.imwrite(img_path, depth_map_colored)
+            images.append(Image.open(img_path))
+            np.save(f"{path}/frame_{(i):04d}.npy", depth_map.detach().cpu().numpy())
+
+        if images:  # images[0] would fail on an empty list
+            images[0].save(
+                f"{path}/_depth_maps.gif",
+                save_all=True,
+                append_images=images[1:],
+                duration=100,
+                loop=0,
+            )
+
+        return depth_maps
+
+    def clean_pointcloud(self, **kw):
+        """Run the module-level ``clean_pointcloud`` and store the new confidences.
+
+        Relies on ``get_intrinsics``, which is not defined in this class.
+        Returns ``self``.
+        """
+        cams = inv(self.get_im_poses())
+        K = self.get_intrinsics()
+        depthmaps = self.get_depthmaps()
+        all_pts3d = self.get_pts3d()
+
+        new_im_confs = clean_pointcloud(
+            self.im_conf, K, cams, depthmaps, all_pts3d, **kw
+        )
+        for i, new_conf in enumerate(new_im_confs):
+            self.im_conf[i].data[:] = new_conf
+        return self
+
+    def get_tum_poses(self):
+        """Return ``[poses, timestamps]``; timestamps are 0, 1, 2, ... as floats."""
+        poses = self.get_im_poses()
+        tt = np.arange(len(poses)).astype(float)
+        tum_poses = [c2w_to_tumpose(p) for p in poses]
+        tum_poses = np.stack(tum_poses, 0)
+        return [tum_poses, tt]
+
+    def save_tum_poses(self, path):
+        """Write the image poses to ``path`` and return the pose array."""
+        traj = self.get_tum_poses()
+        save_trajectory_tum_format(traj, path)
+        return traj[0]  # return the poses
+
+    def save_focals(self, path):
+        """Write the focals to ``path`` as text and return them."""
+        # convert focal to txt
+        focals = self.get_focals()
+        np.savetxt(path, focals.detach().cpu().numpy(), fmt="%.6f")
+        return focals
+
+    def save_intrinsics(self, path):
+        """Write the intrinsics, 9 values per row, to ``path`` and return them.
+
+        Relies on ``get_intrinsics``, which is not defined in this class.
+        """
+        K_raw = self.get_intrinsics()
+        K = K_raw.reshape(-1, 9)
+        np.savetxt(path, K.detach().cpu().numpy(), fmt="%.6f")
+        return K_raw
+
+    def save_conf_maps(self, path):
+        """Save each confidence map as ``conf_<i>.npy`` under ``path``."""
+        conf = self.get_conf()
+        for i, c in enumerate(conf):
+            np.save(f"{path}/conf_{i}.npy", c.detach().cpu().numpy())
+        return conf
+
+    def save_init_conf_maps(self, path):
+        """Save each initial confidence map as ``init_conf_<i>.npy`` under ``path``.
+
+        Relies on ``get_init_conf``, which is not defined in this class.
+        """
+        conf = self.get_init_conf()
+        for i, c in enumerate(conf):
+            np.save(f"{path}/init_conf_{i}.npy", c.detach().cpu().numpy())
+        return conf
+
+    def save_rgb_imgs(self, path):
+        """Write the stored images as ``frame_XXXX.png`` under ``path``."""
+        imgs = self.imgs
+        for i, img in enumerate(imgs):
+            # convert from rgb to bgr
+            img = img[..., ::-1]
+            cv2.imwrite(f"{path}/frame_{i:04d}.png", img * 255)
+        return imgs
+
+    def save_dynamic_masks(self, path):
+        """Write the dynamic masks as ``dynamic_mask_<i>.png`` under ``path``.
+
+        ``sam2_dynamic_masks`` is used when that attribute exists and is not
+        ``None``, otherwise ``dynamic_masks``; neither is set in this class.
+        """
+        dynamic_masks = (
+            self.dynamic_masks
+            if getattr(self, "sam2_dynamic_masks", None) is None
+            else self.sam2_dynamic_masks
+        )
+        for i, dynamic_mask in enumerate(dynamic_masks):
+            cv2.imwrite(
+                f"{path}/dynamic_mask_{i}.png",
+                (dynamic_mask * 255).detach().cpu().numpy().astype(np.uint8),
+            )
+        return dynamic_masks
+
+    def forward(self, ret_details=False):
+        """Compute the alignment loss, averaged over all edges.
+
+        For every edge both predictions are moved by the pairwise pose (after
+        the per-edge adaptor) and compared with the current points of images
+        i and j, weighted by the transformed confidences.
+
+        Returns:
+            The loss, or ``(loss, details)`` when ``ret_details`` is set, where
+            ``details[i, j]`` holds the loss of edge (i, j) and -1 elsewhere.
+        """
+        pw_poses = self.get_pw_poses()  # cam-to-world
+        pw_adapt = self.get_adaptors()
+        proj_pts3d = self.get_pts3d()
+        # pre-compute pixel weights
+        weight_i = {i_j: self.conf_trf(c) for i_j, c in self.conf_i.items()}
+        weight_j = {i_j: self.conf_trf(c) for i_j, c in self.conf_j.items()}
+
+        loss = 0
+        if ret_details:
+            details = -torch.ones((self.n_imgs, self.n_imgs))
+
+        for e, (i, j) in enumerate(self.edges):
+            i_j = edge_str(i, j)
+            # distance in image i and j
+            aligned_pred_i = geotrf(pw_poses[e], pw_adapt[e] * self.pred_i[i_j])
+            aligned_pred_j = geotrf(pw_poses[e], pw_adapt[e] * self.pred_j[i_j])
+            li = self.dist(proj_pts3d[i], aligned_pred_i, weight=weight_i[i_j]).mean()
+            lj = self.dist(proj_pts3d[j], aligned_pred_j, weight=weight_j[i_j]).mean()
+            loss = loss + li + lj
+
+            if ret_details:
+                details[i, j] = li + lj
+        loss /= self.n_edges  # average over all pairs
+
+        if ret_details:
+            return loss, details
+        return loss
+
+    @torch.cuda.amp.autocast(enabled=False)
+    def compute_global_alignment(self, init=None, niter_PnP=10, **kw):
+        """Optionally initialise the poses, then run ``global_alignment_loop``.
+
+        ``init`` may be ``None`` (no initialisation), "msp"/"mst" or
+        "known_poses"; ``kw`` is forwarded to the loop.
+
+        Raises:
+            ValueError: for any other ``init`` value.
+        """
+        if init is None:
+            pass
+        elif init == "msp" or init == "mst":
+            init_fun.init_minimum_spanning_tree(self, niter_PnP=niter_PnP)
+        elif init == "known_poses":
+            init_fun.init_from_known_poses(
+                self, min_conf_thr=self.min_conf_thr, niter_PnP=niter_PnP
+            )
+        else:
+            raise ValueError(f"bad value for {init=}")
+        return global_alignment_loop(self, **kw)
+
+    @torch.no_grad()
+    def mask_sky(self):
+        """Return a deep copy whose confidence is 0 where ``segment_sky`` fires."""
+        res = deepcopy(self)
+        for i in range(self.n_imgs):
+            sky = segment_sky(self.imgs[i])
+            res.im_conf[i][sky] = 0
+        return res
+
+    def show(self, show_pw_cams=False, show_pw_pts3d=False, cam_size=None, **kw):
+        """Display point clouds and cameras in a ``SceneViz`` and return it.
+
+        ``show_pw_pts3d`` only has an effect together with ``show_pw_cams``;
+        ``kw`` is forwarded to ``viz.show``.
+        """
+        viz = SceneViz()
+        # computed once instead of once per image
+        pts3d = self.get_pts3d()
+        masks = self.get_masks()
+        if self.imgs is None:
+            colors = np.random.randint(0, 256, size=(self.n_imgs, 3))
+            colors = list(map(tuple, colors.tolist()))
+            for n in range(self.n_imgs):
+                viz.add_pointcloud(pts3d[n], colors[n], masks[n])
+        else:
+            viz.add_pointcloud(pts3d, self.imgs, masks)
+            colors = np.random.randint(256, size=(self.n_imgs, 3))
+
+        # camera poses
+        im_poses = to_numpy(self.get_im_poses())
+        if cam_size is None:
+            cam_size = auto_cam_size(im_poses)
+        viz.add_cameras(
+            im_poses,
+            self.get_focals(),
+            colors=colors,
+            images=self.imgs,
+            imsizes=self.imsizes,
+            cam_size=cam_size,
+        )
+        if show_pw_cams:
+            pw_poses = self.get_pw_poses()
+            viz.add_cameras(pw_poses, color=(192, 0, 192), cam_size=cam_size)
+
+            if show_pw_pts3d:
+                pts = [
+                    geotrf(pw_poses[e], self.pred_i[edge_str(i, j)])
+                    for e, (i, j) in enumerate(self.edges)
+                ]
+                viz.add_pointcloud(pts, (128, 0, 128))
+
+        viz.show(**kw)
+        return viz
+
+
+def global_alignment_loop(net, lr=0.01, niter=300, schedule="cosine", lr_min=1e-6):
+    """Optimize the trainable parameters of ``net`` with Adam.
+
+    One ``global_alignment_iter`` call is made per step; a tqdm progress bar
+    showing the current learning rate and loss is used when ``net.verbose``
+    is set.
+
+    Returns:
+        ``net`` itself when it has no parameter that requires grad; otherwise
+        the loss of the last step as a float (``inf`` if no step was run).
+    """
+    trainable = [p for p in net.parameters() if p.requires_grad]
+    if len(trainable) == 0:
+        return net
+
+    verbose = net.verbose
+    if verbose:
+        print("Global alignement - optimizing for:")
+        print([name for name, param in net.named_parameters() if param.requires_grad])
+
+    lr_base = lr
+    optimizer = torch.optim.Adam(trainable, lr=lr, betas=(0.9, 0.9))
+
+    loss = float("inf")
+    if not verbose:
+        for step in range(niter):
+            loss, _ = global_alignment_iter(
+                net, step, niter, lr_base, lr_min, optimizer, schedule
+            )
+        return loss
+
+    with tqdm.tqdm(total=niter) as bar:
+        # the bar's own counter doubles as the iteration index
+        while bar.n < bar.total:
+            loss, lr = global_alignment_iter(
+                net, bar.n, niter, lr_base, lr_min, optimizer, schedule
+            )
+            bar.set_postfix_str(f"{lr=:g} loss={loss:g}")
+            bar.update()
+    return loss
+
+
+def global_alignment_iter(net, cur_iter, niter, lr_base, lr_min, optimizer, schedule):
+    """Run one optimization step of the global alignment.
+
+    The learning rate is taken from the "cosine" or "linear" schedule at
+    progress ``cur_iter / niter`` and written into ``optimizer`` before a
+    single zero_grad / forward / backward / step round.
+
+    Returns:
+        ``(float(loss), lr)`` for this step.
+
+    Raises:
+        ValueError: if ``schedule`` is neither "cosine" nor "linear".
+    """
+    progress = cur_iter / niter
+
+    if schedule == "cosine":
+        schedule_fn = cosine_schedule
+    elif schedule == "linear":
+        schedule_fn = linear_schedule
+    else:
+        raise ValueError(f"bad lr {schedule=}")
+    lr = schedule_fn(progress, lr_base, lr_min)
+
+    adjust_learning_rate_by_lr(optimizer, lr)
+    optimizer.zero_grad()
+    loss = net()
+    loss.backward()
+    optimizer.step()
+    return float(loss), lr
+
+
+@torch.no_grad()
+def clean_pointcloud(
+    im_confs, K, cams, depthmaps, all_pts3d, tol=0.001, bad_conf=0, dbg=()
+):
+    """Method:
+    1) express all 3d points in each camera coordinate frame
+    2) if they're in front of a depthmap --> then lower their confidence
+
+    All five sequences must have the same length (one entry per image) and
+    ``tol`` must lie in [0, 1). The result is a list of cloned confidence maps;
+    the tensors in ``im_confs`` are not written to. ``dbg`` is not used.
+    """
+    assert len(im_confs) == len(cams) == len(K) == len(depthmaps) == len(all_pts3d)
+    assert 0 <= tol < 1
+    # work on copies so that the input confidences stay untouched
+    res = [c.clone() for c in im_confs]
+
+    # reshape appropriately
+    all_pts3d = [p.view(*c.shape, 3) for p, c in zip(all_pts3d, im_confs)]
+    depthmaps = [d.view(*c.shape) for d, c in zip(depthmaps, im_confs)]
+
+    # compare the points of every image i against every other image j
+    for i, pts3d in enumerate(all_pts3d):
+        for j in range(len(all_pts3d)):
+            if i == j:
+                continue
+
+            # project 3dpts in other view
+            proj = geotrf(cams[j], pts3d)
+            proj_depth = proj[:, :, 2]
+            u, v = geotrf(K[j], proj, norm=1, ncol=2).round().long().unbind(-1)
+
+            # check which points are actually in the visible cone
+            H, W = im_confs[j].shape
+            msk_i = (proj_depth > 0) & (0 <= u) & (u < W) & (0 <= v) & (v < H)
+            # (row, col) positions in image j hit by the visible points of image i
+            msk_j = v[msk_i], u[msk_i]
+
+            # find bad points = those in front but less confident
+            # (the comparison uses the current, possibly already lowered, values in res)
+            bad_points = (proj_depth[msk_i] < (1 - tol) * depthmaps[j][msk_j]) & (
+                res[i][msk_i] < res[j][msk_j]
+            )
+
+            # scatter the per-visible-point flags back into a full-image mask
+            bad_msk_i = msk_i.clone()
+            bad_msk_i[msk_i] = bad_points
+            # cap the confidence of the flagged points at bad_conf
+            res[i][bad_msk_i] = res[i][bad_msk_i].clip_(max=bad_conf)
+
+    return res
diff --git a/extern/CUT3R/cloud_opt/dust3r_opt/commons.py b/extern/CUT3R/cloud_opt/dust3r_opt/commons.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c1fb51aab60eea36b059f191efa1c0c048c14b7
--- /dev/null
+++ b/extern/CUT3R/cloud_opt/dust3r_opt/commons.py
@@ -0,0 +1,102 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# utility functions for global alignment
+# --------------------------------------------------------
+import torch
+import torch.nn as nn
+import numpy as np
+
+
+def edge_str(i, j):
+    """Return the string key ``"<i>_<j>"`` that identifies the edge (i, j)."""
+    return "_".join((f"{i}", f"{j}"))
+
+
+def i_j_ij(ij):
+    """Pair an edge with its string key: returns ``(edge_str(*ij), ij)``."""
+    key = edge_str(*ij)
+    return key, ij
+
+
+def edge_conf(conf_i, conf_j, edge):
+    """Score ``edge`` as the product of its two mean confidences, as a float."""
+    mean_i = conf_i[edge].mean()
+    mean_j = conf_j[edge].mean()
+    return float(mean_i * mean_j)
+
+
+def compute_edge_scores(edges, conf_i, conf_j):
+    """Map every edge ``(i, j)`` to its ``edge_conf`` score.
+
+    ``edges`` must yield ``(key, (i, j))`` pairs, where ``key`` is the entry
+    looked up in ``conf_i`` and ``conf_j``.
+    """
+    scores = {}
+    for key, (i, j) in edges:
+        scores[(i, j)] = edge_conf(conf_i, conf_j, key)
+    return scores
+
+
+def NoGradParamDict(x):
+    """Wrap the dict ``x`` in an ``nn.ParameterDict`` with gradients disabled."""
+    assert isinstance(x, dict)
+    frozen = nn.ParameterDict(x)
+    # requires_grad_ returns the module itself
+    return frozen.requires_grad_(False)
+
+
+def get_imshapes(edges, pred_i, pred_j):
+    """Collect one shape per image from the per-edge predictions.
+
+    For edge number ``e`` = (i, j), the first two dimensions of ``pred_i[e]``
+    give the shape of image i and those of ``pred_j[e]`` the shape of image j.
+    A shape already recorded for an image must agree with the new one.
+
+    Returns:
+        A list indexed by image id; images that appear in no edge stay ``None``.
+    """
+    n_imgs = 1 + max(max(pair) for pair in edges)
+    imshapes = [None] * n_imgs
+    for e, (i, j) in enumerate(edges):
+        shape_i, shape_j = (tuple(pred[e].shape[:2]) for pred in (pred_i, pred_j))
+        known_i, known_j = imshapes[i], imshapes[j]
+        if known_i:
+            assert known_i == shape_i, f"incorrect shape for image {i}"
+        if known_j:
+            assert known_j == shape_j, f"incorrect shape for image {j}"
+        imshapes[i], imshapes[j] = shape_i, shape_j
+    return imshapes
+
+
+def get_conf_trf(mode):
+    """Return the confidence transform selected by ``mode``.
+
+    "log" -> ``x.log()``, "sqrt" -> ``x.sqrt()``, "m1" -> ``x - 1``,
+    "id" or "none" -> ``x`` unchanged.
+
+    Raises:
+        ValueError: for any other ``mode``.
+    """
+    if mode == "log":
+        return lambda x: x.log()
+    if mode == "sqrt":
+        return lambda x: x.sqrt()
+    if mode == "m1":
+        return lambda x: x - 1
+    if mode in ("id", "none"):
+        return lambda x: x
+    raise ValueError(f"bad mode for {mode=}")
+
+
+def l2_dist(a, b, weight):
+    """Weighted squared distance.
+
+    The squared differences of ``a`` and ``b`` are summed over the last
+    dimension and the result is multiplied by ``weight``.
+    """
+    residual = a - b
+    squared = residual.square().sum(dim=-1)
+    return squared * weight
+
+
+def l1_dist(a, b, weight):
+    """Weighted norm of ``a - b`` taken over the last dimension.
+
+    ``.norm()`` is called with its default order, so no square is applied to
+    the per-point distance (unlike ``l2_dist``).
+    """
+    residual = a - b
+    length = residual.norm(dim=-1)
+    return length * weight
+
+
+# distance functions selectable by name
+ALL_DISTS = {"l1": l1_dist, "l2": l2_dist}
+
+
+def signed_log1p(x):
+    """Return ``sign(x) * log1p(|x|)``, computed element-wise with torch."""
+    magnitude = torch.log1p(torch.abs(x))
+    return torch.sign(x) * magnitude
+
+
+def signed_expm1(x):
+    """Return ``sign(x) * expm1(|x|)``, computed element-wise with torch."""
+    magnitude = torch.expm1(torch.abs(x))
+    return torch.sign(x) * magnitude
+
+
+def cosine_schedule(t, lr_start, lr_end):
+    """Cosine interpolation: ``lr_start`` at t=0, ``lr_end`` at t=1.
+
+    ``t`` must lie in [0, 1] (checked with an assert).
+    """
+    assert 0 <= t <= 1
+    delta = lr_start - lr_end
+    cos_term = 1 + np.cos(t * np.pi)
+    return lr_end + delta * cos_term / 2
+
+
+def linear_schedule(t, lr_start, lr_end):
+    """Linear interpolation: ``lr_start`` at t=0, ``lr_end`` at t=1.
+
+    ``t`` must lie in [0, 1] (checked with an assert).
+    """
+    assert 0 <= t <= 1
+    span = lr_end - lr_start
+    return lr_start + span * t
diff --git a/extern/CUT3R/cloud_opt/dust3r_opt/init_im_poses.py b/extern/CUT3R/cloud_opt/dust3r_opt/init_im_poses.py
new file mode 100644
index 0000000000000000000000000000000000000000..41ce872a8abd8105c54040d21272517f315e0962
--- /dev/null
+++ b/extern/CUT3R/cloud_opt/dust3r_opt/init_im_poses.py
@@ -0,0 +1,382 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# Initialization functions for global alignment
+# --------------------------------------------------------
+from functools import cache
+
+import numpy as np
+import scipy.sparse as sp
+import torch
+import cv2
+import roma
+from tqdm import tqdm
+
+from dust3r.utils.geometry import geotrf, inv, get_med_dist_between_poses
+from dust3r.post_process import estimate_focal_knowing_depth
+from dust3r.viz import to_numpy
+
+from cloud_opt.commons import edge_str, i_j_ij, compute_edge_scores
+
+
+@torch.no_grad()
+def init_from_known_poses(self, niter_PnP=10, min_conf_thr=3):
+    """Initialise pairwise poses and depthmaps when all cameras are known.
+
+    Requires every image pose and every focal to be preset (asserted).
+    For each edge, the relative pose of the second view is estimated with
+    PnP and the pair is aligned with the two known poses; the resulting
+    scale/rotation/translation is stored as the pairwise pose. For each
+    image, the depthmap of its highest mean-confidence edge (as first
+    image) is stored, multiplied by that edge's scale.
+
+    Args:
+        niter_PnP: RANSAC iteration count forwarded to `fast_pnp`.
+        min_conf_thr: upper bound of the confidence threshold (see below).
+
+    Raises:
+        RuntimeError: if PnP fails on an edge.
+        KeyError: if an image is never the first element of an edge.
+    """
+    device = self.device
+
+    # indices of known poses
+    nkp, known_poses_msk, known_poses = get_known_poses(self)
+    assert nkp == self.n_imgs, "not all poses are known"
+
+    # get all focals
+    nkf, _, im_focals = get_known_focals(self)
+    assert nkf == self.n_imgs
+    im_pp = self.get_principal_points()
+
+    best_depthmaps = {}
+    # init all pairwise poses
+    for e, (i, j) in enumerate(tqdm(self.edges, disable=not self.verbose)):
+        i_j = edge_str(i, j)
+
+        # find relative pose for this pair
+        P1 = torch.eye(4, device=device)
+        # the threshold is capped strictly below the smallest confidence of
+        # the edge, so this mask keeps every pixel
+        msk = self.conf_i[i_j] > min(min_conf_thr, self.conf_i[i_j].min() - 0.1)
+        pnp_result = fast_pnp(
+            self.pred_j[i_j],
+            float(im_focals[i].mean()),
+            pp=im_pp[i],
+            msk=msk,
+            device=device,
+            niter_PnP=niter_PnP,
+        )
+        if pnp_result is None:
+            # fast_pnp returns None on failure; fail with a clear message
+            # instead of an opaque unpacking TypeError
+            raise RuntimeError(f"PnP failed for edge {i_j}")
+        _, P2 = pnp_result
+
+        # align the two predicted camera with the two gt cameras
+        s, R, T = align_multiple_poses(torch.stack((P1, P2)), known_poses[[i, j]])
+        # normally we have known_poses[i] ~= sRT_to_4x4(s,R,T,device) @ P1
+        # and geotrf(sRT_to_4x4(1,R,T,device), s*P2[:3,3])
+        self._set_pose(self.pw_poses, e, R, T, scale=s)
+
+        # remember if this is a good depthmap
+        score = float(self.conf_i[i_j].mean())
+        if score > best_depthmaps.get(i, (0,))[0]:
+            best_depthmaps[i] = score, i_j, s
+
+    # init all image poses
+    for n in range(self.n_imgs):
+        assert known_poses_msk[n]
+        _, i_j, scale = best_depthmaps[n]
+        depth = self.pred_i[i_j][:, :, 2]
+        self._set_depthmap(n, depth * scale)
+
+
+@torch.no_grad()
+def init_minimum_spanning_tree(self, **kw):
+    """Initialise image-wise and pairwise poses from pairwise estimates.
+
+    Runs `minimum_spanning_tree` on the optimizer's edges, predictions and
+    confidences (extra keyword arguments are forwarded to it), then hands
+    the result to `init_from_pts3d`.
+    """
+    device = self.device
+    mst_result = minimum_spanning_tree(
+        self.imshapes,
+        self.edges,
+        self.pred_i,
+        self.pred_j,
+        self.conf_i,
+        self.conf_j,
+        self.im_conf,
+        self.min_conf_thr,
+        device,
+        has_im_poses=self.has_im_poses,
+        verbose=self.verbose,
+        **kw,
+    )
+    # the list of spanning-tree edges (second item) is not needed here
+    pts3d, _, im_focals, im_poses = mst_result
+    return init_from_pts3d(self, pts3d, im_focals, im_poses)
+
+
+def init_from_pts3d(self, pts3d, im_focals, im_poses):
+    """Initialise the optimizer from per-image world pointmaps.
+
+    Args:
+        pts3d: list of per-image pointmaps; modified in place.
+        im_focals: per-image focals (entries may be None), or None.
+        im_poses: stacked cam-to-world poses, or None when the optimizer
+            has no image poses.
+
+    Raises:
+        NotImplementedError: when exactly one pose is known.
+    """
+    # init poses
+    nkp, known_poses_msk, known_poses = get_known_poses(self)
+    if nkp == 1:
+        raise NotImplementedError(
+            "Would be simpler to just align everything afterwards on the single known pose"
+        )
+    elif nkp > 1:
+        # global rigid SE3 alignment
+        s, R, T = align_multiple_poses(
+            im_poses[known_poses_msk], known_poses[known_poses_msk]
+        )
+        trf = sRT_to_4x4(s, R, T, device=known_poses.device)
+
+        # rotate everything
+        im_poses = trf @ im_poses
+        im_poses[:, :3, :3] /= s  # undo scaling on the rotation part
+        for img_pts3d in pts3d:
+            img_pts3d[:] = geotrf(trf, img_pts3d)
+
+    # set all pairwise poses
+    for e, (i, j) in enumerate(self.edges):
+        i_j = edge_str(i, j)
+        # compute transform that goes from cam to world
+        s, R, T = rigid_points_registration(
+            self.pred_i[i_j], pts3d[i], conf=self.conf_i[i_j]
+        )
+        self._set_pose(self.pw_poses, e, R, T, scale=s)
+
+    # take into account the scale normalization
+    s_factor = self.get_pw_norm_scale_factor()
+    if im_poses is not None:
+        # im_poses is None when the spanning tree ran without image poses
+        im_poses[:, :3, 3] *= s_factor  # apply downscaling factor
+    for img_pts3d in pts3d:
+        img_pts3d *= s_factor
+
+    # init all image poses
+    if self.has_im_poses:
+        for i in range(self.n_imgs):
+            cam2world = im_poses[i]
+            depth = geotrf(inv(cam2world), pts3d[i])[..., 2]
+            self._set_depthmap(i, depth)
+            self._set_pose(self.im_poses, i, cam2world)
+            if im_focals[i] is not None:
+                self._set_focal(i, im_focals[i])
+
+
+def minimum_spanning_tree(
+    imshapes,
+    edges,
+    pred_i,
+    pred_j,
+    conf_i,
+    conf_j,
+    im_conf,
+    min_conf_thr,
+    device,
+    has_im_poses=True,
+    niter_PnP=10,
+    verbose=True,
+):
+    """Build an initial global pointcloud along a spanning tree of the edges.
+
+    Edge scores are negated before calling SciPy's minimum spanning tree, so
+    the tree favours high-scoring edges. Starting from the best edge, every
+    other tree edge is attached by registering its prediction onto the
+    already-placed image it shares.
+
+    Args:
+        imshapes: per-image shapes; only their count is used here.
+        edges: iterable of (i, j) image index pairs.
+        pred_i, pred_j: pointmaps of both views, keyed by `edge_str(i, j)`.
+        conf_i, conf_j: matching confidences, keyed the same way.
+        im_conf: per-image confidence, thresholded to select PnP points.
+        min_conf_thr: threshold applied to `im_conf`.
+        device: device of the poses that are created.
+        has_im_poses: when False, no poses/focals are returned.
+        niter_PnP: RANSAC iteration count forwarded to `fast_pnp`.
+        verbose: print the edges as they are attached.
+
+    Returns:
+        (pts3d, msp_edges, im_focals, im_poses); the last two are None when
+        `has_im_poses` is False.
+
+    Raises:
+        ValueError: if some tree edges can never be attached (the pairwise
+            graph is not connected).
+    """
+    n_imgs = len(imshapes)
+    sparse_graph = -dict_to_sparse_graph(
+        compute_edge_scores(map(i_j_ij, edges), conf_i, conf_j)
+    )
+    msp = sp.csgraph.minimum_spanning_tree(sparse_graph).tocoo()
+
+    # temp variable to store 3d points
+    pts3d = [None] * n_imgs
+
+    todo = sorted(zip(-msp.data, msp.row, msp.col))  # sorted edges
+    im_poses = [None] * n_imgs
+    im_focals = [None] * n_imgs
+
+    # init with strongest edge
+    score, i, j = todo.pop()
+    if verbose:
+        print(f" init edge ({i}*,{j}*) {score=}")
+    i_j = edge_str(i, j)
+    pts3d[i] = pred_i[i_j].clone()
+    pts3d[j] = pred_j[i_j].clone()
+    done = {i, j}
+    if has_im_poses:
+        im_poses[i] = torch.eye(4, device=device)
+        im_focals[i] = estimate_focal(pred_i[i_j])
+
+    # set initial pointcloud based on pairwise graph
+    msp_edges = [(i, j)]
+    n_deferred = 0  # consecutive edges postponed without any progress
+    while todo:
+        # each time, predict the next one
+        score, i, j = todo.pop()
+        # key of the edge being processed (not the one of the previous
+        # iteration), so the focal of image i comes from its own pointmap
+        i_j = edge_str(i, j)
+
+        if im_focals[i] is None:
+            im_focals[i] = estimate_focal(pred_i[i_j])
+
+        if i in done:
+            if verbose:
+                print(f" init edge ({i},{j}*) {score=}")
+            assert j not in done
+            # align pred[i] with pts3d[i], and then set j accordingly
+            s, R, T = rigid_points_registration(pred_i[i_j], pts3d[i], conf=conf_i[i_j])
+            trf = sRT_to_4x4(s, R, T, device)
+            pts3d[j] = geotrf(trf, pred_j[i_j])
+            done.add(j)
+            msp_edges.append((i, j))
+            n_deferred = 0
+
+            if has_im_poses and im_poses[i] is None:
+                im_poses[i] = sRT_to_4x4(1, R, T, device)
+
+        elif j in done:
+            if verbose:
+                print(f" init edge ({i}*,{j}) {score=}")
+            assert i not in done
+            s, R, T = rigid_points_registration(pred_j[i_j], pts3d[j], conf=conf_j[i_j])
+            trf = sRT_to_4x4(s, R, T, device)
+            pts3d[i] = geotrf(trf, pred_i[i_j])
+            done.add(i)
+            msp_edges.append((i, j))
+            n_deferred = 0
+
+            if has_im_poses and im_poses[i] is None:
+                im_poses[i] = sRT_to_4x4(1, R, T, device)
+        else:
+            # let's try again later
+            todo.insert(0, (score, i, j))
+            n_deferred += 1
+            if n_deferred >= len(todo):
+                # every remaining edge was tried since the last progress
+                raise ValueError(
+                    "pairwise graph is not connected: "
+                    f"{len(todo)} edge(s) cannot be attached"
+                )
+
+    if has_im_poses:
+        # complete all missing informations
+        pair_scores = list(
+            sparse_graph.values()
+        )  # already negative scores: less is best
+        edges_from_best_to_worse = np.array(list(sparse_graph.keys()))[
+            np.argsort(pair_scores)
+        ]
+        for i, j in edges_from_best_to_worse.tolist():
+            if im_focals[i] is None:
+                im_focals[i] = estimate_focal(pred_i[edge_str(i, j)])
+
+        for i in range(n_imgs):
+            if im_poses[i] is None:
+                msk = im_conf[i] > min_conf_thr
+                res = fast_pnp(
+                    pts3d[i], im_focals[i], msk=msk, device=device, niter_PnP=niter_PnP
+                )
+                if res:
+                    im_focals[i], im_poses[i] = res
+            if im_poses[i] is None:
+                # PnP failed: fall back to the identity pose
+                im_poses[i] = torch.eye(4, device=device)
+        im_poses = torch.stack(im_poses)
+    else:
+        im_poses = im_focals = None
+
+    return pts3d, msp_edges, im_focals, im_poses
+
+
+def dict_to_sparse_graph(dic):
+    """Turn a {(i, j): value} dict into a square sparse DOK array.
+
+    The side length is the largest index found in the keys plus one.
+    """
+    n_nodes = 1 + max(max(edge) for edge in dic)
+    graph = sp.dok_array((n_nodes, n_nodes))
+    for edge in dic:
+        graph[edge] = dic[edge]
+    return graph
+
+
+def rigid_points_registration(pts1, pts2, conf):
+    """Weighted similarity registration of `pts1` onto `pts2` via roma.
+
+    Both point sets are flattened to (N, 3) and `conf` is flattened to the
+    per-point weights. Returns (scale, R, T) as given by roma.
+    """
+    src = pts1.reshape(-1, 3)
+    dst = pts2.reshape(-1, 3)
+    R, T, s = roma.rigid_points_registration(
+        src, dst, weights=conf.ravel(), compute_scaling=True
+    )
+    return s, R, T  # return un-scaled (R, T)
+
+
+def sRT_to_4x4(scale, R, T, device):
+    """Assemble a 4x4 transform with `scale * R` and translation `T`.
+
+    The translation is written as given (it is not multiplied by `scale`).
+    """
+    mat = torch.eye(4, device=device)
+    mat[:3, 3] = T.ravel()  # doesn't need scaling
+    mat[:3, :3] = R * scale
+    return mat
+
+
+def estimate_focal(pts3d_i, pp=None):
+    """Estimate a focal length (as a float) from an (H, W, 3) pointmap.
+
+    When `pp` is None the principal point is set to (W / 2, H / 2). The
+    estimate comes from `estimate_focal_knowing_depth` in "weiszfeld" mode.
+    """
+    if pp is None:
+        height, width, channels = pts3d_i.shape
+        assert channels == 3
+        pp = torch.tensor((width / 2, height / 2), device=pts3d_i.device)
+    batched_pts = pts3d_i.unsqueeze(0)
+    batched_pp = pp.unsqueeze(0)
+    focal = estimate_focal_knowing_depth(
+        batched_pts, batched_pp, focal_mode="weiszfeld"
+    )
+    return float(focal.ravel())
+
+
+@cache
+def pixel_grid(H, W):
+    """Return a cached (H, W, 2) float32 array with grid[y, x] == (x, y).
+
+    The array is cached per (H, W) and shared between callers.
+    """
+    xs_ys = np.mgrid[0:W, 0:H]  # shape (2, W, H)
+    return np.transpose(xs_ys).astype(np.float32)
+
+
+def fast_pnp(pts3d, focal, msk, device, pp=None, niter_PnP=10):
+    """Estimate a camera pose (and optionally the focal) with RANSAC-PnP.
+
+    Each selected pixel of the (H, W, 3) pointmap is matched with its own
+    pixel coordinates.
+
+    Args:
+        pts3d: (H, W, 3) pointmap.
+        focal: known focal, or None to try 21 geometrically spaced
+            candidates between max(H, W) / 2 and 3 * max(H, W).
+        msk: mask selecting the pixels to use; at least 4 must be set.
+        device: device of the returned pose.
+        pp: principal point (x, y); defaults to (W / 2, H / 2).
+        niter_PnP: RANSAC iteration count passed to OpenCV.
+
+    Returns:
+        (focal, cam-to-world 4x4 pose) of the candidate with the most
+        inliers, or None when too few points are selected or no candidate
+        succeeds.
+    """
+    # extract camera poses and focals with RANSAC-PnP
+    if msk.sum() < 4:
+        return None  # we need at least 4 points for PnP
+    pts3d, msk = map(to_numpy, (pts3d, msk))
+
+    H, W, THREE = pts3d.shape
+    assert THREE == 3
+    pixels = pixel_grid(H, W)
+
+    if focal is None:
+        S = max(W, H)
+        tentative_focals = np.geomspace(S / 2, S * 3, 21)
+    else:
+        tentative_focals = [focal]
+
+    if pp is None:
+        pp = (W / 2, H / 2)
+    else:
+        pp = to_numpy(pp)
+
+    best = (0,)
+    for cand_focal in tentative_focals:
+        K = np.float32([(cand_focal, 0, pp[0]), (0, cand_focal, pp[1]), (0, 0, 1)])
+        try:
+            success, R, T, inliers = cv2.solvePnPRansac(
+                pts3d[msk],
+                pixels[msk],
+                K,
+                None,
+                iterationsCount=niter_PnP,
+                reprojectionError=5,
+                flags=cv2.SOLVEPNP_SQPNP,
+            )
+        except cv2.error:
+            # solver failure for this candidate: try the next focal, but
+            # let KeyboardInterrupt / SystemExit propagate
+            continue
+        if not success:
+            continue
+
+        score = len(inliers)
+        if score > best[0]:
+            best = score, R, T, cand_focal
+
+    if not best[0]:
+        return None
+
+    _, R, T, best_focal = best
+    R = cv2.Rodrigues(R)[0]  # world to cam
+    R, T = map(torch.from_numpy, (R, T))
+    return best_focal, inv(sRT_to_4x4(1, R, T, device))  # cam to world
+
+
+def get_known_poses(self):
+    """Return (count, mask, poses) describing the preset image poses.
+
+    A pose counts as known when its parameter does not require grad.
+    Returns (0, None, None) when the optimizer has no image poses.
+    """
+    if not self.has_im_poses:
+        return 0, None, None
+    frozen = [not p.requires_grad for p in self.im_poses]
+    known_poses_msk = torch.tensor(frozen)
+    known_poses = self.get_im_poses()
+    return known_poses_msk.sum(), known_poses_msk, known_poses
+
+
+def get_known_focals(self):
+    """Return (count, mask, focals) describing the preset focals.
+
+    Returns (0, None, None) when the optimizer has no image poses.
+    """
+    if not self.has_im_poses:
+        return 0, None, None
+    mask = self.get_known_focal_mask()
+    focals = self.get_focals()
+    return mask.sum(), mask, focals
+
+
+def align_multiple_poses(src_poses, target_poses):
+    """Find the similarity (s, R, T) aligning `src_poses` to `target_poses`.
+
+    Each (N, 4, 4) pose contributes two points: its translation, and its
+    translation shifted by a small step along the third column of its
+    rotation block. The two point sets are registered with roma.
+    A scale with magnitude below 1e-6 is replaced by 1.0.
+    """
+    n_poses = len(src_poses)
+    assert src_poses.shape == target_poses.shape == (n_poses, 4, 4)
+
+    def center_and_z(poses):
+        # step size: 1% of the median inter-pose distance, floored at 1e-6
+        # so that the step never collapses to zero
+        eps = max(get_med_dist_between_poses(poses) / 100, 1e-6)
+        centers = poses[:, :3, 3]
+        return torch.cat((centers, poses[:, :3, 3] + eps * poses[:, :3, 2]))
+
+    src_pts = center_and_z(src_poses)
+    tgt_pts = center_and_z(target_poses)
+    R, T, s = roma.rigid_points_registration(src_pts, tgt_pts, compute_scaling=True)
+    # a near-zero scale is replaced by 1 to prevent numerical issues
+    if abs(s) < 1e-6:
+        s = 1.0
+    return s, R, T
diff --git a/extern/CUT3R/cloud_opt/dust3r_opt/optimizer.py b/extern/CUT3R/cloud_opt/dust3r_opt/optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..7dcbae8746929bf2f9d1d62ebefcbf32bf857146
--- /dev/null
+++ b/extern/CUT3R/cloud_opt/dust3r_opt/optimizer.py
@@ -0,0 +1,341 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# Main class for the implementation of the global alignment
+# --------------------------------------------------------
+import numpy as np
+import torch
+import torch.nn as nn
+
+from cloud_opt.dust3r_opt.base_opt import BasePCOptimizer
+from dust3r.utils.geometry import xy_grid, geotrf
+from dust3r.utils.device import to_cpu, to_numpy
+
+
+class PointCloudOptimizer(BasePCOptimizer):
+    """Optimize a global scene, given a list of pairwise observations.
+    Graph node: images
+    Graph edges: observations = (pred1, pred2)
+
+    The per-image quantities held here are log-depthmaps, poses, log-focals
+    (scaled by `focal_break`) and principal-point offsets.
+    """
+
+    def __init__(self, *args, optimize_pp=False, focal_break=20, **kwargs):
+        """Create the per-image parameters and precompute stacked buffers.
+
+        Args:
+            optimize_pp: whether the principal points require grad.
+            focal_break: multiplier applied to log(focal) in storage.
+            *args, **kwargs: forwarded to `BasePCOptimizer`.
+        """
+        super().__init__(*args, **kwargs)
+
+        self.has_im_poses = True  # by definition of this class
+        self.focal_break = focal_break
+
+        # adding thing to optimize
+        self.im_depthmaps = nn.ParameterList(
+            torch.randn(H, W) / 10 - 3 for H, W in self.imshapes
+        )  # log(depth)
+        self.im_poses = nn.ParameterList(
+            self.rand_pose(self.POSE_DIM) for _ in range(self.n_imgs)
+        )  # camera poses
+        self.im_focals = nn.ParameterList(
+            torch.FloatTensor([self.focal_break * np.log(max(H, W))])
+            for H, W in self.imshapes
+        )  # camera intrinsics
+        self.im_pp = nn.ParameterList(
+            torch.zeros((2,)) for _ in range(self.n_imgs)
+        )  # camera intrinsics
+        self.im_pp.requires_grad_(optimize_pp)
+
+        self.imshape = self.imshapes[0]
+        im_areas = [h * w for h, w in self.imshapes]
+        self.max_area = max(im_areas)
+
+        # im_depthmaps stays a ParameterList (one (H, W) entry per image);
+        # poses, focals and principal points are stacked into one tensor
+        self.im_poses = ParameterStack(self.im_poses, is_param=True)
+        self.im_focals = ParameterStack(self.im_focals, is_param=True)
+        self.im_pp = ParameterStack(self.im_pp, is_param=True)
+        self.register_buffer(
+            "_pp", torch.tensor([(w / 2, h / 2) for h, w in self.imshapes])
+        )
+        self.register_buffer(
+            "_grid",
+            ParameterStack(
+                [xy_grid(W, H, device=self.device) for H, W in self.imshapes],
+                fill=self.max_area,
+            ),
+        )
+
+        # pre-compute pixel weights
+        self.register_buffer(
+            "_weight_i",
+            ParameterStack(
+                [self.conf_trf(self.conf_i[i_j]) for i_j in self.str_edges],
+                fill=self.max_area,
+            ),
+        )
+        self.register_buffer(
+            "_weight_j",
+            ParameterStack(
+                [self.conf_trf(self.conf_j[i_j]) for i_j in self.str_edges],
+                fill=self.max_area,
+            ),
+        )
+
+        # precompute the stacked pairwise predictions and edge endpoints
+        self.register_buffer(
+            "_stacked_pred_i",
+            ParameterStack(self.pred_i, self.str_edges, fill=self.max_area),
+        )
+        self.register_buffer(
+            "_stacked_pred_j",
+            ParameterStack(self.pred_j, self.str_edges, fill=self.max_area),
+        )
+        self.register_buffer("_ei", torch.tensor([i for i, j in self.edges]))
+        self.register_buffer("_ej", torch.tensor([j for i, j in self.edges]))
+        self.total_area_i = sum([im_areas[i] for i, j in self.edges])
+        self.total_area_j = sum([im_areas[j] for i, j in self.edges])
+
+    def _check_all_imgs_are_selected(self, msk):
+        """Assert that `msk` selects images 0..n_imgs-1, in order."""
+        assert np.all(
+            self._get_msk_indices(msk) == np.arange(self.n_imgs)
+        ), "incomplete mask!"
+
+    def preset_pose(self, known_poses, pose_msk=None):  # cam-to-world
+        """Set the image poses to known cam-to-world values and freeze them.
+
+        A single 2-D tensor is treated as a one-element list. Pairwise scale
+        normalisation (`norm_pw_scale`) is always switched off afterwards.
+        """
+        self._check_all_imgs_are_selected(pose_msk)
+
+        if isinstance(known_poses, torch.Tensor) and known_poses.ndim == 2:
+            known_poses = [known_poses]
+        for idx, pose in zip(self._get_msk_indices(pose_msk), known_poses):
+            if self.verbose:
+                print(f" (setting pose #{idx} = {pose[:3,3]})")
+            self._no_grad(self._set_pose(self.im_poses, idx, torch.tensor(pose)))
+
+        self.im_poses.requires_grad_(False)
+        # The count of known poses used to be computed here and then
+        # overwritten; the effective value has always been False.
+        self.norm_pw_scale = False
+
+    def preset_focal(self, known_focals, msk=None):
+        """Set the focals to known values and freeze them."""
+        self._check_all_imgs_are_selected(msk)
+
+        for idx, focal in zip(self._get_msk_indices(msk), known_focals):
+            if self.verbose:
+                print(f" (setting focal #{idx} = {focal})")
+            self._no_grad(self._set_focal(idx, focal))
+
+        self.im_focals.requires_grad_(False)
+
+    def preset_principal_point(self, known_pp, msk=None):
+        """Set the principal points to known values and freeze them."""
+        self._check_all_imgs_are_selected(msk)
+
+        for idx, pp in zip(self._get_msk_indices(msk), known_pp):
+            if self.verbose:
+                print(f" (setting principal point #{idx} = {pp})")
+            self._no_grad(self._set_principal_point(idx, pp))
+
+        self.im_pp.requires_grad_(False)
+
+    def _get_msk_indices(self, msk):
+        """Convert a mask / index specification into image indices.
+
+        Accepts None (all images), an int, a tuple/list, a boolean array of
+        length n_imgs, or an integer array.
+
+        Raises:
+            ValueError: for any other input.
+        """
+        if msk is None:
+            return range(self.n_imgs)
+        elif isinstance(msk, int):
+            return [msk]
+        elif isinstance(msk, (tuple, list)):
+            return self._get_msk_indices(np.array(msk))
+        elif msk.dtype in (bool, torch.bool, np.bool_):
+            assert len(msk) == self.n_imgs
+            return np.where(msk)[0]
+        elif np.issubdtype(msk.dtype, np.integer):
+            return msk
+        else:
+            raise ValueError(f"bad {msk=}")
+
+    def _no_grad(self, tensor):
+        """Assert that `tensor` still requires grad.
+
+        Despite its name this does not modify anything: it only checks that
+        the preceding `_set_*` call was allowed to write the parameter.
+        """
+        assert (
+            tensor.requires_grad
+        ), "it must be True at this point, otherwise no modification occurs"
+
+    def _set_focal(self, idx, focal, force=False):
+        """Store `focal` for image `idx` as focal_break * log(focal)."""
+        param = self.im_focals[idx]
+        if (
+            param.requires_grad or force
+        ):  # can only init a parameter not already initialized
+            param.data[:] = self.focal_break * np.log(focal)
+        return param
+
+    def get_focals(self):
+        """Return the focals, decoded from their scaled-log storage."""
+        log_focals = torch.stack(list(self.im_focals), dim=0)
+        return (log_focals / self.focal_break).exp()
+
+    def get_known_focal_mask(self):
+        """Return a boolean tensor, True where the focal is frozen."""
+        return torch.tensor([not (p.requires_grad) for p in self.im_focals])
+
+    def _set_principal_point(self, idx, pp, force=False):
+        """Store `pp` as its offset from the image centre, divided by 10."""
+        param = self.im_pp[idx]
+        H, W = self.imshapes[idx]
+        if (
+            param.requires_grad or force
+        ):  # can only init a parameter not already initialized
+            param.data[:] = to_cpu(to_numpy(pp) - (W / 2, H / 2)) / 10
+        return param
+
+    def get_principal_points(self):
+        """Return the principal points: image centre + 10 * stored offset."""
+        return self._pp + 10 * self.im_pp
+
+    def get_intrinsics(self):
+        """Return the (n_imgs, 3, 3) intrinsics built from focals and pp."""
+        K = torch.zeros((self.n_imgs, 3, 3), device=self.device)
+        focals = self.get_focals().flatten()
+        K[:, 0, 0] = K[:, 1, 1] = focals
+        K[:, :2, 2] = self.get_principal_points()
+        K[:, 2, 2] = 1
+        return K
+
+    def get_im_poses(self):  # cam to world
+        """Return the cam-to-world image poses."""
+        cam2world = self._get_poses(self.im_poses)
+        return cam2world
+
+    def preset_depth(self, known_depths, msk=None):
+        """Preset known depth maps for specified images.
+
+        Args:
+            known_depths: List of depth maps or single depth map (should be in normal depth space, not log space).
+                Tensors and numpy arrays are both accepted.
+            msk: Mask or indices indicating which images to preset. If None, applies to all images.
+        """
+        self._check_all_imgs_are_selected(msk)
+
+        if isinstance(known_depths, (torch.Tensor, np.ndarray)) and known_depths.ndim == 2:
+            known_depths = [known_depths]
+
+        for idx, depth in zip(self._get_msk_indices(msk), known_depths):
+            if self.verbose:
+                print(f" (setting depth #{idx})")
+            # numpy inputs pass the isinstance check above but _ravel_hw
+            # needs tensor methods, so convert before reshaping
+            depth = torch.as_tensor(depth)
+            # No need to take log here since _set_depthmap already expects depths in normal space
+            depth = _ravel_hw(depth, self.max_area).view(self.imshapes[idx])
+            self._no_grad(self._set_depthmap(idx, depth))
+            self.im_depthmaps[idx].requires_grad_(False)
+
+    def _set_depthmap(self, idx, depth, force=False):
+        """Set a depth map for an image.
+
+        Args:
+            idx: Image index
+            depth: Depth map in normal space (not log space)
+            force: Whether to force setting even if already initialized
+        """
+        depth = _ravel_hw(depth, self.max_area)
+        depth = depth.view(self.imshapes[idx])
+        depth = depth.nan_to_num(neginf=0)
+        param = self.im_depthmaps[idx]
+        if (
+            param.requires_grad or force
+        ):  # can only init a parameter not already initialized
+            param.data[:] = depth.log().nan_to_num(neginf=0)  # Store in log space
+        return param
+
+    def get_depthmaps(self, raw=False):
+        """Return the depthmaps in normal space (exp of the stored logs).
+
+        With `raw=True` the stacked tensor is returned; otherwise a list of
+        (h, w) views, one per image.
+
+        NOTE(review): ParameterStack detaches its inputs before stacking,
+        so the result is not linked to `self.im_depthmaps` in the autograd
+        graph - confirm this is intended.
+        """
+        res = ParameterStack(self.im_depthmaps, is_param=False).exp()
+        if not raw:
+            res = [dm[: h * w].view(h, w) for dm, (h, w) in zip(res, self.imshapes)]
+        return res
+
+    def depth_to_pts3d(self):
+        """Back-project the depthmaps to world-frame points."""
+        # Get depths and  projection params if not provided
+        focals = self.get_focals()
+        pp = self.get_principal_points()
+        im_poses = self.get_im_poses()
+        depth = self.get_depthmaps(raw=True)
+
+        # get pointmaps in camera frame
+        rel_ptmaps = _fast_depthmap_to_pts3d(depth, self._grid, focals, pp=pp)
+        # project to world frame
+        return geotrf(im_poses, rel_ptmaps)
+
+    def get_pts3d(self, raw=False):
+        """Return world-frame pointmaps, stacked (raw) or as (h, w, 3) list."""
+        res = self.depth_to_pts3d()
+        if not raw:
+            res = [dm[: h * w].view(h, w, 3) for dm, (h, w) in zip(res, self.imshapes)]
+        return res
+
+    def forward(self):
+        """Return the alignment loss between projected and pairwise points."""
+        pw_poses = self.get_pw_poses()  # cam-to-world
+        pw_adapt = self.get_adaptors().unsqueeze(1)
+        proj_pts3d = self.get_pts3d(raw=True)
+
+        # rotate pairwise prediction according to pw_poses
+        aligned_pred_i = geotrf(pw_poses, pw_adapt * self._stacked_pred_i)
+        aligned_pred_j = geotrf(pw_poses, pw_adapt * self._stacked_pred_j)
+
+        # compute the loss
+        li = (
+            self.dist(proj_pts3d[self._ei], aligned_pred_i, weight=self._weight_i).sum()
+            / self.total_area_i
+        )
+        lj = (
+            self.dist(proj_pts3d[self._ej], aligned_pred_j, weight=self._weight_j).sum()
+            / self.total_area_j
+        )
+
+        return li + lj
+
+
+def _fast_depthmap_to_pts3d(depth, pixel_grid, focal, pp):
+    """Back-project flattened depthmaps to camera-frame 3D points.
+
+    A 3-D `depth` is first flattened to (B, N). After adding a pixel axis,
+    `focal` must be (B, 1, 1), `pp` (B, 1, 2) and `pixel_grid` (B, N, 2)
+    (asserted). Returns a (B, N, 3) tensor of (x, y, z).
+    """
+    focal = focal.unsqueeze(1)
+    pp = pp.unsqueeze(1)
+    if depth.ndim == 3:
+        depth = depth.view(depth.shape[0], -1)
+    batch = len(depth)
+    assert focal.shape == (batch, 1, 1)
+    assert pp.shape == (batch, 1, 2)
+    assert pixel_grid.shape == depth.shape + (2,)
+    z = depth.unsqueeze(-1)
+    xy = z * (pixel_grid - pp) / focal
+    return torch.cat((xy, z), dim=-1)
+
+
+def ParameterStack(params, keys=None, is_param=None, fill=0):
+    """Stack a collection of tensors into one detached float tensor.
+
+    Args:
+        params: sequence of tensors, or a mapping when `keys` is given.
+        keys: optional keys used to pick (and order) entries of `params`.
+        is_param: when truthy, all inputs must share the same requires_grad
+            (asserted) and the result is wrapped in an nn.Parameter.
+        fill: when > 0, each entry is flattened and zero-padded to this
+            length with `_ravel_hw` before stacking.
+
+    Returns:
+        A plain tensor, or an nn.Parameter when `is_param` is truthy or the
+        first input requires grad; its requires_grad matches the first input.
+    """
+    if keys is not None:
+        params = [params[k] for k in keys]
+
+    if fill > 0:
+        params = [_ravel_hw(p, fill) for p in params]
+
+    needs_grad = params[0].requires_grad
+    if is_param:
+        assert all(p.requires_grad == needs_grad for p in params)
+
+    stacked = torch.stack(list(params)).float().detach()
+    if not (is_param or needs_grad):
+        return stacked
+    stacked = nn.Parameter(stacked)
+    stacked.requires_grad_(needs_grad)
+    return stacked
+
+
+def _ravel_hw(tensor, fill=0):
+    """Merge the first two dimensions and zero-pad up to `fill` rows.
+
+    No padding is added when the flattened length already reaches `fill`.
+    """
+    # ravel H,W
+    n_pixels = tensor.shape[0] * tensor.shape[1]
+    flat = tensor.view((n_pixels,) + tensor.shape[2:])
+
+    missing = fill - len(flat)
+    if missing > 0:
+        padding = flat.new_zeros((missing,) + flat.shape[1:])
+        flat = torch.cat((flat, padding))
+    return flat
+
+
+def acceptable_focal_range(H, W, minf=0.5, maxf=3.5):
+    """Return (min, max) focal bounds for an H x W image.
+
+    The base focal is the one giving a 60 degree field of view across the
+    larger image side; the bounds are `minf` and `maxf` times that base.
+    """
+    half_fov = np.deg2rad(60) / 2
+    focal_base = max(H, W) / (2 * np.tan(half_fov))  # size / 1.1547005383792515
+    return minf * focal_base, maxf * focal_base
+
+
+def apply_mask(img, msk):
+    """Return a copy of `img` with the entries selected by `msk` set to 0.
+
+    The input image is left untouched.
+    """
+    masked = img.copy()
+    masked[msk] = 0
+    return masked
diff --git a/extern/CUT3R/cloud_opt/init_all.py b/extern/CUT3R/cloud_opt/init_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..090cc4c52683a83e19ea0bff0bb908987c64c05d
--- /dev/null
+++ b/extern/CUT3R/cloud_opt/init_all.py
@@ -0,0 +1,222 @@
+from functools import cache
+import numpy as np
+import scipy.sparse as sp
+import torch
+import cv2
+import roma
+from tqdm import tqdm
+
+from cloud_opt.utils import *
+
+
+def compute_edge_scores(edges, edge2conf_i, edge2conf_j):
+    """Score every edge from the confidences of its two views.
+
+    edges: iterable of ('i_j', (i, j)) pairs; the string is the key into
+    both confidence mappings. Returns {(i, j): edge_conf(...)}.
+    """
+    scores = {}
+    for key, (i, j) in edges:
+        scores[(i, j)] = edge_conf(edge2conf_i[key], edge2conf_j[key])
+    return scores
+
+
+def dict_to_sparse_graph(dic):
+    """Turn a {(i, j): value} dict into a square sparse DOK array.
+
+    The side length is the largest index found in the keys plus one.
+    """
+    n_nodes = 1 + max(max(edge) for edge in dic)
+    graph = sp.dok_array((n_nodes, n_nodes))
+    for edge in dic:
+        graph[edge] = dic[edge]
+    return graph
+
+
+@torch.no_grad()
+def init_minimum_spanning_tree(self, **kw):
+    """Initialise image-wise and pairwise poses from pairwise estimates.
+
+    Runs `minimum_spanning_tree` on the optimizer's edges, per-edge points
+    and confidences (extra keyword arguments are forwarded to it), then
+    hands the result to `init_from_pts3d`.
+    """
+    device = self.device
+    mst_result = minimum_spanning_tree(
+        self.imshapes,
+        self.edges,
+        self.edge2pts_i,
+        self.edge2pts_j,
+        self.edge2conf_i,
+        self.edge2conf_j,
+        self.im_conf,
+        self.min_conf_thr,
+        device,
+        has_im_poses=self.has_im_poses,
+        verbose=self.verbose,
+        **kw,
+    )
+    # the list of spanning-tree edges (second item) is not needed here
+    pts3d, _, im_focals, im_poses = mst_result
+    return init_from_pts3d(self, pts3d, im_focals, im_poses)
+
+
+def minimum_spanning_tree(
+    imshapes,
+    edges,
+    edge2pred_i,
+    edge2pred_j,
+    edge2conf_i,
+    edge2conf_j,
+    im_conf,
+    min_conf_thr,
+    device,
+    has_im_poses=True,
+    niter_PnP=10,
+    verbose=True,
+    save_score_path=None,
+):
+    """Build an initial global pointcloud along a spanning tree of the edges.
+
+    Edge scores are negated before calling SciPy's minimum spanning tree, so
+    the tree favours high-scoring edges. Starting from the best edge, every
+    other tree edge is attached by registering its prediction onto the
+    already-placed image it shares.
+
+    Args:
+        imshapes: per-image shapes; only their count is used here.
+        edges: iterable of (i, j) image index pairs.
+        edge2pred_i, edge2pred_j: pointmaps of both views, keyed by
+            `edge_str(i, j)`.
+        edge2conf_i, edge2conf_j: matching confidences, keyed the same way.
+        im_conf: per-image confidence, thresholded to select PnP points.
+        min_conf_thr: threshold applied to `im_conf`.
+        device: device of the poses that are created.
+        has_im_poses: when False, no poses/focals are returned.
+        niter_PnP: RANSAC iteration count forwarded to `fast_pnp`.
+        verbose: print the edges as they are attached.
+        save_score_path: accepted but not used by this function.
+
+    Returns:
+        (pts3d, msp_edges, im_focals, im_poses); the last two are None when
+        `has_im_poses` is False.
+
+    Raises:
+        ValueError: if some tree edges can never be attached (the pairwise
+            graph is not connected).
+    """
+    n_imgs = len(imshapes)
+    edge_scores = compute_edge_scores(map(i_j_ij, edges), edge2conf_i, edge2conf_j)
+    sparse_graph = -dict_to_sparse_graph(edge_scores)
+    msp = sp.csgraph.minimum_spanning_tree(sparse_graph).tocoo()
+
+    # temp variable to store 3d points
+    pts3d = [None] * n_imgs
+
+    todo = sorted(zip(-msp.data, msp.row, msp.col))  # sorted edges
+    im_poses = [None] * n_imgs
+    im_focals = [None] * n_imgs
+
+    # init with strongest edge
+    score, i, j = todo.pop()
+    if verbose:
+        print(f" init edge ({i}*,{j}*) {score=}")
+    i_j = edge_str(i, j)
+
+    pts3d[i] = edge2pred_i[i_j].clone()
+    pts3d[j] = edge2pred_j[i_j].clone()
+    done = {i, j}
+    if has_im_poses:
+        im_poses[i] = torch.eye(4, device=device)
+        im_focals[i] = estimate_focal(edge2pred_i[i_j])
+
+    # set initial pointcloud based on pairwise graph
+    msp_edges = [(i, j)]
+    n_deferred = 0  # consecutive edges postponed without any progress
+    while todo:
+        # each time, predict the next one
+        score, i, j = todo.pop()
+        # key of the edge being processed (not the one of the previous
+        # iteration), so the focal of image i comes from its own pointmap
+        i_j = edge_str(i, j)
+
+        if im_focals[i] is None:
+            im_focals[i] = estimate_focal(edge2pred_i[i_j])
+
+        if i in done:
+            if verbose:
+                print(f" init edge ({i},{j}*) {score=}")
+            assert j not in done
+            # align pred[i] with pts3d[i], and then set j accordingly
+            s, R, T = rigid_points_registration(
+                edge2pred_i[i_j], pts3d[i], conf=edge2conf_i[i_j]
+            )
+            trf = sRT_to_4x4(s, R, T, device)
+            pts3d[j] = geotrf(trf, edge2pred_j[i_j])
+            done.add(j)
+            msp_edges.append((i, j))
+            n_deferred = 0
+
+            if has_im_poses and im_poses[i] is None:
+                im_poses[i] = sRT_to_4x4(1, R, T, device)
+
+        elif j in done:
+            if verbose:
+                print(f" init edge ({i}*,{j}) {score=}")
+            assert i not in done
+            s, R, T = rigid_points_registration(
+                edge2pred_j[i_j], pts3d[j], conf=edge2conf_j[i_j]
+            )
+            trf = sRT_to_4x4(s, R, T, device)
+            pts3d[i] = geotrf(trf, edge2pred_i[i_j])
+            done.add(i)
+            msp_edges.append((i, j))
+            n_deferred = 0
+
+            if has_im_poses and im_poses[i] is None:
+                im_poses[i] = sRT_to_4x4(1, R, T, device)
+        else:
+            # let's try again later
+            todo.insert(0, (score, i, j))
+            n_deferred += 1
+            if n_deferred >= len(todo):
+                # every remaining edge was tried since the last progress
+                raise ValueError(
+                    "pairwise graph is not connected: "
+                    f"{len(todo)} edge(s) cannot be attached"
+                )
+
+    if has_im_poses:
+        # complete all missing informations
+        pair_scores = list(
+            sparse_graph.values()
+        )  # already negative scores: less is best
+        edges_from_best_to_worse = np.array(list(sparse_graph.keys()))[
+            np.argsort(pair_scores)
+        ]
+        for i, j in edges_from_best_to_worse.tolist():
+            if im_focals[i] is None:
+                im_focals[i] = estimate_focal(edge2pred_i[edge_str(i, j)])
+
+        for i in range(n_imgs):
+            if im_poses[i] is None:
+                msk = im_conf[i] > min_conf_thr
+                res = fast_pnp(
+                    pts3d[i], im_focals[i], msk=msk, device=device, niter_PnP=niter_PnP
+                )
+                if res:
+                    im_focals[i], im_poses[i] = res
+            if im_poses[i] is None:
+                # PnP failed: fall back to the identity pose
+                im_poses[i] = torch.eye(4, device=device)
+        im_poses = torch.stack(im_poses)
+    else:
+        im_poses = im_focals = None
+
+    return pts3d, msp_edges, im_focals, im_poses
+
+
+def init_from_pts3d(self, pts3d, im_focals, im_poses):
+    """Initialise the optimizer from per-image world pointmaps.
+
+    Args:
+        pts3d: list of per-image pointmaps; modified in place.
+        im_focals: per-image focals (entries may be None), or None.
+        im_poses: stacked cam-to-world poses, or None when the optimizer
+            has no image poses.
+
+    Raises:
+        NotImplementedError: when exactly one pose is known.
+
+    NOTE(review): this reads `self.pred_i` / `self.conf_i`, while
+    `init_minimum_spanning_tree` in this file reads `self.edge2pts_i` /
+    `self.edge2conf_i` - confirm that both attribute families exist.
+    """
+    # init poses
+    nkp, known_poses_msk, known_poses = self.get_known_poses()
+    if nkp == 1:
+        raise NotImplementedError(
+            "Would be simpler to just align everything afterwards on the single known pose"
+        )
+    elif nkp > 1:
+        # global rigid SE3 alignment
+        s, R, T = align_multiple_poses(
+            im_poses[known_poses_msk], known_poses[known_poses_msk]
+        )
+        trf = sRT_to_4x4(s, R, T, device=known_poses.device)
+
+        # rotate everything
+        im_poses = trf @ im_poses
+        im_poses[:, :3, :3] /= s  # undo scaling on the rotation part
+        for img_pts3d in pts3d:
+            img_pts3d[:] = geotrf(trf, img_pts3d)
+    else:
+        pass  # no known poses
+
+    # set all pairwise poses
+    for e, (i, j) in enumerate(self.edges):
+        i_j = edge_str(i, j)
+        # compute transform that goes from cam to world
+        s, R, T = rigid_points_registration(
+            self.pred_i[i_j], pts3d[i], conf=self.conf_i[i_j]
+        )
+        self._set_pose(self.pw_poses, e, R, T, scale=s)
+
+    # take into account the scale normalization
+    s_factor = self.get_pw_norm_scale_factor()
+    if im_poses is not None:
+        # im_poses is None when the spanning tree ran without image poses
+        im_poses[:, :3, 3] *= s_factor  # apply downscaling factor
+    for img_pts3d in pts3d:
+        img_pts3d *= s_factor
+
+    # init all image poses
+    if self.has_im_poses:
+        for i in range(self.n_imgs):
+            cam2world = im_poses[i]
+            depth = geotrf(inv(cam2world), pts3d[i])[..., 2]
+            self._set_depthmap(i, depth)
+            self._set_pose(self.im_poses, i, cam2world)
+            if im_focals[i] is not None:
+                if not self.shared_focal:
+                    self._set_focal(i, im_focals[i])
+        if self.shared_focal:
+            # a shared focal is initialised with the mean of all estimates
+            self._set_focal(0, sum(im_focals) / self.n_imgs)
+        if self.n_imgs > 2:
+            self._set_init_depthmap()
+
+    if self.verbose:
+        with torch.no_grad():
+            print(" init loss =", float(self()))
diff --git a/extern/CUT3R/cloud_opt/utils.py b/extern/CUT3R/cloud_opt/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..f685265072ad78345b5cd6fd13e8e7b28a3a030d
--- /dev/null
+++ b/extern/CUT3R/cloud_opt/utils.py
@@ -0,0 +1,443 @@
+import torch.nn as nn
+import torch
+import roma
+import numpy as np
+import cv2
+from functools import cache
+
+
+def todevice(batch, device, callback=None, non_blocking=False):
+    """Recursively move tensors/arrays nested in `batch` to `device`.
+
+    batch: tensor, ndarray, None, or a (nested) dict/tuple/list of those
+    device: pytorch device (`non_blocking` is forwarded to Tensor.to) or 'numpy'
+    callback: optional function applied to `batch` and to every sub-element.
+    """
+    if callback:
+        batch = callback(batch)
+    # containers: recurse, forwarding callback/non_blocking to the children
+    if isinstance(batch, dict):
+        return {k: todevice(v, device, callback, non_blocking) for k, v in batch.items()}
+
+    if isinstance(batch, (tuple, list)):
+        return type(batch)(todevice(x, device, callback, non_blocking) for x in batch)
+
+    x = batch
+    if device == "numpy":
+        if isinstance(x, torch.Tensor):
+            x = x.detach().cpu().numpy()
+    elif x is not None:
+        if isinstance(x, np.ndarray):
+            x = torch.from_numpy(x)
+        if torch.is_tensor(x):
+            x = x.to(device, non_blocking=non_blocking)
+    return x
+
+
+to_device = todevice  # alias: second name bound to the same function
+
+
+def to_numpy(x):  # torch tensors nested anywhere in `x` come back as numpy arrays
+    return todevice(x, "numpy")
+
+
+def to_cpu(x):  # tensors/ndarrays nested in `x` become torch tensors on "cpu"
+    return todevice(x, "cpu")
+
+
+def to_cuda(x):  # tensors/ndarrays nested in `x` become torch tensors on "cuda"
+    return todevice(x, "cuda")
+
+
+def signed_log1p(x):
+    magnitude = torch.log1p(x.abs())  # log(1 + |x|), elementwise
+    return torch.sign(x) * magnitude
+
+
+def l2_dist(a, b, weight):  # weighted *squared* Euclidean distance over the last dim
+    return torch.square(a - b).sum(dim=-1) * weight
+
+
+def l1_dist(a, b, weight):  # NOTE: despite the name, this is the weighted Euclidean (p=2) norm of a - b
+    return (a - b).norm(dim=-1) * weight
+
+
+ALL_DISTS = dict(l1=l1_dist, l2=l2_dist)  # lookup: distance name -> distance function
+
+
+def _check_edges(edges):
+    nodes = sorted(set(idx for pair in edges for idx in pair))  # unique node ids, ascending
+    assert nodes == list(range(len(nodes))), "bad pair indices: missing values "
+    return len(nodes)
+
+
+def NoGradParamDict(x):  # wrap a plain dict into an nn.ParameterDict with requires_grad switched off
+    assert isinstance(x, dict)
+    return nn.ParameterDict(x).requires_grad_(False)
+
+
+def edge_str(i, j):  # string key for the edge (i, j), e.g. (2, 5) -> "2_5"
+    return f"{i}_{j}"
+
+
+def i_j_ij(ij):
+    # ij is an (i, j) pair; returns (its "i_j" string key, the pair itself)
+    return edge_str(*ij), ij
+
+
+def edge_conf(conf_i, conf_j):
+    # scalar edge score: product of the two mean confidences, as a Python float
+    return float(conf_i.mean() * conf_j.mean())
+
+
+def get_imshapes(edges, pred_i, pred_j):  # returns a list with one shape[0:2] tuple per image index
+    n_imgs = max(max(e) for e in edges) + 1  # image ids are the edge endpoints
+    imshapes = [None] * n_imgs
+    for e, (i, j) in enumerate(edges):
+        shape_i = tuple(pred_i[e]["pts3d_is_self_view"].shape[0:2])  # NOTE(review): "is_self_view" may be a typo for "in_self_view" — confirm against the producer of pred_i
+        shape_j = tuple(pred_j[e]["pts3d_in_other_view"].shape[0:2])
+        if imshapes[i]:  # image i already seen on an earlier edge: shapes must agree
+            assert imshapes[i] == shape_i, f"incorrect shape for image {i}"
+        if imshapes[j]:  # same consistency check for image j
+            assert imshapes[j] == shape_j, f"incorrect shape for image {j}"
+        imshapes[i] = shape_i
+        imshapes[j] = shape_j
+    return imshapes
+
+
+def get_conf_trf(mode):
+    """Return the confidence transform selected by `mode`.
+
+    Supported modes:
+        "log"         -> x.log()
+        "sqrt"        -> x.sqrt()
+        "m1"          -> x - 1
+        "id" / "none" -> x unchanged
+
+    Raises ValueError for any other mode.
+    """
+    if mode in ("id", "none"):
+        return lambda x: x
+
+    transforms = {
+        "log": lambda x: x.log(),
+        "sqrt": lambda x: x.sqrt(),
+        "m1": lambda x: x - 1,
+    }
+    try:
+        return transforms[mode]
+    except (KeyError, TypeError):
+        # unknown (or unhashable) mode
+        raise ValueError(f"bad mode for {mode=}") from None
+
+
+@torch.no_grad()
+def _compute_img_conf(imshapes, device, edges, edge2conf_i, edge2conf_j):
+    im_conf = nn.ParameterList([torch.zeros(hw, device=device) for hw in imshapes])  # one all-zero map per image
+    for e, (i, j) in enumerate(edges):  # running elementwise max over every edge touching an image
+        im_conf[i] = torch.maximum(im_conf[i], edge2conf_i[edge_str(i, j)])
+        im_conf[j] = torch.maximum(im_conf[j], edge2conf_j[edge_str(i, j)])
+    return im_conf
+
+
+def xy_grid(
+    W,
+    H,
+    device=None,  # None -> numpy arrays, otherwise torch tensors on that device
+    origin=(0, 0),
+    unsqueeze=None,  # if set, a size-1 axis is inserted there in every component
+    cat_dim=-1,  # axis along which components are stacked; None returns them unstacked
+    homogeneous=False,  # if True, an all-ones (H, W) component is appended
+    **arange_kw,
+):
+    """Output a (H,W,2) grid of pixel coordinates (numpy or torch, see `device`)
+    with output[j,i,0] = i + origin[0]
+         output[j,i,1] = j + origin[1]
+    """
+    if device is None:  # numpy backend
+        expand = np.expand_dims
+        arange, meshgrid, stack, ones = np.arange, np.meshgrid, np.stack, np.ones
+    else:  # torch backend
+        expand = torch.unsqueeze
+        arange = lambda *a, **kw: torch.arange(*a, device=device, **kw)
+        meshgrid, stack = torch.meshgrid, torch.stack
+        ones = lambda *a: torch.ones(*a, device=device)
+
+    tw, th = [arange(o, o + s, **arange_kw) for s, o in zip((W, H), origin)]
+    grid = tuple(meshgrid(tw, th, indexing="xy"))  # np.meshgrid may return a list
+    if homogeneous:
+        grid = grid + (ones((H, W)),)
+    if unsqueeze is not None:
+        grid = tuple(expand(g, unsqueeze) for g in grid)  # keeps the ones component too
+    if cat_dim is not None:
+        grid = stack(grid, cat_dim)
+    return grid
+
+
+def estimate_focal_knowing_depth(
+    pts3d, pp, focal_mode="median", min_focal=0.0, max_focal=np.inf
+):
+    """Estimate one focal length per batch item from a (B,H,W,3) pointmap.
+    focal_mode: "median" (nan-median of per-pixel votes) or "weiszfeld" (re-weighted fit).
+    Result is clipped to [min_focal, max_focal] * (focal of a 60-degree FOV on max(H,W)).
+    """
+    B, H, W, THREE = pts3d.shape
+    assert THREE == 3
+
+    # centered pixel grid
+    pixels = xy_grid(W, H, device=pts3d.device).view(1, -1, 2) - pp.view(
+        -1, 1, 2
+    )  # B,HW,2
+    pts3d = pts3d.flatten(1, 2)  # (B, HW, 3)
+
+    if focal_mode == "median":
+        with torch.no_grad():
+            # direct estimation of focal
+            u, v = pixels.unbind(dim=-1)
+            x, y, z = pts3d.unbind(dim=-1)
+            fx_votes = (u * z) / x
+            fy_votes = (v * z) / y
+
+            # assume square pixels, hence same focal for X and Y
+            f_votes = torch.cat((fx_votes.view(B, -1), fy_votes.view(B, -1)), dim=-1)
+            focal = torch.nanmedian(f_votes, dim=-1).values
+
+    elif focal_mode == "weiszfeld":
+        # init focal with l2 closed form
+        # we try to find focal = argmin Sum | pixel - focal * (x,y)/z|
+        xy_over_z = (pts3d[..., :2] / pts3d[..., 2:3]).nan_to_num(
+            posinf=0, neginf=0
+        )  # homogeneous (x,y,1)
+
+        dot_xy_px = (xy_over_z * pixels).sum(dim=-1)
+        dot_xy_xy = xy_over_z.square().sum(dim=-1)
+
+        focal = dot_xy_px.mean(dim=1) / dot_xy_xy.mean(dim=1)
+
+        # iterative re-weighted least-squares
+        for _ in range(10):
+            # re-weighting by inverse of distance
+            dis = (pixels - focal.view(-1, 1, 1) * xy_over_z).norm(dim=-1)
+            # the clip keeps the reciprocal finite when a residual is exactly zero
+            w = dis.clip(min=1e-8).reciprocal()
+            # update the scaling with the new weights
+            focal = (w * dot_xy_px).mean(dim=1) / (w * dot_xy_xy).mean(dim=1)
+    else:
+        raise ValueError(f"bad {focal_mode=}")
+
+    focal_base = max(H, W) / (
+        2 * np.tan(np.deg2rad(60) / 2)
+    )  # size / 1.1547005383792515
+    focal = focal.clip(min=min_focal * focal_base, max=max_focal * focal_base)
+    # focal holds one value per batch element
+    return focal
+
+
+def estimate_focal(pts3d_i, pp=None):  # Weiszfeld focal estimate for a single pointmap, returned as a float
+    if pp is None:
+        H, W, THREE = pts3d_i.shape
+        assert THREE == 3
+        pp = torch.tensor((W / 2, H / 2), device=pts3d_i.device)  # default principal point: image centre
+    focal = estimate_focal_knowing_depth(
+        pts3d_i.unsqueeze(0), pp.unsqueeze(0), focal_mode="weiszfeld"
+    ).ravel()  # a batch axis of size 1 is added for the call, then flattened away
+    return float(focal)
+
+
+def rigid_points_registration(pts1, pts2, conf):  # conf-weighted roma registration with scaling enabled
+    R, T, s = roma.rigid_points_registration(
+        pts1.reshape(-1, 3),
+        pts2.reshape(-1, 3),
+        weights=conf.ravel(),
+        compute_scaling=True,
+    )
+    return s, R, T  # reordered to (s, R, T); R and T are passed through as-is, not multiplied by s
+
+
+def sRT_to_4x4(scale, R, T, device):
+    mat = torch.eye(4, device=device)  # bottom row stays (0, 0, 0, 1)
+    mat[:3, 3] = T.ravel()  # translation column is copied without scaling
+    mat[:3, :3] = R * scale  # the scale is folded into the rotation block only
+    return mat
+
+
+def geotrf(Trf, pts, ncol=None, norm=False):
+    """Apply a geometric transformation to a set of 2-D or 3-D points.
+
+    Trf: 3x3 or 4x4 transformation matrix (typically a homography), optionally batched
+    pts: numpy/torch/tuple of coordinates. Shape must be (...,2) or (...,3)
+
+    ncol: int. number of columns kept in the result (defaults to pts.shape[-1])
+    norm: float. if != 0, the result is projected on the z=norm plane.
+
+    Returns the transformed points, shaped like `pts` with `ncol` trailing columns.
+    """
+    assert Trf.ndim >= 2
+    if isinstance(Trf, np.ndarray):
+        pts = np.asarray(pts)
+    elif isinstance(Trf, torch.Tensor):
+        pts = torch.as_tensor(pts, dtype=Trf.dtype)
+
+    # remember the leading dims so the result can be reshaped back at the end
+    output_reshape = pts.shape[:-1]
+    ncol = ncol or pts.shape[-1]
+
+    # fast path: batched torch matrices (B,d,d)/(B,d+1,d+1) applied to (B,H,W,d) points
+    if (
+        isinstance(Trf, torch.Tensor)
+        and isinstance(pts, torch.Tensor)
+        and Trf.ndim == 3
+        and pts.ndim == 4
+    ):
+        d = pts.shape[3]
+        if Trf.shape[-1] == d:
+            pts = torch.einsum("bij, bhwj -> bhwi", Trf, pts)
+        elif Trf.shape[-1] == d + 1:
+            pts = (
+                torch.einsum("bij, bhwj -> bhwi", Trf[:, :d, :d], pts)
+                + Trf[:, None, None, :d, d]
+            )
+        else:
+            raise ValueError(f"bad shape, not ending with 3 or 4, for {pts.shape=}")
+    else:
+        if Trf.ndim >= 3:
+            n = Trf.ndim - 2
+            assert Trf.shape[:n] == pts.shape[:n], "batch size does not match"
+            Trf = Trf.reshape(-1, Trf.shape[-2], Trf.shape[-1])
+
+            if pts.ndim > Trf.ndim:
+                # Trf == (B,d,d) & pts == (B,H,W,d) --> (B, H*W, d)
+                pts = pts.reshape(Trf.shape[0], -1, pts.shape[-1])
+            elif pts.ndim == 2:
+                # Trf == (B,d,d) & pts == (B,d) --> (B, 1, d)
+                pts = pts[:, None, :]
+
+        if pts.shape[-1] + 1 == Trf.shape[-1]:
+            Trf = Trf.swapaxes(-1, -2)  # transpose Trf
+            pts = pts @ Trf[..., :-1, :] + Trf[..., -1:, :]
+        elif pts.shape[-1] == Trf.shape[-1]:
+            Trf = Trf.swapaxes(-1, -2)  # transpose Trf
+            pts = pts @ Trf
+        else:
+            pts = Trf @ pts.T
+            if pts.ndim >= 2:
+                pts = pts.swapaxes(-1, -2)
+
+    if norm:
+        pts = pts / pts[..., -1:]  # DONT DO /= BECAUSE OF WEIRD PYTORCH BUG
+        if norm != 1:
+            pts *= norm
+
+    res = pts[..., :ncol].reshape(*output_reshape, ncol)
+    return res
+
+
+def inv(mat):
+    """Return the inverse of `mat` using the backend matching its type (torch or numpy)."""
+    for kind, backend in ((torch.Tensor, torch.linalg), (np.ndarray, np.linalg)):
+        if isinstance(mat, kind):
+            return backend.inv(mat)
+    # neither a torch tensor nor a numpy array
+    raise ValueError(f"bad matrix type = {type(mat)}")
+
+
+@cache
+def pixel_grid(H, W):  # (H, W, 2) float32 grid, [..., 0] = column index, [..., 1] = row index; the cached array is shared between calls
+    return np.mgrid[:W, :H].T.astype(np.float32)
+
+
+def fast_pnp(pts3d, focal, msk, device, pp=None, niter_PnP=10):
+    # Estimate (focal, cam-to-world 4x4 pose) from an (H, W, 3) pointmap with RANSAC-PnP; returns None on failure
+    if msk.sum() < 4:
+        return None  # we need at least 4 points for PnP
+    pts3d, msk = map(to_numpy, (pts3d, msk))
+
+    H, W, THREE = pts3d.shape
+    assert THREE == 3
+    pixels = pixel_grid(H, W)
+
+    if focal is None:
+        S = max(W, H)
+        tentative_focals = np.geomspace(S / 2, S * 3, 21)  # 21 log-spaced candidates in [S/2, 3S]
+    else:
+        tentative_focals = [focal]
+
+    if pp is None:
+        pp = (W / 2, H / 2)  # default principal point: image centre
+    else:
+        pp = to_numpy(pp)
+
+    best = (0,)  # becomes (inlier count, R, T, focal) once a candidate succeeds
+    for focal in tentative_focals:
+        K = np.float32([(focal, 0, pp[0]), (0, focal, pp[1]), (0, 0, 1)])
+
+        success, R, T, inliers = cv2.solvePnPRansac(
+            pts3d[msk],
+            pixels[msk],
+            K,
+            None,
+            iterationsCount=niter_PnP,
+            reprojectionError=5,
+            flags=cv2.SOLVEPNP_SQPNP,
+        )
+        if not success:
+            continue
+
+        score = len(inliers)  # candidates are ranked by their number of RANSAC inliers
+        if success and score > best[0]:
+            best = score, R, T, focal
+
+    if not best[0]:
+        return None  # no candidate focal succeeded with at least one inlier
+
+    _, R, T, best_focal = best
+    R = cv2.Rodrigues(R)[0]  # world to cam
+    R, T = map(torch.from_numpy, (R, T))
+    return best_focal, inv(sRT_to_4x4(1, R, T, device))  # cam to world
+
+
+def get_med_dist_between_poses(poses):
+    from scipy.spatial.distance import pdist
+    # median of all pairwise distances between the translation parts (p[:3, 3]) of the poses
+    return np.median(pdist([to_numpy(p[:3, 3]) for p in poses]))
+
+
+def align_multiple_poses(src_poses, target_poses):  # returns (s, R, T) registering the src point set to the target one
+    N = len(src_poses)
+    assert src_poses.shape == target_poses.shape == (N, 4, 4)
+    # each pose contributes 2 points: its translation, and that point moved eps along its 3rd rotation column
+    def center_and_z(poses):
+        eps = get_med_dist_between_poses(poses) / 100
+        return torch.cat((poses[:, :3, 3], poses[:, :3, 3] + eps * poses[:, :3, 2]))
+    # roma registration with scaling enabled; roma's (R, T, s) is reordered to (s, R, T) below
+    R, T, s = roma.rigid_points_registration(
+        center_and_z(src_poses), center_and_z(target_poses), compute_scaling=True
+    )
+    return s, R, T
+
+
+def cosine_schedule(t, lr_start, lr_end):  # t in [0, 1]: lr_start at t=0, lr_end at t=1, half-cosine in between
+    assert 0 <= t <= 1
+    return lr_end + (lr_start - lr_end) * (1 + np.cos(t * np.pi)) / 2
+
+
+def linear_schedule(t, lr_start, lr_end):  # t in [0, 1]: straight line from lr_start (t=0) to lr_end (t=1)
+    assert 0 <= t <= 1
+    return lr_start + (lr_end - lr_start) * t
+
+
+def cycled_linear_schedule(t, lr_start, lr_end, num_cycles=2):
+    assert 0 <= t <= 1
+    scaled = t * num_cycles
+    frac = scaled - int(scaled)  # position inside the current cycle
+    # at the very end (t == 1) land on lr_end instead of wrapping back to lr_start
+    phase = 1 if t == 1 else frac
+    return linear_schedule(phase, lr_start, lr_end)
+
+
+def adjust_learning_rate_by_lr(optimizer, lr):
+    # Set the "lr" of every param group of `optimizer` in place:
+    #   - a group with an "lr_scale" entry gets lr * lr_scale
+    #   - any other group gets lr unchanged
+    for group in optimizer.param_groups:
+        group["lr"] = lr * group["lr_scale"] if "lr_scale" in group else lr
diff --git a/extern/CUT3R/config/dpt_512_vary_4_64.yaml b/extern/CUT3R/config/dpt_512_vary_4_64.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..20c64b00a9b8b30b9fa19bf3bc462c4199a58013
--- /dev/null
+++ b/extern/CUT3R/config/dpt_512_vary_4_64.yaml
@@ -0,0 +1,103 @@
+model: "ARCroco3DStereo(ARCroco3DStereoConfig(freeze='encoder', state_size=768, state_pe='2d', pos_embed='RoPE100', rgb_head=True, pose_head=True, patch_embed_cls='ManyAR_PatchEmbed', img_size=(512, 512), head_type='dpt', output_mode='pts3d+pose', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), pose_mode=('exp', -inf, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12, landscape_only=False))"
+pretrained: cut3r_512_dpt_4_64.pth
+load_only_encoder: False
+long_context: True
+fixed_length: False
+resume: null
+benchmark: False
+num_views : 64
+num_test_views : 4
+n_corres_train: 0
+n_corres_test: 0
+
+train_criterion: ConfLoss(Regr3DPoseBatchList(L21, norm_mode='?avg_dis'), alpha=0.2) + RGBLoss(MSE)
+test_criterion: Regr3DPose(L21, norm_mode='?avg_dis', gt_scale=True, sky_loss_value=0) + Regr3DPose_ScaleInv(L21, norm_mode='?avg_dis', gt_scale=True, sky_loss_value=0) + RGBLoss(L21)
+
+resolution: [(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)]
+
+allow_repeat: True
+dataset1: Co3d_Multi(allow_repeat=${allow_repeat}, split='train', ROOT='../../data/dust3r_data/processed_co3d/', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset2: WildRGBD_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/dust3r_data/processed_wildrgbd", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+
+dataset3: ARKitScenes_Multi(allow_repeat=${allow_repeat}, split='train', ROOT='../../data/dust3r_data/processed_arkitscenes/', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset4: ARKitScenesHighRes_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/dust3r_data/processed_arkitscenes_highres", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset5: ScanNetpp_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/dust3r_data/processed_scannetpp/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset6: ScanNet_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/dust3r_data/processed_scannet/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset7: HyperSim_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/custom_data/processed_hypersim", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+
+dataset8: BlendedMVS_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/dust3r_data/processed_blendedmvs/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset9: MegaDepth_Multi(allow_repeat=${allow_repeat}, split="train", ROOT="../../data/dust3r_data/processed_megadepth", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset10: MapFree_Multi(allow_repeat=${allow_repeat}, split=None, ROOT="../../data/mast3r_data/processed_mapfree/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset11: Waymo_Multi(allow_repeat=${allow_repeat}, split=None, ROOT="../../data/dust3r_data/processed_waymo/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset12: VirtualKITTI2_Multi(allow_repeat=${allow_repeat}, split=None, ROOT="../../data/mast3r_data/processed_vkitti", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset13: UnReal4K_Multi(allow_repeat=${allow_repeat}, split=None, ROOT="../../data/mast3r_data/processed_unreal4k/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset14: TartanAir_Multi(allow_repeat=${allow_repeat}, split=None, ROOT="../../data/mast3r_data/processed_tartanair/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+
+dataset15: DL3DV_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/custom_data/processed_dl3dv", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+
+dataset16: Cop3D_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/custom_data/processed_cop3d/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset17: MVImgNet_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/custom_data/processed_mvimgnet/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset18: RE10K_Multi(allow_repeat=${allow_repeat}, split=None, ROOT="../../data/custom_data/processed_re10k/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset19: OmniObject3D_Multi(allow_repeat=${allow_repeat}, split=None, ROOT="../../data/custom_data/processed_omniobject3d/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+
+dataset20: ThreeDKenBurns(allow_repeat=${allow_repeat}, split=None, ROOT="../../data/custom_data/processed_3dkb/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset21: IRS(allow_repeat=${allow_repeat}, split=None, ROOT="../../data/custom_data/processed_irs/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset22: SynScapes(allow_repeat=${allow_repeat}, split=None, ROOT="../../data/custom_data/processed_synscapes/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset23: UrbanSyn(allow_repeat=${allow_repeat}, split=None, ROOT="../../data/custom_data/processed_urbansyn/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset24: EDEN_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/custom_data/processed_eden", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset25: SmartPortraits_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/custom_data/processed_smartportraits", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+
+dataset26: DynamicReplica(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/custom_data/processed_dynamic_replica/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset27: Spring(allow_repeat=${allow_repeat}, split=None, ROOT="../../data/custom_data/processed_spring/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset28: BEDLAM_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/custom_data/processed_bedlam", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset29: MVS_Synth_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/custom_data/processed_mvs_synth", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset30: PointOdyssey_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/custom_data/processed_point_odyssey", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+
+dataset31: UASOL_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/custom_data/processed_uasol", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset32: MP3D_Multi(allow_repeat=${allow_repeat}, split=None, ROOT="../../data/custom_data/processed_mp3d/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+
+train_dataset: 44800 @ ${dataset1} + 56000 @ ${dataset2} + 56000 @ ${dataset3} + 22400 @ ${dataset4} 
+  + 16800 @ ${dataset5} + 22400 @ ${dataset6} + 11200 @ ${dataset7} 
+  + 22400 @ ${dataset8} + 22400 @ ${dataset9} + 84000 @ ${dataset10} + 56000 @ ${dataset11}
+  + 5600 @ ${dataset12} + 168 @ ${dataset13} + 56000 @ ${dataset14} + 84000 @ ${dataset15}
+  + 480 @ ${dataset16} + 19200 @ ${dataset17} + 4800 @ ${dataset18} + 38400 @ ${dataset19}
+  + 26400 @ ${dataset26} + 1200 @ ${dataset27} + 36000 @ ${dataset28} + 2400 @ ${dataset29}
+  + 24000 @ ${dataset30} + 14400 @ ${dataset31} + 28800 @ ${dataset32}
+test_dataset: 1000 @ ARKitScenes_Multi(split='test', ROOT='../../data/dust3r_data/processed_arkitscenes/', resolution=(512, 384), num_views=${num_test_views}, seed=42, n_corres=${n_corres_test})
+
+seed: 0
+batch_size: 4
+accum_iter: 4
+gradient_checkpointing: True
+epochs: 10
+start_epoch: 0
+weight_decay: 0.05
+lr: 1e-6
+min_lr: 1e-7
+warmup_epochs: 0.5
+amp: 1
+
+num_workers: 4
+world_size: 1
+local-rank: -1
+dist_url: 'env://'
+rank: 0
+gpu: 0
+distributed: False
+dist_backend: 'nccl'
+
+eval_freq: 1
+save_freq: 0.1
+keep_freq: 1
+print_freq: 10
+print_img_freq: 50000000
+num_imgs_vis: 4
+save_dir: 'checkpoints'
+exp_name: 'dpt_512_vary_4_64'
+task: 'cut3r'
+logdir: ./${save_dir}/${exp_name}/logs
+output_dir: ./${save_dir}/${exp_name}/
+hydra:
+  verbose: True
+  run:
+    dir: ./${save_dir}/${exp_name}
\ No newline at end of file
diff --git a/extern/CUT3R/config/linear_224_fixed_16.yaml b/extern/CUT3R/config/linear_224_fixed_16.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1690a287e90b165e01ed9471fd6ff34a0b353552
--- /dev/null
+++ b/extern/CUT3R/config/linear_224_fixed_16.yaml
@@ -0,0 +1,99 @@
+model: "ARCroco3DStereo(ARCroco3DStereoConfig(freeze='encoder', state_size=768, state_pe='2d', pos_embed='RoPE100', rgb_head=True, pose_head=True,  img_size=(224, 224), head_type='linear', output_mode='pts3d+pose', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), pose_mode=('exp', -inf, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12))"
+pretrained: cut3r_224_linear_4.pth
+load_only_encoder: False
+long_context: False
+fixed_length: True
+resume: null
+benchmark: True
+num_views : 16
+num_test_views : 4
+n_corres_train: 0
+n_corres_test: 0
+
+train_criterion: ConfLoss(Regr3DPoseBatchList(L21, norm_mode='?avg_dis'), alpha=0.2) + RGBLoss(MSE)
+test_criterion: Regr3DPose(L21, norm_mode='?avg_dis', gt_scale=True, sky_loss_value=0) + Regr3DPose_ScaleInv(L21, norm_mode='?avg_dis', gt_scale=True, sky_loss_value=0) + RGBLoss(L21)
+
+
+dataset1: Co3d_Multi(allow_repeat=False, split='train', ROOT='../../data/dust3r_data/processed_co3d/', aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset2: WildRGBD_Multi(allow_repeat=False, split='train', ROOT="../../data/dust3r_data/processed_wildrgbd", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+
+dataset3: ARKitScenes_Multi(allow_repeat=False, split='train', ROOT='../../data/dust3r_data/processed_arkitscenes/', aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset4: ARKitScenesHighRes_Multi(allow_repeat=False, split='train', ROOT="../../data/dust3r_data/processed_arkitscenes_highres", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset5: ScanNetpp_Multi(allow_repeat=False, split='train', ROOT="../../data/dust3r_data/processed_scannetpp/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset6: ScanNet_Multi(allow_repeat=False, split='train', ROOT="../../data/dust3r_data/processed_scannet/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset7: HyperSim_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_hypersim", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+
+dataset8: BlendedMVS_Multi(allow_repeat=False, split='train', ROOT="../../data/dust3r_data/processed_blendedmvs/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset9: MegaDepth_Multi(allow_repeat=False, split="train", ROOT="../../data/dust3r_data/processed_megadepth", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset10: MapFree_Multi(allow_repeat=False, split=None, ROOT="../../data/mast3r_data/processed_mapfree/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset11: Waymo_Multi(allow_repeat=False, split=None, ROOT="../../data/dust3r_data/processed_waymo/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset12: VirtualKITTI2_Multi(allow_repeat=False, split=None, ROOT="../../data/mast3r_data/processed_vkitti", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset13: UnReal4K_Multi(allow_repeat=False, split=None, ROOT="../../data/mast3r_data/processed_unreal4k/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset14: TartanAir_Multi(allow_repeat=False, split=None, ROOT="../../data/mast3r_data/processed_tartanair/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+
+dataset15: DL3DV_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_dl3dv", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+
+dataset16: Cop3D_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_cop3d/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset17: MVImgNet_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_mvimgnet/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset18: RE10K_Multi(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_re10k/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset19: OmniObject3D_Multi(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_omniobject3d/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+
+dataset20: ThreeDKenBurns(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_3dkb/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset21: IRS(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_irs/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset22: SynScapes(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_synscapes/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset23: UrbanSyn(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_urbansyn/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset24: EDEN_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_eden", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset25: SmartPortraits_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_smartportraits", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+
+dataset26: DynamicReplica(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_dynamic_replica/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset27: Spring(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_spring/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset28: BEDLAM_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_bedlam", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset29: MVS_Synth_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_mvs_synth", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset30: PointOdyssey_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_point_odyssey", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+
+dataset31: UASOL_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_uasol", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset32: MP3D_Multi(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_mp3d/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+
+dataset33: HOI4D_Multi(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_hoi4d/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+
+train_dataset: 44800 @ ${dataset1} + 56000 @ ${dataset2} + 56000 @ ${dataset3} + 5600 @ ${dataset4} + 5600 @ ${dataset5} + 140000 @ ${dataset6} + 5600 @ ${dataset7} + 22400 @ ${dataset8} + 16800 @ ${dataset9} + 56000 @ ${dataset10}  + 42000 @ ${dataset11} + 5600 @ ${dataset12} + 168 @ ${dataset13} + 84000 @ ${dataset14} + 84000 @ ${dataset15} + 7200 @ ${dataset16} + 19200 @ ${dataset17} + 9600 @ ${dataset18} + 24000 @ ${dataset19} + 33600 @ ${dataset26} + 2400 @ ${dataset27} + 9600 @ ${dataset28} + 4800 @ ${dataset29} + 28800 @ ${dataset30} + 14400 @ ${dataset31} + 19200 @ ${dataset32}
+
+
+test_dataset: 1000 @ ARKitScenes_Multi(split='test', ROOT='../../data/dust3r_data/processed_arkitscenes/', resolution=224, num_views=${num_test_views}, seed=42, n_corres=${n_corres_test})
+
+seed: 0
+batch_size: 6
+accum_iter: 2
+gradient_checkpointing: False
+epochs: 10
+start_epoch: 0
+weight_decay: 0.05
+lr: 1e-6
+min_lr: 1e-7
+warmup_epochs: 0.5
+amp: 1
+
+num_workers: 16
+world_size: 1
+local-rank: -1
+dist_url: 'env://'
+rank: 0
+gpu: 0
+distributed: False
+dist_backend: 'nccl'
+
+eval_freq: 1
+save_freq: 0.1
+keep_freq: 1
+print_freq: 10
+print_img_freq: 50000000
+num_imgs_vis: 4
+save_dir: 'checkpoints'
+exp_name: 'linear_224_fixed_16'
+task: 'cut3r'
+logdir: ./${save_dir}/${exp_name}/logs
+output_dir: ./${save_dir}/${exp_name}/
+hydra:
+  verbose: True
+  run:
+    dir: ./${save_dir}/${exp_name}
\ No newline at end of file
diff --git a/extern/CUT3R/config/stage1.yaml b/extern/CUT3R/config/stage1.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3e4897e7f26b7576fd779bdaac8d2f68512de2e9
--- /dev/null
+++ b/extern/CUT3R/config/stage1.yaml
@@ -0,0 +1,74 @@
+model: "ARCroco3DStereo(ARCroco3DStereoConfig(state_size=768, state_pe='2d', pos_embed='RoPE100', rgb_head=True, pose_head=True,  img_size=(224, 224), head_type='linear', output_mode='pts3d+pose', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), pose_mode=('exp', -inf, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12))"
+pretrained: null
+load_only_encoder: False
+long_context: False
+fixed_length: True
+resume: null
+benchmark: True
+num_views : 4
+num_test_views : 4
+n_corres_train: 0
+n_corres_test: 0
+
+train_criterion: ConfLoss(Regr3DPose(L21, norm_mode='?avg_dis'), alpha=0.2) + RGBLoss(MSE)
+test_criterion: Regr3DPose(L21, norm_mode='?avg_dis', gt_scale=True, sky_loss_value=0) + Regr3DPose_ScaleInv(L21, norm_mode='?avg_dis', gt_scale=True, sky_loss_value=0) + RGBLoss(L21)
+
+dataset1: Co3d_Multi(split='train', ROOT='../../data/dust3r_data/processed_co3d/', aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset2: WildRGBD_Multi(split='train', ROOT="../../data/dust3r_data/processed_wildrgbd", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+
+dataset3: ARKitScenes_Multi(split='train', ROOT='../../data/dust3r_data/processed_arkitscenes/', aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset4: ARKitScenesHighRes_Multi(split='train', ROOT="../../data/dust3r_data/processed_arkitscenes_highres", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset5: ScanNetpp_Multi(split='train', ROOT="../../data/dust3r_data/processed_scannetpp/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset6: ScanNet_Multi(split='train', ROOT="../../data/dust3r_data/processed_scannet/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset7: HyperSim_Multi(split='train', ROOT="../../data/custom_data/processed_hypersim", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+
+dataset8: BlendedMVS_Multi(split='train', ROOT="../../data/dust3r_data/processed_blendedmvs/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset9: MegaDepth_Multi(split="train", ROOT="../../data/dust3r_data/processed_megadepth", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset10: MapFree_Multi(split=None, ROOT="../../data/mast3r_data/processed_mapfree/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset11: Waymo_Multi(split=None, ROOT="../../data/dust3r_data/processed_waymo/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset12: VirtualKITTI2_Multi(split=None, ROOT="../../data/mast3r_data/processed_vkitti", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset13: UnReal4K_Multi(split=None, ROOT="../../data/mast3r_data/processed_unreal4k/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset14: TartanAir_Multi(split=None, ROOT="../../data/mast3r_data/processed_tartanair/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+
+dataset15: DL3DV_Multi(split='train', ROOT="../../data/custom_data/processed_dl3dv", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+
+train_dataset: 32000 @ ${dataset1} + 48000 @ ${dataset2} + 100800 @ ${dataset3} + 56000 @ ${dataset4} + 33600 @ ${dataset5} + 56000 @ ${dataset6} + 33600 @ ${dataset7} + 33600 @ ${dataset8} + 33600 @ ${dataset9} + 100800 @ ${dataset10} + 78400 @ ${dataset11} + 5000 @ ${dataset12} + 1000 @ ${dataset13} + 33600 @ ${dataset14} + 160000 @ ${dataset15}
+test_dataset: 1000 @ ARKitScenes_Multi(split='test', ROOT='../../data/dust3r_data/processed_arkitscenes/', resolution=224, num_views=${num_test_views}, seed=42, n_corres=${n_corres_test})
+
+
+seed: 0
+batch_size: 16
+accum_iter: 1
+gradient_checkpointing: False
+epochs: 100
+start_epoch: 0
+weight_decay: 0.05
+lr: 1e-4
+min_lr: 1e-6
+warmup_epochs: 10
+amp: 1
+
+num_workers: 8
+world_size: 1
+local-rank: -1
+dist_url: 'env://'
+rank: 0
+gpu: 0
+distributed: False
+dist_backend: 'nccl'
+
+eval_freq: 1
+save_freq: 1
+keep_freq: 10
+print_freq: 10
+print_img_freq: 500
+num_imgs_vis: 4
+save_dir: 'checkpoints'
+exp_name: 'train_first_stage'
+task: 'cut3r'
+logdir: ./${save_dir}/${exp_name}/logs
+output_dir: ./${save_dir}/${exp_name}/
+hydra:
+  verbose: True
+  run:
+    dir: ./${save_dir}/${exp_name}
\ No newline at end of file
diff --git a/extern/CUT3R/config/stage2.yaml b/extern/CUT3R/config/stage2.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..10fec1d29a33bbf7484d537076e334aaf1c94fd7
--- /dev/null
+++ b/extern/CUT3R/config/stage2.yaml
@@ -0,0 +1,132 @@
+model: ARCroco3DStereo(ARCroco3DStereoConfig(state_size=768, state_pe='2d', pos_embed='RoPE100',
+  rgb_head=True, pose_head=True,  img_size=(224, 224), head_type='linear', output_mode='pts3d+pose',
+  depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), pose_mode=('exp', -inf,
+  inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12,
+  dec_num_heads=12))
+pretrained: checkpoints/train_first_stage/checkpoint-final.pth
+load_only_encoder: False
+long_context: False
+fixed_length: True
+resume: null
+benchmark: True
+num_views : 4
+num_test_views : 4
+n_corres_train: 0
+n_corres_test: 0
+
+
+train_criterion: ConfLoss(Regr3DPoseBatchList(L21, norm_mode='?avg_dis'), alpha=0.2) + RGBLoss(MSE)
+test_criterion: Regr3DPose(L21, norm_mode='?avg_dis', gt_scale=True, sky_loss_value=0) + Regr3DPose_ScaleInv(L21, norm_mode='?avg_dis', gt_scale=True, sky_loss_value=0) + RGBLoss(L21)
+
+
+dataset1: Co3d_Multi(split='train', ROOT='../../data/dust3r_data/processed_co3d/',
+  aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset2: WildRGBD_Multi(split='train', ROOT="../../data/dust3r_data/processed_wildrgbd",
+  aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset3: ARKitScenes_Multi(split='train', ROOT='../../data/dust3r_data/processed_arkitscenes/',
+  aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset4: ARKitScenesHighRes_Multi(split='train', ROOT="../../data/dust3r_data/processed_arkitscenes_highres",
+  aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset5: ScanNetpp_Multi(split='train', ROOT="../../data/dust3r_data/processed_scannetpp/",
+  aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset6: ScanNet_Multi(split='train', ROOT="../../data/dust3r_data/processed_scannet/",
+  aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset7: HyperSim_Multi(split='train', ROOT="../../data/custom_data/processed_hypersim",
+  aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset8: BlendedMVS_Multi(split='train', ROOT="../../data/dust3r_data/processed_blendedmvs/",
+  aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset9: MegaDepth_Multi(split="train", ROOT="../../data/dust3r_data/processed_megadepth",
+  aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset10: MapFree_Multi(split=None, ROOT="../../data/mast3r_data/processed_mapfree/",
+  aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset11: Waymo_Multi(split=None, ROOT="../../data/dust3r_data/processed_waymo/",
+  aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset12: VirtualKITTI2_Multi(split=None, ROOT="../../data/mast3r_data/processed_vkitti",
+  aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset13: UnReal4K_Multi(split=None, ROOT="../../data/mast3r_data/processed_unreal4k/",
+  aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset14: TartanAir_Multi(split=None, ROOT="../../data/mast3r_data/processed_tartanair/",
+  aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset15: DL3DV_Multi(split='train', ROOT="../../data/custom_data/processed_dl3dv",
+  aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset16: Cop3D_Multi(split='train', ROOT="../../data/custom_data/processed_cop3d/",
+  aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset17: MVImgNet_Multi(split='train', ROOT="../../data/custom_data/processed_mvimgnet/",
+  aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset18: RE10K_Multi(split=None, ROOT="../../data/custom_data/processed_re10k/",
+  aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset19: OmniObject3D_Multi(split=None, ROOT="../../data/custom_data/processed_omniobject3d/",
+  aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset20: ThreeDKenBurns(split=None, ROOT="../../data/custom_data/processed_3dkb/",
+  aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset21: IRS(split=None, ROOT="../../data/custom_data/processed_irs/", aug_crop=16,
+  resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset22: SynScapes(split=None, ROOT="../../data/custom_data/processed_synscapes/",
+  aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset23: UrbanSyn(split=None, ROOT="../../data/custom_data/processed_urbansyn/",
+  aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset24: EDEN_Multi(split='train', ROOT="../../data/custom_data/processed_eden",
+  aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset25: SmartPortraits_Multi(split='train', ROOT="../../data/custom_data/processed_smartportraits",
+  aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset26: DynamicReplica(split='train', ROOT="../../data/custom_data/processed_dynamic_replica/",
+  aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset27: Spring(split=None, ROOT="../../data/custom_data/processed_spring/", aug_crop=16,
+  resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset28: BEDLAM_Multi(split='train', ROOT="../../data/custom_data/processed_bedlam",
+  aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset29: MVS_Synth_Multi(split='train', ROOT="../../data/custom_data/processed_mvs_synth",
+  aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset30: PointOdyssey_Multi(split='train', ROOT="../../data/custom_data/processed_point_odyssey",
+  aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset31: UASOL_Multi(split='train', ROOT="../../data/custom_data/processed_uasol",
+  aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+dataset32: MP3D_Multi(split=None, ROOT="../../data/custom_data/processed_mp3d/", aug_crop=16,
+  resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
+train_dataset: 48000 @ ${dataset1} + 60000 @ ${dataset2} + 54000 @ ${dataset3} + 18000
+  @ ${dataset4} + 6000 @ ${dataset5} + 42000 @ ${dataset6} + 12000 @ ${dataset7} +
+  6000 @ ${dataset8} + 6000 @ ${dataset9} + 60000 @ ${dataset10} + 48000 @ ${dataset11}
+  + 2400 @ ${dataset12} + 180 @ ${dataset13} + 18000 @ ${dataset14} + 222000 @ ${dataset15}
+  + 400 @ ${dataset16} + 16000 @ ${dataset17} + 4000 @ ${dataset18} + 32000 @ ${dataset19}
+  + 4000 @ ${dataset20} + 2000 @ ${dataset21} + 2000 @ ${dataset22} + 500 @ ${dataset23}
+  + 12000 @ ${dataset24} + 16000 @ ${dataset25} + 20000 @ ${dataset26} + 400 @ ${dataset27}
+  + 32000 @ ${dataset28} + 2000 @ ${dataset29} + 20000 @ ${dataset30} + 12000 @ ${dataset31}
+  + 24000 @ ${dataset32}
+test_dataset: 1000 @ ARKitScenes_Multi(split='test', ROOT='../../data/dust3r_data/processed_arkitscenes/', resolution=224, num_views=${num_test_views}, seed=42, n_corres=${n_corres_test})
+
+seed: 0
+batch_size: 16
+accum_iter: 1
+gradient_checkpointing: false
+epochs: 35
+start_epoch: 0
+weight_decay: 0.05
+lr: 5.0e-06
+min_lr: 1.0e-06
+warmup_epochs: 1
+amp: 1
+
+num_workers: 8
+world_size: 1
+local-rank: -1
+dist_url: 'env://'
+rank: 0
+gpu: 0
+distributed: False
+dist_backend: 'nccl'
+
+eval_freq: 1
+save_freq: 1
+keep_freq: 10
+print_freq: 10
+print_img_freq: 500
+num_imgs_vis: 4
+save_dir: 'checkpoints'
+exp_name: 'train_second_stage'
+task: 'cut3r'
+logdir: ./${save_dir}/${exp_name}/logs
+output_dir: ./${save_dir}/${exp_name}/
+hydra:
+  verbose: True
+  run:
+    dir: ./${save_dir}/${exp_name}
\ No newline at end of file
diff --git a/extern/CUT3R/config/stage3.yaml b/extern/CUT3R/config/stage3.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..88cafa0e1a6afe2761e1b81742876ed2cbd77068
--- /dev/null
+++ b/extern/CUT3R/config/stage3.yaml
@@ -0,0 +1,219 @@
+model: ARCroco3DStereo(ARCroco3DStereoConfig(state_size=768, state_pe='2d', pos_embed='RoPE100',
+  rgb_head=True, pose_head=True, patch_embed_cls='ManyAR_PatchEmbed', img_size=(512,
+  512), head_type='dpt', output_mode='pts3d+pose', depth_mode=('exp', -inf, inf),
+  conf_mode=('exp', 1, inf), pose_mode=('exp', -inf, inf), enc_embed_dim=1024, enc_depth=24,
+  enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12, landscape_only=False))
+pretrained: checkpoints/train_second_stage/checkpoint-final.pth
+load_only_encoder: False
+long_context: False
+fixed_length: True
+resume: null
+benchmark: True
+num_views : 4
+num_test_views : 4
+n_corres_train: 0
+n_corres_test: 0
+
+
+train_criterion: ConfLoss(Regr3DPoseBatchList(L21, norm_mode='?avg_dis'), alpha=0.2) + RGBLoss(MSE)
+test_criterion: Regr3DPose(L21, norm_mode='?avg_dis', gt_scale=True, sky_loss_value=0) + Regr3DPose_ScaleInv(L21, norm_mode='?avg_dis', gt_scale=True, sky_loss_value=0) + RGBLoss(L21)
+
+resolution:
+- - 512
+  - 384
+- - 512
+  - 336
+- - 512
+  - 288
+- - 512
+  - 256
+- - 512
+  - 208
+- - 512
+  - 144
+- - 384
+  - 512
+- - 336
+  - 512
+- - 288
+  - 512
+- - 256
+  - 512
+dataset1: Co3d_Multi(allow_repeat=True, split='train', ROOT='../../data/dust3r_data/processed_co3d/',
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset2: WildRGBD_Multi(allow_repeat=True, split='train', ROOT="../../data/dust3r_data/processed_wildrgbd",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset3: ARKitScenes_Multi(allow_repeat=True, split='train', ROOT='../../data/dust3r_data/processed_arkitscenes/',
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset4: ARKitScenesHighRes_Multi(allow_repeat=True, split='train', ROOT="../../data/dust3r_data/processed_arkitscenes_highres",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset5: ScanNetpp_Multi(allow_repeat=True, split='train', ROOT="../../data/dust3r_data/processed_scannetpp/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset6: ScanNet_Multi(allow_repeat=True, split='train', ROOT="../../data/dust3r_data/processed_scannet/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset7: HyperSim_Multi(allow_repeat=True, split='train', ROOT="../../data/custom_data/processed_hypersim",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset8: BlendedMVS_Multi(allow_repeat=True, split='train', ROOT="../../data/dust3r_data/processed_blendedmvs/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset9: MegaDepth_Multi(allow_repeat=True, split="train", ROOT="../../data/dust3r_data/processed_megadepth",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset10: MapFree_Multi(allow_repeat=True, split=None, ROOT="../../data/mast3r_data/processed_mapfree/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset11: Waymo_Multi(allow_repeat=True, split=None, ROOT="../../data/dust3r_data/processed_waymo/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset12: VirtualKITTI2_Multi(allow_repeat=True, split=None, ROOT="../../data/mast3r_data/processed_vkitti",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset13: UnReal4K_Multi(allow_repeat=True, split=None, ROOT="../../data/mast3r_data/processed_unreal4k/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset14: TartanAir_Multi(allow_repeat=True, split=None, ROOT="../../data/mast3r_data/processed_tartanair/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset15: DL3DV_Multi(allow_repeat=True, split='train', ROOT="../../data/custom_data/processed_dl3dv",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset16: Cop3D_Multi(allow_repeat=True, split='train', ROOT="../../data/custom_data/processed_cop3d/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset17: MVImgNet_Multi(allow_repeat=True, split='train', ROOT="../../data/custom_data/processed_mvimgnet/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset18: RE10K_Multi(allow_repeat=True, split=None, ROOT="../../data/custom_data/processed_re10k/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset19: OmniObject3D_Multi(allow_repeat=True, split=None, ROOT="../../data/custom_data/processed_omniobject3d/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset20: ThreeDKenBurns(allow_repeat=True, split=None, ROOT="../../data/custom_data/processed_3dkb/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset21: IRS(allow_repeat=True, split=None, ROOT="../../data/custom_data/processed_irs/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset22: SynScapes(allow_repeat=True, split=None, ROOT="../../data/custom_data/processed_synscapes/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset23: UrbanSyn(allow_repeat=True, split=None, ROOT="../../data/custom_data/processed_urbansyn/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset24: EDEN_Multi(allow_repeat=True, split='train', ROOT="../../data/custom_data/processed_eden",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset25: SmartPortraits_Multi(allow_repeat=True, split='train', ROOT="../../data/custom_data/processed_smartportraits",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset26: DynamicReplica(allow_repeat=True, split='train', ROOT="../../data/custom_data/processed_dynamic_replica/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset27: Spring(allow_repeat=True, split=None, ROOT="../../data/custom_data/processed_spring/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset28: BEDLAM_Multi(allow_repeat=True, split='train', ROOT="../../data/custom_data/processed_bedlam",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset29: MVS_Synth_Multi(allow_repeat=True, split='train', ROOT="../../data/custom_data/processed_mvs_synth",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset30: PointOdyssey_Multi(allow_repeat=True, split='train', ROOT="../../data/custom_data/processed_point_odyssey",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset31: UASOL_Multi(allow_repeat=True, split='train', ROOT="../../data/custom_data/processed_uasol",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset32: MP3D_Multi(allow_repeat=True, split=None, ROOT="../../data/custom_data/processed_mp3d/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset33: HOI4D_Multi(allow_repeat=True, split=None, ROOT="../../data/custom_data/processed_hoi4d/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+train_dataset: 44800 @ ${dataset1} + 56000 @ ${dataset2} + 56000 @ ${dataset3} + 22400
+  @ ${dataset4} + 16800 @ ${dataset5} + 38400 @ ${dataset6} + 11200 @ ${dataset7}
+  + 22400 @ ${dataset8} + 22400 @ ${dataset9} + 84000 @ ${dataset10}  + 20000 @ ${dataset11}
+  + 5600 @ ${dataset12} + 168 @ ${dataset13} + 56000 @ ${dataset14} + 74000 @ ${dataset15}
+  + 480 @ ${dataset16} + 19200 @ ${dataset17} + 4800 @ ${dataset18} + 4800 @ ${dataset20}
+  + 2400 @ ${dataset21} + 2400 @ ${dataset22} + 600 @ ${dataset23} + 19200 @ ${dataset25}
+  + 36000 @ ${dataset26} + 9400 @ ${dataset27} + 36000 @ ${dataset28} + 1400 @ ${dataset29}
+  + 7200 @ ${dataset30} + 14400 @ ${dataset31} + 28800 @ ${dataset32} + 12000 @ ${dataset33}
+test_dataset: 1000 @ ARKitScenes_Multi(split='test', ROOT='../../data/dust3r_data/processed_arkitscenes/', resolution=224, num_views=4, seed=42, n_corres=0)
+
+seed: 0
+batch_size: 16
+accum_iter: 1
+gradient_checkpointing: true
+epochs: 40
+start_epoch: 0
+weight_decay: 0.05
+lr: 1.0e-05
+min_lr: 1.0e-06
+warmup_epochs: 2
+amp: 1
+
+num_workers: 8
+world_size: 1
+local-rank: -1
+dist_url: 'env://'
+rank: 0
+gpu: 0
+distributed: False
+dist_backend: 'nccl'
+
+eval_freq: 1
+save_freq: 1
+keep_freq: 10
+print_freq: 10
+print_img_freq: 500
+num_imgs_vis: 4
+save_dir: 'checkpoints'
+exp_name: 'train_third_stage'
+task: 'cut3r'
+logdir: ./${save_dir}/${exp_name}/logs
+output_dir: ./${save_dir}/${exp_name}/
+hydra:
+  verbose: True
+  run:
+    dir: ./${save_dir}/${exp_name}
\ No newline at end of file
diff --git a/extern/CUT3R/config/stage4.yaml b/extern/CUT3R/config/stage4.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3cdebc13d3998e833b0bacd28d7d001b2c1950ae
--- /dev/null
+++ b/extern/CUT3R/config/stage4.yaml
@@ -0,0 +1,219 @@
+model: ARCroco3DStereo(ARCroco3DStereoConfig(freeze='encoder', state_size=768, state_pe='2d',
+  pos_embed='RoPE100', rgb_head=True, pose_head=True, patch_embed_cls='ManyAR_PatchEmbed',
+  img_size=(512, 512), head_type='dpt', output_mode='pts3d+pose', depth_mode=('exp',
+  -inf, inf), conf_mode=('exp', 1, inf), pose_mode=('exp', -inf, inf), enc_embed_dim=1024,
+  enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12,
+  landscape_only=False))
+pretrained: checkpoints/train_third_stage/checkpoint-final.pth  # lies under ./${save_dir}/${exp_name}/ of the config whose exp_name is 'train_third_stage'
+load_only_encoder: False
+long_context: True
+fixed_length: True
+resume: null
+benchmark: True
+num_views : 32  # interpolated into every datasetN entry below as num_views=${num_views}
+num_test_views : 4  # NOTE(review): not interpolated anywhere in this file; test_dataset hard-codes num_views=4
+n_corres_train: 0  # interpolated into every datasetN entry below as n_corres=${n_corres_train}
+n_corres_test: 0  # NOTE(review): not interpolated anywhere in this file; test_dataset hard-codes n_corres=0
+
+train_criterion: ConfLoss(Regr3DPose(L21, norm_mode='?avg_dis'), alpha=0.2) + RGBLoss(MSE)
+test_criterion: Regr3DPose(L21, norm_mode='?avg_dis', gt_scale=True, sky_loss_value=0)
+  + Regr3DPose_ScaleInv(L21, norm_mode='?avg_dis', gt_scale=True, sky_loss_value=0)
+  + RGBLoss(L21)
+resolution:  # NOTE(review): each item below is a lone string fragment ("(512", "384)", ...), not a (W, H) pair; the datasetN entries carry their own resolution=[...] lists — confirm whether this key is read
+- (512
+- 384)
+- (512
+- 336)
+- (512
+- 288)
+- (512
+- 256)
+- (512
+- 208)
+- (512
+- 144)
+- (384
+- 512)
+- (336
+- 512)
+- (288
+- 512)
+- (256
+- 512)
+dataset1: Co3d_Multi(allow_repeat=False, split='train', ROOT='../../data/dust3r_data/processed_co3d/',
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset2: WildRGBD_Multi(allow_repeat=False, split='train', ROOT="../../data/dust3r_data/processed_wildrgbd",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset3: ARKitScenes_Multi(allow_repeat=False, split='train', ROOT='../../data/dust3r_data/processed_arkitscenes/',
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset4: ARKitScenesHighRes_Multi(allow_repeat=False, split='train', ROOT="../../data/dust3r_data/processed_arkitscenes_highres",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset5: ScanNetpp_Multi(allow_repeat=False, split='train', ROOT="../../data/dust3r_data/processed_scannetpp/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset6: ScanNet_Multi(allow_repeat=False, split='train', ROOT="../../data/dust3r_data/processed_scannet/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset7: HyperSim_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_hypersim",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset8: BlendedMVS_Multi(allow_repeat=False, split='train', ROOT="../../data/dust3r_data/processed_blendedmvs/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset9: MegaDepth_Multi(allow_repeat=False, split="train", ROOT="../../data/dust3r_data/processed_megadepth",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset10: MapFree_Multi(allow_repeat=False, split=None, ROOT="../../data/mast3r_data/processed_mapfree/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset11: Waymo_Multi(allow_repeat=False, split=None, ROOT="../../data/dust3r_data/processed_waymo/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset12: VirtualKITTI2_Multi(allow_repeat=False, split=None, ROOT="../../data/mast3r_data/processed_vkitti",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset13: UnReal4K_Multi(allow_repeat=False, split=None, ROOT="../../data/mast3r_data/processed_unreal4k/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset14: TartanAir_Multi(allow_repeat=False, split=None, ROOT="../../data/mast3r_data/processed_tartanair/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset15: DL3DV_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_dl3dv",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset16: Cop3D_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_cop3d/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset17: MVImgNet_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_mvimgnet/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset18: RE10K_Multi(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_re10k/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset19: OmniObject3D_Multi(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_omniobject3d/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset20: ThreeDKenBurns(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_3dkb/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset21: IRS(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_irs/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset22: SynScapes(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_synscapes/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset23: UrbanSyn(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_urbansyn/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset24: EDEN_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_eden",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset25: SmartPortraits_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_smartportraits",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset26: DynamicReplica(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_dynamic_replica/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset27: Spring(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_spring/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset28: BEDLAM_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_bedlam",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset29: MVS_Synth_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_mvs_synth",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset30: PointOdyssey_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_point_odyssey",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset31: UASOL_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_uasol",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset32: MP3D_Multi(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_mp3d/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset33: HOI4D_Multi(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_hoi4d/",
+  aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
+  (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+train_dataset: 22400 @ ${dataset1} + 28000 @ ${dataset2} + 28000 @ ${dataset3} + 2800
+  @ ${dataset4} + 2800 @ ${dataset5} + 70000 @ ${dataset6} + 2800 @ ${dataset7} +
+  11200 @ ${dataset8} + 8400 @ ${dataset9} + 28000 @ ${dataset10} + 21000 @ ${dataset11}
+  + 2800 @ ${dataset12} + 84 @ ${dataset13} + 42000 @ ${dataset14} + 42000 @ ${dataset15}
+  + 3600 @ ${dataset16} + 9600 @ ${dataset17} + 4800 @ ${dataset18} + 12000 @ ${dataset19}
+  + 16800 @ ${dataset26} + 1200 @ ${dataset27} + 4800 @ ${dataset28} + 2400 @ ${dataset29}
+  + 14400 @ ${dataset30} + 7200 @ ${dataset31} + 9600 @ ${dataset32}
+test_dataset: 1000 @ ARKitScenes_Multi(split='test', ROOT='../../data/dust3r_data/processed_arkitscenes/', resolution=224, num_views=4, seed=42, n_corres=0)
+
+seed: 0
+batch_size: 16
+accum_iter: 1
+gradient_checkpointing: true
+epochs: 10
+start_epoch: 0
+weight_decay: 0.05
+lr: 1.0e-06
+min_lr: 1.0e-07
+warmup_epochs: 0.5
+amp: 1
+
+num_workers: 8
+world_size: 1
+local-rank: -1
+dist_url: 'env://'
+rank: 0
+gpu: 0
+distributed: False
+dist_backend: 'nccl'
+
+eval_freq: 1
+save_freq: 1
+keep_freq: 10
+print_freq: 10
+print_img_freq: 500
+num_imgs_vis: 4
+save_dir: 'checkpoints'
+exp_name: 'train_final_stage'
+task: 'cut3r'
+logdir: ./${save_dir}/${exp_name}/logs
+output_dir: ./${save_dir}/${exp_name}/
+hydra:
+  verbose: True
+  run:
+    dir: ./${save_dir}/${exp_name}
\ No newline at end of file
diff --git a/extern/CUT3R/datasets_preprocess/custom_convert2TUM.py b/extern/CUT3R/datasets_preprocess/custom_convert2TUM.py
new file mode 100644
index 0000000000000000000000000000000000000000..afea80e75ede4a25254b45d4e1077333cef9ca3e
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/custom_convert2TUM.py
@@ -0,0 +1,262 @@
+import os
+import json
+import shutil
+import numpy as np
+import cv2 as cv
+import imageio
+from tqdm import tqdm
+from concurrent.futures import ProcessPoolExecutor, as_completed
+import open3d as o3d
+import scipy.ndimage
+import pickle
+
+# Limit OpenBLAS to one thread per process; main() already fans work out over a pool of os.cpu_count() processes
+os.environ["OPENBLAS_NUM_THREADS"] = "1"  # NOTE(review): assigned after `import numpy` above — confirm the limit actually takes effect
+
+DEPTH_SCALE_FACTOR = 5000  # only referenced by the commented-out 16-bit PNG export in process_pair
+
+
+# Back-project a depth map into one 3D point per pixel: inverse intrinsics applied to (x, y, 1), scaled by depth.
+def pointcloudify_depth(depth, intrinsics, dist_coeff, undistort=True):  # returns float32, one (x, y, z) row per pixel
+    shape = depth.shape[::-1]  # reversed array shape: (width, height) for a 2-D map, the order OpenCV sizes use
+
+    if undistort:
+        undist_intrinsics, _ = cv.getOptimalNewCameraMatrix(  # alpha=1; the second return value (ROI) is discarded
+            intrinsics, dist_coeff, shape, 1, shape
+        )
+        inv_undist_intrinsics = np.linalg.inv(undist_intrinsics)
+
+        map_x, map_y = cv.initUndistortRectifyMap(
+            intrinsics, dist_coeff, None, undist_intrinsics, shape, cv.CV_32FC1
+        )
+        undist_depth = cv.remap(depth, map_x, map_y, cv.INTER_NEAREST)  # nearest-neighbour: depths are copied, never blended
+    else:
+        inv_undist_intrinsics = np.linalg.inv(intrinsics)  # no undistortion: use the raw intrinsics and depth as given
+        undist_depth = depth
+
+    # Pixel-coordinate grids of shape (H, W): grid_x holds column indices, grid_y row indices
+    grid_x, grid_y = np.meshgrid(np.arange(shape[0]), np.arange(shape[1]))
+    grid = np.stack((grid_x, grid_y, np.ones_like(grid_x)), axis=-1)  # homogeneous (x, y, 1) per pixel
+
+    # Flatten to 3 x (H*W) columns and apply the inverse intrinsics
+    grid_flat = grid.reshape(-1, 3).T
+    local_grid = inv_undist_intrinsics @ grid_flat
+
+    # Scale each pixel's vector by its depth (both flattened in the same row-major order)
+    local_grid = local_grid.T * undist_depth.reshape(-1, 1)
+
+    return local_grid.astype(np.float32)
+
+
+def project_pcd_to_depth(pcd, undist_intrinsics, img_size, config):  # rasterise pcd.points into a float32 depth image; pixels no point reaches stay 0
+    h, w = img_size  # bounds used by the in-image mask below
+    points = np.asarray(pcd.points)
+    d = points[:, 2]  # z coordinate of each point, written out as its depth
+    normalized_points = points / points[:, 2][:, np.newaxis]  # (x/z, y/z, 1); NOTE(review): nothing guards z == 0 or z < 0
+    proj_pcd = np.round((undist_intrinsics @ normalized_points.T).T).astype(np.int64)  # rounded to integer pixels; columns 0/1 are x/y
+    proj_mask = (  # keep only points whose pixel lies inside the w x h image
+        (proj_pcd[:, 0] >= 0)
+        & (proj_pcd[:, 0] < w)
+        & (proj_pcd[:, 1] >= 0)
+        & (proj_pcd[:, 1] < h)
+    )
+    proj_pcd = proj_pcd[proj_mask]
+    d = d[proj_mask]
+    pcd_image = np.zeros((config["res_h"], config["res_w"]), dtype=np.float32)  # NOTE(review): sized from config, masked with img_size — confirm they match
+    pcd_image[proj_pcd[:, 1], proj_pcd[:, 0]] = d  # points sharing a pixel: one depth is kept, with no nearest-depth test
+    return pcd_image
+
+
+def smooth_depth(depth):  # replace each pixel by the smallest non-zero depth in its KERNEL_SIZE-wide window; 0 where the window has none
+    MAX_DEPTH_VAL = 1e5  # sentinel standing in for "no depth" while filtering
+    KERNEL_SIZE = 11
+    depth = depth.copy()  # leave the caller's array untouched
+    depth[depth == 0] = MAX_DEPTH_VAL  # zeros must not win the minimum
+    smoothed_depth = scipy.ndimage.minimum_filter(depth, KERNEL_SIZE)
+    smoothed_depth[smoothed_depth == MAX_DEPTH_VAL] = 0  # windows whose minimum is still the sentinel go back to 0
+    return smoothed_depth
+
+
+def align_rgb_depth(rgb, depth, roi, config, rgb_cnf, config_dict, T):  # returns (cropped undistorted rgb, cropped aligned depth, rgb_cnf["undist_intrinsics"])
+    # Undistort rgb image
+    undist_rgb = cv.undistort(
+        rgb,
+        rgb_cnf["intrinsics"],
+        rgb_cnf["dist_coeff"],
+        None,
+        rgb_cnf["undist_intrinsics"],
+    )
+
+    # Create point cloud from depth (config_dict["depth"]["dist_mtx"] is what gets passed as the intrinsics argument)
+    pcd = o3d.geometry.PointCloud()
+    points = pointcloudify_depth(
+        depth, config_dict["depth"]["dist_mtx"], config_dict["depth"]["dist_coef"]
+    )
+    pcd.points = o3d.utility.Vector3dVector(points)
+    # Apply transform T to the cloud; NOTE(review): presumably depth camera -> RGB camera, since the RGB intrinsics are used next — confirm
+    pcd.transform(T)
+
+    # Project aligned point cloud to rgb (bounds taken from the rgb image's height and width)
+    aligned_depth = project_pcd_to_depth(
+        pcd, rgb_cnf["undist_intrinsics"], rgb.shape[:2], config
+    )
+
+    smoothed_aligned_depth = smooth_depth(aligned_depth)
+    x, y, w, h = roi  # crop window: top-left corner (x, y), then width and height
+
+    depth_res = smoothed_aligned_depth[y : y + h, x : x + w]
+    rgb_res = undist_rgb[y : y + h, x : x + w]
+    return rgb_res, depth_res, rgb_cnf["undist_intrinsics"]  # intrinsics are returned as-is, not shifted by (x, y); process_pair passes x = y = 0
+
+
+def process_pair(args):  # pool worker for one (rgb_ts, depth_ts) pair; returns None on success, an error string otherwise
+    (
+        pair,
+        smartphone_folder,
+        azure_depth_folder,
+        final_folder,
+        config,
+        rgb_cnf,
+        config_dict,
+        T,
+    ) = args
+    try:
+        rgb_image = cv.imread(os.path.join(smartphone_folder, f"{pair[0]}.png"))  # pair[0] is the RGB timestamp
+        depth_array = np.load(  # pair[1] is the depth timestamp; allow_pickle=True unpickles object arrays — trusted files only
+            os.path.join(azure_depth_folder, f"{pair[1]}.npy"), allow_pickle=True
+        )
+
+        rgb_image_aligned, depth_array_aligned, intrinsics = align_rgb_depth(
+            rgb_image,
+            depth_array,
+            (0, 0, config["res_w"], config["res_h"]),  # ROI: res_w x res_h window starting at the top-left corner
+            config,
+            rgb_cnf,
+            config_dict,
+            T,
+        )
+        # Save rgb as 8-bit png
+        cv.imwrite(
+            os.path.join(final_folder, "rgb", f"{pair[0]}.png"), rgb_image_aligned
+        )
+
+        # # Save depth as 16-bit unsigned int with scale factor
+        # depth_array_aligned = (depth_array_aligned *
+        #                        DEPTH_SCALE_FACTOR).astype(np.uint16)
+        # imageio.imwrite(os.path.join(final_folder, 'depth', f"{pair[1]}.png"), depth_array_aligned)
+        np.save(  # named by the RGB timestamp: depth frames paired with the same RGB frame overwrite each other
+            os.path.join(final_folder, "depth", f"{pair[0]}.npy"), depth_array_aligned
+        )
+        np.savez(
+            os.path.join(final_folder, "cam", f"{pair[0]}.npz"), intrinsics=intrinsics
+        )
+    except Exception as e:  # reported as text rather than raised; main() prints it and carries on with the other pairs
+        return f"Error processing pair {pair}: {e}"
+    return None
+
+
+def main():  # convert every sequence found under DATA_DIR into rgb/, depth/ and cam/ folders under final_root
+    DATA_DIR_ = "data_smartportraits/SmartPortraits"  # REPLACE WITH YOUR OWN DATA PATH!
+    DATA_DIR = DATA_DIR_.rstrip("/")
+    print(f"{DATA_DIR_} {DATA_DIR}/")
+
+    # Load config.json from the directory that contains this script
+    curr_dir = os.path.dirname(os.path.abspath(__file__))
+    with open(os.path.join(curr_dir, "config.json")) as conf_f:
+        config = json.load(conf_f)
+
+    # Pre-load shared data (handed to every worker task below)
+    with open(os.path.join(curr_dir, config["depth_conf"]), "rb") as config_f:
+        config_dict = pickle.load(config_f)  # NOTE(review): pickle.load can execute code from the file — trusted calibration files only
+
+    rgb_cnf = np.load(
+        os.path.join(curr_dir, config["rgb_intristics"]), allow_pickle=True
+    ).item()
+
+    T = np.load(os.path.join(curr_dir, config["transform_intristics"]))
+
+    final_root = "processed_smartportraits1"  # REPLACE WITH YOUR OWN DATA PATH!
+
+    seqs = []  # one (scene, s, date) tuple per directory found three levels below DATA_DIR
+    for scene in os.listdir(DATA_DIR):
+        scene_path = os.path.join(DATA_DIR, scene)
+        if not os.path.isdir(scene_path):
+            continue
+        for s in os.listdir(scene_path):
+            s_path = os.path.join(scene_path, s)
+            if not os.path.isdir(s_path):
+                continue
+            for date in os.listdir(s_path):
+                date_path = os.path.join(s_path, date)
+                if os.path.isdir(date_path):
+                    seqs.append((scene, s, date))
+
+    for seq in tqdm(seqs):
+        scene, s, date = seq
+        dataset_path = os.path.join(DATA_DIR, scene, s, date)
+        final_folder = os.path.join(final_root, "_".join([scene, s, date]))
+
+        azure_depth_folder = os.path.join(dataset_path, "_azure_depth_image_raw")
+        smartphone_folder = os.path.join(dataset_path, "smartphone_video_frames")
+
+        depth_files = [
+            file for file in os.listdir(azure_depth_folder) if file.endswith(".npy")
+        ]
+        depth_ts = np.array([int(file.split(".")[0]) for file in depth_files])  # integer timestamp = file name up to the first "."
+        depth_ts.sort()
+
+        rgb_files = [
+            file for file in os.listdir(smartphone_folder) if file.endswith(".png")
+        ]
+        rgb_ts = np.array([int(file.split(".")[0]) for file in rgb_files])
+        rgb_ts.sort()
+
+        print(  # NOTE(review): depth_ts[0] / rgb_ts[0] raise IndexError when a folder has no matching files
+            f"Depth timestamps from {depth_ts[0]} to {depth_ts[-1]} (cnt {len(depth_ts)})"
+        )
+        print(f"RGB timestamps from {rgb_ts[0]} to {rgb_ts[-1]} (cnt {len(rgb_ts)})")
+
+        # One pair per depth frame: the RGB timestamp with the smallest absolute difference (an RGB frame can be picked more than once)
+        rgbd_pairs = []
+        for depth_t in depth_ts:
+            idx = np.argmin(np.abs(rgb_ts - depth_t))
+            closest_rgb_t = rgb_ts[idx]
+            rgbd_pairs.append((closest_rgb_t, depth_t))
+
+        # Prepare folder infrastructure (any earlier output for this sequence is deleted first)
+        if os.path.exists(final_folder):
+            shutil.rmtree(final_folder)
+        os.makedirs(os.path.join(final_folder, "depth"), exist_ok=True)
+        os.makedirs(os.path.join(final_folder, "rgb"), exist_ok=True)
+        os.makedirs(os.path.join(final_folder, "cam"), exist_ok=True)
+
+        # Prepare arguments for processing (one tuple per pair, in the order process_pair unpacks it)
+        tasks = [
+            (
+                pair,
+                smartphone_folder,
+                azure_depth_folder,
+                final_folder,
+                config,
+                rgb_cnf,
+                config_dict,
+                T,
+            )
+            for pair in rgbd_pairs
+        ]
+
+        num_workers = os.cpu_count()
+        with ProcessPoolExecutor(max_workers=num_workers) as executor:  # a fresh pool is created for every sequence
+            futures = {executor.submit(process_pair, task): task[0] for task in tasks}
+            for future in tqdm(
+                as_completed(futures),
+                total=len(futures),
+                desc=f"Processing {scene}_{s}_{date}",
+            ):
+                error = future.result()  # None on success, otherwise the error string built by process_pair
+                if error:
+                    print(error)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/extern/CUT3R/datasets_preprocess/flow_IO.py b/extern/CUT3R/datasets_preprocess/flow_IO.py
new file mode 100644
index 0000000000000000000000000000000000000000..1979b245f410be4ce31fcb69cc87b5a55c2c4b49
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/flow_IO.py
@@ -0,0 +1,476 @@
+import struct
+import numpy as np
+import png
+import re
+import sys
+import csv
+from PIL import Image
+import h5py
+
+
+# Constants of the ".flo" optical-flow file format handled by readFloFlow / writeFloFlow.
+FLO_TAG_FLOAT = (
+    202021.25  # first 4 bytes in flo file; check for this when READING the file
+)
+FLO_TAG_STRING = "PIEH"  # first 4 bytes in flo file; use this when WRITING the file
+FLO_UNKNOWN_FLOW_THRESH = 1e9  # flo format threshold for unknown values
+FLO_UNKNOWN_FLOW = 1e10  # value to use to represent unknown flow in flo file format
+
+
+def readFlowFile(filepath):
+    """Load an optical-flow field, choosing the decoder from the file extension.
+
+    Recognised extensions: .flo (Sintel), .png (KITTI), .npy (numpy),
+    .pfm (FlyingThings3D) and .flo5 (Spring).
+
+    filepath: path to the flow file
+    returns: the result of the selected reader (flow with shape height x width x 2)
+    raises: ValueError if the extension is not one of the above
+    """
+    if filepath.endswith(".flo"):
+        return readFloFlow(filepath)
+    if filepath.endswith(".png"):
+        return readPngFlow(filepath)
+    if filepath.endswith(".npy"):
+        return readNpyFlow(filepath)
+    if filepath.endswith(".pfm"):
+        return readPfmFlow(filepath)
+    if filepath.endswith(".flo5"):
+        return readFlo5Flow(filepath)
+    raise ValueError(f"readFlowFile: Unknown file format for {filepath}")
+
+
+def writeFlowFile(flow, filepath):
+    """Store an optical-flow field in the format implied by the file extension.
+
+    Handles .flo (Sintel), .png (KITTI), .npy (numpy) and .flo5 files.
+
+    flow: optical flow with shape height x width x 2; invalid values should be np.nan
+    filepath: destination path; its extension selects the writer
+    raises: ValueError for an empty path or an unknown extension,
+            IOError if flow is not of shape height x width x 2
+    """
+    if not filepath:
+        raise ValueError("writeFlowFile: empty filepath")
+
+    dims = flow.shape
+    if not (len(dims) == 3 and dims[2] == 2):
+        raise IOError(
+            f"writeFlowFile {filepath}: expected shape height x width x 2 but received {flow.shape}"
+        )
+
+    # A taller-than-wide array is legal but often means the axes were swapped.
+    rows, cols = dims[0], dims[1]
+    if rows > cols:
+        print(
+            f"write flo file {filepath}: Warning: Are you writing an upright image? Expected shape height x width x 2, got {flow.shape}"
+        )
+
+    if filepath.endswith(".flo"):
+        return writeFloFlow(flow, filepath)
+    if filepath.endswith(".png"):
+        return writePngFlow(flow, filepath)
+    if filepath.endswith(".npy"):
+        return writeNpyFile(flow, filepath)
+    if filepath.endswith(".flo5"):
+        return writeFlo5File(flow, filepath)
+    raise ValueError(f"writeFlowFile: Unknown file format for {filepath}")
+
+
+def readFloFlow(filepath):
+    """read optical flow from file stored in .flo file format as used in the Sintel dataset (Butler et al., 2012)
+    filepath: path to file where to read from
+    returns: flow as a float64 numpy array with shape height x width x 2; unknown values are np.nan
+    raises: IOError if the name, tag, dimensions or file length are invalid
+    ---
+    ".flo" file format used for optical flow evaluation
+
+    Stores 2-band float image for horizontal (u) and vertical (v) flow components.
+    Floats are stored in little-endian order.
+    A flow value is considered "unknown" if either |u| or |v| is greater than 1e9.
+
+    bytes  contents
+
+    0-3     tag: "PIEH" in ASCII, which in little endian happens to be the float 202021.25
+            (just a sanity check that floats are represented correctly)
+    4-7     width as an integer
+    8-11    height as an integer
+    12-end  data (width*height*2*4 bytes total)
+            the float values for u and v, interleaved, in row order, i.e.,
+            u[row0,col0], v[row0,col0], u[row0,col1], v[row0,col1], ...
+    """
+    if filepath is None:
+        raise IOError("read flo file: empty filename")
+
+    if not filepath.endswith(".flo"):
+        raise IOError(f"read flo file ({filepath}): extension .flo expected")
+
+    with open(filepath, "rb") as stream:
+        header = stream.read(12)
+        if len(header) != 12:
+            raise IOError(f"read flo file({filepath}): file is too short")
+        # "<" pins the little-endian layout the format mandates, independent of the host.
+        tag, width, height = struct.unpack("<fii", header)
+
+        if tag != FLO_TAG_FLOAT:  # simple test for correct endian-ness
+            raise IOError(
+                f"read flo file({filepath}): wrong tag (possibly due to big-endian machine?)"
+            )
+
+        # another sanity check to see that integers were read correctly (99999 should do the trick...)
+        if width < 1 or width > 99999:
+            raise IOError(f"read flo file({filepath}): illegal width {width}")
+
+        if height < 1 or height > 99999:
+            raise IOError(f"read flo file({filepath}): illegal height {height}")
+
+        nBands = 2
+        row_bytes = nBands * width * 4
+        flow = []
+
+        # Read row by row so a corrupt header cannot trigger one huge allocation.
+        for _ in range(height):
+            data = stream.read(row_bytes)
+            # read() returns fewer bytes (never None) when the file ends early.
+            if len(data) != row_bytes:
+                raise IOError(f"read flo file({filepath}): file is too short")
+            row = np.frombuffer(data, dtype="<f4").astype(np.float64)
+            flow.append(row.reshape((width, nBands)))
+
+        if stream.read(1) != b"":
+            raise IOError(f"read flo file({filepath}): file is too long")
+
+        flow = np.asarray(flow)
+        # unknown values are set to nan
+        flow[np.abs(flow) > FLO_UNKNOWN_FLOW_THRESH] = np.nan
+
+        return flow
+
+
+def writeFloFlow(flow, filepath):
+    """
+    write optical flow in .flo format to file as used in the Sintel dataset (Butler et al., 2012)
+    flow: optical flow with shape height x width x 2; np.nan entries are stored as "unknown"
+    filepath: optical flow file path to be saved
+    raises: IOError if the header or a row could not be written completely
+    ---
+    ".flo" file format used for optical flow evaluation
+
+    Stores 2-band float image for horizontal (u) and vertical (v) flow components.
+    Floats are stored in little-endian order.
+    A flow value is considered "unknown" if either |u| or |v| is greater than 1e9.
+
+    bytes  contents
+
+    0-3     tag: "PIEH" in ASCII, which in little endian happens to be the float 202021.25
+            (just a sanity check that floats are represented correctly)
+    4-7     width as an integer
+    8-11    height as an integer
+    12-end  data (width*height*2*4 bytes total)
+            the float values for u and v, interleaved, in row order, i.e.,
+            u[row0,col0], v[row0,col0], u[row0,col1], v[row0,col1], ...
+    """
+
+    height, width, nBands = flow.shape
+
+    # open() raises on failure, so no extra "could not be opened" check is needed.
+    with open(filepath, "wb") as f:
+        # write header; "<" forces the little-endian layout required by the format
+        header = FLO_TAG_STRING.encode("ascii") + struct.pack("<ii", width, height)
+        if f.write(header) != 12:
+            raise IOError(f"write flo file {filepath}: problem writing header")
+
+        # write content
+        n = nBands * width
+        row_packer = struct.Struct(f"<{n}f")
+        for i in range(height):
+            # flatten() copies, so the caller's array is not modified below
+            data = flow[i, :, :].flatten()
+            data[np.isnan(data)] = FLO_UNKNOWN_FLOW
+            result = f.write(row_packer.pack(*data))
+            if result != n * 4:
+                raise IOError(f"write flo file {filepath}: problem writing row {i}")
+
+
+def readPngFlow(filepath):
+    """Decode optical flow from a KITTI-style png (KITTI 12, Geiger et al., 2012; KITTI 15, Menze et al., 2015).
+
+    The first two channels hold u and v encoded as value * 64 + 2**15; a third
+    channel equal to 0 marks the pixel as invalid.
+
+    filepath: path to file where to read from
+    returns: flow as a numpy array with shape height x width x 2. Invalid values are represented as np.nan
+    """
+    # adapted from https://github.com/liruoteng/OpticalFlowToolkit
+    direct = png.Reader(filename=filepath).asDirect()
+    rows = list(direct[2])
+    (w, h) = direct[3]["size"]
+    flow = np.zeros((h, w, 3), dtype=np.float64)
+    # Each png row is a flat, channel-interleaved sequence; de-interleave it.
+    for r, row in enumerate(rows):
+        for c in range(3):
+            flow[r, :, c] = row[c::3]
+
+    invalid = flow[:, :, 2] == 0
+    flow[:, :, 0:2] = (flow[:, :, 0:2] - 2**15) / 64.0
+    flow[invalid, 0] = np.nan
+    flow[invalid, 1] = np.nan
+    return flow[:, :, :2]
+
+
+def writePngFlow(flow, filename):
+    """Encode optical flow as a 16-bit KITTI-style png (KITTI 12, Geiger et al., 2012; KITTI 15, Menze et al., 2015).
+
+    u and v are stored as value * 64 + 2**15, clipped to the uint16 range; the
+    third channel is 1 for valid pixels and 0 where u or v is np.nan.
+
+    flow: optical flow in shape height x width x 2, invalid values should be represented as np.nan
+    filename: path to file where to write to
+    """
+    encoded = 64.0 * flow + 2**15
+    height, width = encoded.shape[0], encoded.shape[1]
+
+    validity = np.ones([height, width, 1])
+    validity[np.isnan(encoded[:, :, 0]) | np.isnan(encoded[:, :, 1])] = 0
+
+    packed = np.concatenate([np.nan_to_num(encoded), validity], axis=-1)
+    packed = np.clip(packed, 0, 2**16 - 1).astype(np.uint16)
+    # pypng expects one flat, channel-interleaved sequence per image row
+    packed = np.reshape(packed, (-1, width * 3))
+
+    with open(filename, "wb") as f:
+        png.Writer(width=width, height=height, bitdepth=16, greyscale=False).write(
+            f, packed
+        )
+
+
+def readNpyFlow(filepath):
+    """Load a numpy array stored with np.save.
+
+    filepath: file to read from
+    returns: the stored numpy array
+    """
+    loaded = np.load(filepath)
+    return loaded
+
+
+def writeNpyFile(arr, filepath):
+    """Persist a numpy array with np.save.
+
+    arr: numpy array to write
+    filepath: file to write to
+    """
+    np.save(file=filepath, arr=arr)
+
+
+def writeFlo5File(flow, filename):
+    """Write flow to an HDF5 file as a gzip-compressed (level 5) dataset named "flow".
+
+    flow: array to store
+    filename: path of the file to create (an existing file is overwritten)
+    """
+    with h5py.File(filename, "w") as h5:
+        h5.create_dataset(
+            "flow",
+            data=flow,
+            compression="gzip",
+            compression_opts=5,
+        )
+
+
+def readFlo5Flow(filename):
+    """Read the "flow" dataset from an HDF5 (.flo5) file.
+
+    filename: path of the file to read
+    returns: the full content of the "flow" dataset
+    raises: IOError if the file has no "flow" dataset
+    """
+    with h5py.File(filename, "r") as h5:
+        if "flow" in h5.keys():
+            return h5["flow"][()]
+        raise IOError(
+            f"File {filename} does not have a 'flow' key. Is this a valid flo5 file?"
+        )
+
+
+def readPfmFlow(filepath):
+    """Read optical flow from a pfm file as used in the FlyingThings3D (Mayer et al., 2016) dataset.
+
+    filepath: path to file where to read from
+    returns: flow as a numpy array with shape height x width x 2.
+    raises: IOError if the decoded image is not a 3-channel, 3-dimensional array
+    """
+    pfm = readPfmFile(filepath)
+    dims = pfm.shape
+    # Both failure modes (wrong rank, wrong channel count) share one message.
+    if len(dims) != 3 or dims[2] != 3:
+        raise IOError(
+            f"read pfm flow: PFM file has wrong shape (assumed to be w x h x 3): {pfm.shape}"
+        )
+    # remove third channel -> is all zeros
+    return pfm[:, :, :2]
+
+
+def readPfmFile(filepath):
+    """Read a PFM (portable float map) image.
+
+    adapted from https://lmb.informatik.uni-freiburg.de/resources/datasets/SceneFlowDatasets.en.html
+
+    filepath: path of the .pfm file
+    returns: float32 array of shape height x width x 3 ("PF" header) or
+             height x width ("Pf" header), flipped so that row 0 is the top row.
+             The scale stored in the header is only used to detect the byte
+             order; it is not applied to the data.
+    raises: Exception if the magic line or the dimension line is malformed
+    """
+    # The context manager closes the handle on success and on every error path.
+    with open(filepath, "rb") as file:
+        magic = file.readline().rstrip().decode("ascii")
+        if magic == "PF":
+            color = True
+        elif magic == "Pf":
+            color = False
+        else:
+            raise Exception("Not a PFM file.")
+
+        dim_match = re.match(r"^(\d+)\s(\d+)\s$", file.readline().decode("ascii"))
+        if dim_match:
+            width, height = list(map(int, dim_match.groups()))
+        else:
+            raise Exception("Malformed PFM header.")
+
+        # The sign of the scale encodes the byte order: negative means little-endian.
+        scale = float(file.readline().decode("ascii").rstrip())
+        endian = "<" if scale < 0 else ">"
+
+        data = np.fromfile(file, endian + "f")
+
+    shape = (height, width, 3) if color else (height, width)
+    data = np.reshape(data, shape)
+    # PFM stores rows bottom-to-top
+    data = np.flipud(data)
+    return data  # , scale
+
+
+def writePfmFile(image, filepath):
+    """Write a float32 image as a PFM (portable float map) file.
+
+    adapted from https://lmb.informatik.uni-freiburg.de/resources/datasets/SceneFlowDatasets.en.html
+
+    image: float32 array of shape H x W x 3 (written as "PF"), or H x W x 1 / H x W
+           (written as "Pf")
+    filepath: destination path
+    raises: Exception if the dtype is not float32 or the shape is unsupported
+    """
+    # Validate before opening so a bad input does not truncate an existing file.
+    if image.dtype.name != "float32":
+        raise Exception("Image dtype must be float32.")
+
+    if len(image.shape) == 3 and image.shape[2] == 3:  # color image
+        color = True
+    elif len(image.shape) == 2 or (
+        len(image.shape) == 3 and image.shape[2] == 1
+    ):  # greyscale
+        color = False
+    else:
+        raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.")
+
+    # PFM stores rows bottom-to-top
+    image = np.flipud(image)
+
+    # A negative scale tells readers the payload is little-endian.
+    scale = 1
+    endian = image.dtype.byteorder
+    if endian == "<" or (endian == "=" and sys.byteorder == "little"):
+        scale = -scale
+
+    with open(filepath, "wb") as file:
+        # The file is binary: both header variants must be bytes, not str.
+        file.write(b"PF\n" if color else b"Pf\n")
+        file.write(b"%d %d\n" % (image.shape[1], image.shape[0]))
+        file.write(b"%f\n" % scale)
+        image.tofile(file)
+
+
+def readDispFile(filepath):
+    """Load disparity (or disparity change), choosing the decoder from the file extension.
+
+    Recognised extensions: .png (KITTI), .npy (numpy), .pfm (FlyingThings3D) and .dsp5.
+
+    filepath: path to the disparity file
+    returns: the result of the selected reader (disparity with shape height x width)
+    raises: ValueError if the extension is not one of the above
+    """
+    if filepath.endswith(".png"):
+        return readPngDisp(filepath)
+    if filepath.endswith(".npy"):
+        return readNpyFlow(filepath)
+    if filepath.endswith(".pfm"):
+        return readPfmDisp(filepath)
+    if filepath.endswith(".dsp5"):
+        return readDsp5Disp(filepath)
+    raise ValueError(f"readDispFile: Unknown file format for {filepath}")
+
+
+def readPngDisp(filepath):
+    """Decode disparity from a KITTI-style png (KITTI 12, Geiger et al., 2012; KITTI 15, Menze et al., 2015).
+
+    Pixel values are disparity * 256; a stored value of 0 marks an invalid pixel.
+
+    filepath: path to file where to read from
+    returns: disparity as a numpy array with shape height x width. Invalid values are represented as np.nan
+    raises: IOError if the png does not have exactly one channel
+    """
+    # adapted from https://github.com/liruoteng/OpticalFlowToolkit
+    direct = png.Reader(filename=filepath).asDirect()
+    rows = list(direct[2])
+    (w, h) = direct[3]["size"]
+    if len(rows[0]) // w != 1:
+        raise IOError("read png disp: assumed channels to be 1!")
+    disp = np.zeros((h, w), dtype=np.float64)
+    for r, row in enumerate(rows):
+        disp[r, :] = row[:]
+    disp[disp == 0] = np.nan
+    return disp / 256.0
+
+
+def readPfmDisp(filepath):
+    """Read disparity or disparity change from a pfm file as used in the FlyingThings3D (Mayer et al., 2016) dataset.
+
+    filepath: path to file where to read from
+    returns: disparity as a numpy array with shape height x width
+    raises: IOError if the decoded image is not 2-dimensional
+    """
+    disp = readPfmFile(filepath)
+    if len(disp.shape) == 2:
+        return disp
+    raise IOError(
+        f"read pfm disp: PFM file has wrong shape (assumed to be w x h): {disp.shape}"
+    )
+
+
+def writePngDisp(disp, filepath):
+    """Encode disparity as a 16-bit greyscale KITTI-style png (KITTI 12, Geiger et al., 2012; KITTI 15, Menze et al., 2015).
+
+    Values are stored as disparity * 256, clipped to the uint16 range; np.nan becomes 0.
+
+    disp: disparity in shape height x width, invalid values should be represented as np.nan
+    filepath: path to file where to write to
+    """
+    scaled = 256 * disp
+    height, width = scaled.shape[0], scaled.shape[1]
+    # clip keeps NaN untouched; nan_to_num then maps it to 0
+    quantised = np.nan_to_num(np.clip(scaled, 0, 2**16 - 1)).astype(np.uint16)
+    rows = np.reshape(quantised, (-1, width))
+    with open(filepath, "wb") as f:
+        png.Writer(width=width, height=height, bitdepth=16, greyscale=True).write(
+            f, rows
+        )
+
+
+def writeDsp5File(disp, filename):
+    """Write disp to an HDF5 file as a gzip-compressed (level 5) dataset named "disparity".
+
+    disp: array to store
+    filename: path of the file to create (an existing file is overwritten)
+    """
+    with h5py.File(filename, "w") as h5:
+        h5.create_dataset(
+            "disparity",
+            data=disp,
+            compression="gzip",
+            compression_opts=5,
+        )
+
+
+def readDsp5Disp(filename):
+    """Read the "disparity" dataset from an HDF5 (.dsp5) file.
+
+    filename: path of the file to read
+    returns: the full content of the "disparity" dataset
+    raises: IOError if the file has no "disparity" dataset
+    """
+    with h5py.File(filename, "r") as h5:
+        if "disparity" in h5.keys():
+            return h5["disparity"][()]
+        raise IOError(
+            f"File {filename} does not have a 'disparity' key. Is this a valid dsp5 file?"
+        )
+
+
+def writeDispFile(disp, filepath):
+    """write disparity to file. Supports png (KITTI), npy (numpy) and dsp5 file format.
+    disp: disparity with shape height x width. Invalid values should be represented as np.nan
+    filepath: file path where to write the disparity
+    raises: ValueError for an empty path or an unknown extension,
+            IOError if disp is not 2-dimensional
+    """
+    if not filepath:
+        raise ValueError("writeDispFile: empty filepath")
+
+    if len(disp.shape) != 2:
+        raise IOError(
+            f"writeDispFile {filepath}: expected shape height x width but received {disp.shape}"
+        )
+
+    if disp.shape[0] > disp.shape[1]:
+        print(
+            f"writeDispFile {filepath}: Warning: Are you writing an upright image? Expected shape height x width, got {disp.shape}"
+        )
+
+    if filepath.endswith(".png"):
+        writePngDisp(disp, filepath)
+    elif filepath.endswith(".npy"):
+        writeNpyFile(disp, filepath)
+    elif filepath.endswith(".dsp5"):
+        writeDsp5File(disp, filepath)
+    else:
+        # Previously an unknown extension silently wrote nothing; fail loudly
+        # like writeFlowFile and readDispFile do.
+        raise ValueError(f"writeDispFile: Unknown file format for {filepath}")
+
+
+def readKITTIObjMap(filepath):
+    """Load a png and return a boolean array that is True where the pixel value is > 0.
+
+    filepath: path to a .png file (asserted)
+    """
+    assert filepath.endswith(".png")
+    obj_map = np.asarray(Image.open(filepath))
+    return obj_map > 0
+
+
+def readKITTIIntrinsics(filepath, image=2):
+    """Read the intrinsics of one camera from a space-separated calibration text file.
+
+    Looks for the line whose first token is "K_<image as two digits>:" and
+    interprets the following nine numbers as a row-major 3x3 matrix K.
+
+    filepath: path to a .txt calibration file (asserted)
+    image: camera index used to build the "K_XX:" key
+    returns: float32 array [K[0,0], K[1,1], K[0,2], K[1,2]], or None if the
+             file has no matching line
+    """
+    assert filepath.endswith(".txt")
+
+    key = f"K_{image:02d}:"
+    with open(filepath) as f:
+        reader = csv.reader(f, delimiter=" ")
+        for row in reader:
+            # Blank lines produce empty rows; indexing them would raise IndexError.
+            if not row or row[0] != key:
+                continue
+            # Repeated or trailing spaces yield empty tokens; drop them.
+            values = [token for token in row[1:] if token]
+            K = np.array(values, dtype=np.float32).reshape(3, 3)
+            kvec = np.array([K[0, 0], K[1, 1], K[0, 2], K[1, 2]])
+            return kvec
+    return None
+
+
+def writePngMapFile(map_, filename):
+    """Convert the array map_ to a PIL image and save it under filename."""
+    as_image = Image.fromarray(map_)
+    as_image.save(filename)
diff --git a/extern/CUT3R/datasets_preprocess/generate_set_arkitscenes.py b/extern/CUT3R/datasets_preprocess/generate_set_arkitscenes.py
new file mode 100644
index 0000000000000000000000000000000000000000..21ed0439f01ae88df2a6d6185b4baf05f648c33c
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/generate_set_arkitscenes.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python3
+"""
+Preprocess scenes by sorting images and generating image/video collections.
+
+This script processes scenes in parallel using a thread pool, updating metadata
+with sorted images, trajectories, intrinsics, and generating pair, image collection,
+and video collection data. The processed metadata is saved to a new file in each scene directory.
+
+Usage:
+    python generate_set_arkitscenes.py --root /path/to/data --splits Training Test --max_interval 5.0 --num_workers 8
+"""
+
+import os
+import os.path as osp
+import argparse
+import numpy as np
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from tqdm import tqdm
+
+
+def get_timestamp(img_name):
+    """
+    Parse the timestamp encoded in an image filename.
+
+    The last four characters (e.g. ".png") are dropped, then the text after the
+    final underscore is converted to a float.
+
+    Args:
+        img_name (str): The image filename.
+
+    Returns:
+        float: The extracted timestamp.
+    """
+    stem = img_name[:-4]
+    last_token = stem.rsplit("_", 1)[-1]
+    return float(last_token)
+
+
+def process_scene(root, split, scene, max_interval):
+    """
+    Process a single scene by sorting its images by name, reordering trajectories
+    and intrinsics, remapping pair indices, and generating image/video collections.
+    The result is written to "new_scene_metadata.npz" in the scene directory.
+
+    Args:
+        root (str): Root directory of the dataset.
+        split (str): The dataset split (e.g., 'Training', 'Test').
+        scene (str): The scene identifier.
+        max_interval (float): Maximum allowed time interval (in seconds) between images to consider them in the same video collection.
+    """
+    scene_dir = osp.join(root, split, scene)
+    metadata_path = osp.join(scene_dir, "scene_metadata.npz")
+
+    # Load the scene metadata
+    with np.load(metadata_path) as data:
+        images = data["images"]
+        trajectories = data["trajectories"]
+        intrinsics = data["intrinsics"]
+        pairs = data["pairs"]
+
+    # Sort images by name, remembering where each one came from
+    imgs_with_indices = sorted(enumerate(images), key=lambda x: x[1])
+    indices, images = zip(*imgs_with_indices)
+    indices = np.array(indices)
+    index2sorted = {index: i for i, index in enumerate(indices)}
+
+    # Reorder trajectories and intrinsics based on the new image order
+    trajectories = trajectories[indices]
+    intrinsics = intrinsics[indices]
+
+    # Update pair indices (each pair is (id1, id2, score))
+    pairs = [(index2sorted[id1], index2sorted[id2], score) for id1, id2, score in pairs]
+
+    # Form image_collection: mapping from an image id to a list of (other image id, score)
+    image_collection = {}
+    for id1, id2, score in pairs:
+        image_collection.setdefault(id1, []).append((id2, score))
+
+    # Form video_collection: for each image, collect subsequent images within the
+    # max_interval time window. `end` is the first index outside the window (or
+    # len(images) when every later image is inside it), so the last image is no
+    # longer dropped when the window reaches the end of the sequence.
+    timestamps = [get_timestamp(name) for name in images]
+    video_collection = {}
+    for i, t_ref in enumerate(timestamps):
+        end = i + 1
+        while end < len(timestamps) and timestamps[end] - t_ref <= max_interval:
+            end += 1
+        video_collection[i] = list(range(i + 1, end))
+
+    # Save the new metadata
+    output_path = osp.join(scene_dir, "new_scene_metadata.npz")
+    np.savez(
+        output_path,
+        images=images,
+        trajectories=trajectories,
+        intrinsics=intrinsics,
+        pairs=pairs,
+        image_collection=image_collection,
+        video_collection=video_collection,
+    )
+    print(f"Processed scene: {scene}")
+
+
+def main(args):
+    """
+    Fan scene preprocessing out over a thread pool, one task per scene.
+
+    For every split in args.splits the scene list is read from
+    "<root>/<split>/all_metadata.npz"; each scene is then handed to
+    process_scene. Exceptions raised by a task are re-raised here.
+    """
+    dataset_root = args.root
+    split_names = args.splits
+    interval = args.max_interval
+    worker_count = args.num_workers
+
+    pending = []
+
+    with ThreadPoolExecutor(max_workers=worker_count) as executor:
+        for split in split_names:
+            with np.load(osp.join(dataset_root, split, "all_metadata.npz")) as data:
+                scenes = data["scenes"]
+
+            # Queue one task per scene of this split
+            pending.extend(
+                executor.submit(process_scene, dataset_root, split, scene, interval)
+                for scene in scenes
+            )
+
+        # Progress bar advances as tasks finish, in completion order
+        for done in tqdm(
+            as_completed(pending), total=len(pending), desc="Processing scenes"
+        ):
+            # result() re-raises whatever the task raised
+            done.result()
+
+
+# Command-line entry point: parse the options and hand them to main().
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Preprocess scene data to update metadata with sorted images and collections."
+    )
+    parser.add_argument(
+        "--root",
+        type=str,
+        default="",
+        help="Root directory containing the dataset splits.",
+    )
+    parser.add_argument(
+        "--splits",
+        type=str,
+        nargs="+",
+        default=["Training", "Test"],
+        help="List of dataset splits to process (e.g., Training Test).",
+    )
+    parser.add_argument(
+        "--max_interval",
+        type=float,
+        default=5.0,
+        help="Maximum time interval (in seconds) between images to consider them in the same video sequence.",
+    )
+    parser.add_argument(
+        "--num_workers",
+        type=int,
+        default=8,
+        help="Number of worker threads for parallel processing.",
+    )
+    args = parser.parse_args()
+    main(args)
diff --git a/extern/CUT3R/datasets_preprocess/generate_set_scannet.py b/extern/CUT3R/datasets_preprocess/generate_set_scannet.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c1643b7f994b482d32cf4df8694ce080d3514cf
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/generate_set_scannet.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+"""
+Preprocess ScanNet scenes to generate video collections.
+
+This script processes each scene in specified splits by reading the image filenames
+from the "color" folder, grouping images into video sequences based on a maximum
+timestamp interval, and then saving the per-scene metadata as a NumPy .npz file.
+
+Usage:
+    python generate_set_scannet.py --root /path/to/processed_scannet \
+        --splits scans_test scans_train --max_interval 150 --num_workers 8
+"""
+
+import os
+import os.path as osp
+import argparse
+import numpy as np
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from tqdm import tqdm
+
+
+def get_timestamp(img_name):
+    """
+    Interpret an image basename as an integer timestamp.
+
+    The basename is expected to be an integer string; it is passed to int().
+
+    Args:
+        img_name (str): Image basename (without extension).
+
+    Returns:
+        int: The timestamp as an integer.
+    """
+    timestamp = int(img_name)
+    return timestamp
+
+
+def process_scene(root, split, scene, max_interval):
+    """
+    Build and save the video collection of one scene.
+
+    The ".jpg" files of "<root>/<split>/<scene>/color" are listed, ordered by
+    timestamp, and for each image the indices of the following images whose
+    timestamp lies at most max_interval later are recorded. The basenames and
+    the collection are written to "new_scene_metadata.npz" in the scene folder.
+
+    Args:
+        root (str): Root directory for the processed data.
+        split (str): Name of the split (e.g., 'scans_test', 'scans_train').
+        scene (str): Name of the scene directory.
+        max_interval (int): Maximum allowed difference in timestamps for grouping images.
+    """
+    scene_dir = osp.join(root, split, scene)
+    color_dir = osp.join(scene_dir, "color")
+
+    # Basenames are the text before the first dot of every .jpg file
+    basenames = [f.split(".")[0] for f in os.listdir(color_dir) if f.endswith(".jpg")]
+    basenames.sort(key=get_timestamp)
+    stamps = [get_timestamp(name) for name in basenames]
+
+    video_collection = {}
+    for i, t_ref in enumerate(stamps):
+        members = []
+        j = i + 1
+        # Timestamps are sorted, so stop at the first one outside the window
+        while j < len(stamps) and stamps[j] - t_ref <= max_interval:
+            members.append(j)
+            j += 1
+        video_collection[i] = members
+
+    # Persist the basenames together with the collection
+    out_path = osp.join(scene_dir, "new_scene_metadata.npz")
+    np.savez(out_path, images=basenames, video_collection=video_collection)
+    print(f"Processed scene: {scene} (split: {split})")
+
+
+def main(args):
+    """
+    Run process_scene for every entry of every existing split directory.
+
+    Splits whose directory is missing are reported and skipped. Tasks run on a
+    thread pool; exceptions raised by a task are re-raised here.
+    """
+    data_root = args.root
+    split_names = args.splits
+    interval = args.max_interval
+    worker_count = args.num_workers
+
+    pending = []
+    with ThreadPoolExecutor(max_workers=worker_count) as executor:
+        for split in split_names:
+            split_dir = osp.join(data_root, split)
+            if osp.isdir(split_dir):
+                pending.extend(
+                    executor.submit(process_scene, data_root, split, scene, interval)
+                    for scene in os.listdir(split_dir)
+                )
+            else:
+                print(
+                    f"Warning: Split directory '{split_dir}' does not exist; skipping."
+                )
+        # Progress bar advances as tasks finish, in completion order
+        for done in tqdm(
+            as_completed(pending), total=len(pending), desc="Processing scenes"
+        ):
+            # result() re-raises whatever process_scene raised
+            done.result()
+
+
+# Command-line entry point: parse the options and hand them to main().
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Preprocess ScanNet scenes to create video collections based on image timestamps."
+    )
+    parser.add_argument(
+        "--root",
+        type=str,
+        default="",
+        help="Root directory containing the processed ScanNet splits.",
+    )
+    parser.add_argument(
+        "--splits",
+        type=str,
+        nargs="+",
+        default=["scans_test", "scans_train"],
+        help="List of split directories to process (e.g., scans_test scans_train).",
+    )
+    parser.add_argument(
+        "--max_interval",
+        type=int,
+        default=150,
+        help="Maximum allowed timestamp difference (in integer units) for grouping images.",
+    )
+    parser.add_argument(
+        "--num_workers",
+        type=int,
+        default=8,
+        help="Number of worker threads for parallel processing.",
+    )
+    args = parser.parse_args()
+    main(args)
diff --git a/extern/CUT3R/datasets_preprocess/generate_set_scannetpp.py b/extern/CUT3R/datasets_preprocess/generate_set_scannetpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..9977a5d9081e8f1c0c9f82995b05a39d82096d5f
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/generate_set_scannetpp.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python3
+"""
+Preprocess processed_scannetpp scenes to update scene metadata.
+
+This script reads each scene's "scene_metadata.npz", sorts images by name,
+reorders trajectories and intrinsics to match, remaps pair indices, and builds
+two collections:
+  - image_collection: For each image, stores pairs (other image index, score)
+  - video_collection: For each image, groups subsequent images whose timestamps
+                      differ by at most a given max_interval (and share the same
+                      first character in the image name).
+
+The new metadata is saved as "new_scene_metadata.npz" in each scene folder.
+
+Usage:
+    python generate_set_scannetpp.py --root /path/to/processed_scannetpp \
+        --max_interval 150 --num_workers 8
+"""
+
+import os
+import os.path as osp
+import argparse
+import numpy as np
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from tqdm import tqdm
+
+
+def get_timestamp(img_name):
+    """
+    Parse the integer timestamp encoded in an image basename.
+
+    Names beginning with 'DSC' carry the timestamp directly after that prefix;
+    any other name is split on underscores and its second field is used.
+
+    Args:
+        img_name (str): Image basename without the file extension.
+
+    Returns:
+        int: Timestamp parsed from the name.
+    """
+    dsc_prefix = "DSC"
+    if not img_name.startswith(dsc_prefix):
+        # e.g. "<prefix>_<timestamp>[_...]": the field after the first "_".
+        return int(img_name.split("_")[1])
+    return int(img_name[len(dsc_prefix) :])
+
+
+def process_scene(root, scene, max_interval):
+    """
+    Process a single scene: sort images, update trajectories/intrinsics/pairs, and
+    form image and video collections. Save the updated metadata.
+
+    Images are ordered by name. Trajectories and intrinsics are reordered to
+    match and pair indices are remapped to the new order. The result is written
+    to "new_scene_metadata.npz" inside the scene folder.
+
+    Args:
+        root (str): Root directory containing scene folders.
+        scene (str): Scene folder name.
+        max_interval (int): Maximum allowed difference (in timestamp units) for video grouping.
+    """
+    scene_dir = osp.join(root, scene)
+    metadata_path = osp.join(scene_dir, "scene_metadata.npz")
+    with np.load(metadata_path, allow_pickle=True) as data:
+        images = data["images"]
+        trajectories = data["trajectories"]
+        intrinsics = data["intrinsics"]
+        pairs = data["pairs"]
+
+    # Sort images by name, remembering where each one came from.
+    imgs_with_indices = sorted(enumerate(images), key=lambda x: x[1])
+    indices, images = zip(*imgs_with_indices)
+    indices = np.array(indices)
+    index2sorted = {index: i for i, index in enumerate(indices)}
+
+    # Update trajectories and intrinsics arrays according to the new order.
+    trajectories = trajectories[indices]
+    intrinsics = intrinsics[indices]
+
+    # Update pairs (each pair is (id1, id2, score)) with new indices.
+    pairs = [(index2sorted[id1], index2sorted[id2], score) for id1, id2, score in pairs]
+
+    # Probe the filesystem once per image instead of once per (i, j)
+    # combination inside the nested loop below.
+    image_dir = osp.join(scene_dir, "images")
+    exists = [osp.exists(osp.join(image_dir, name + ".jpg")) for name in images]
+
+    # Timestamps are parsed lazily and memoised, so a name is only parsed when
+    # the grouping loop actually needs it, and at most once.
+    timestamps = {}
+
+    def timestamp_of(idx):
+        if idx not in timestamps:
+            timestamps[idx] = get_timestamp(images[idx])
+        return timestamps[idx]
+
+    # Build image_collection: keep a pair only if both image files exist.
+    image_collection = {}
+    for id1, id2, score in pairs:
+        if not (exists[id1] and exists[id2]):
+            continue
+        image_collection.setdefault(id1, []).append((id2, score))
+
+    # Build video_collection: for each image, group subsequent images if:
+    #  1. Their timestamp difference is at most max_interval.
+    #  2. Their name's first character is the same as the current image.
+    # Missing files are skipped; the first existing image that violates either
+    # condition ends the group.
+    video_collection = {}
+    for i, image in enumerate(images):
+        if not exists[i]:
+            continue
+        video_collection[i] = []
+        for j in range(i + 1, len(images)):
+            if not exists[j]:
+                continue
+            if (
+                timestamp_of(j) - timestamp_of(i) > max_interval
+                or images[j][0] != image[0]
+            ):
+                break
+            video_collection[i].append(j)
+
+    # Save the updated metadata to a new file.
+    out_path = osp.join(scene_dir, "new_scene_metadata.npz")
+    np.savez(
+        out_path,
+        images=images,
+        trajectories=trajectories,
+        intrinsics=intrinsics,
+        pairs=pairs,
+        image_collection=image_collection,
+        video_collection=video_collection,
+    )
+    print(f"Processed scene: {scene}")
+
+
+def main(args):
+    """
+    Run process_scene over every scene listed in '<root>/all_metadata.npz'.
+
+    Scenes are dispatched to a thread pool; any exception raised while
+    processing a scene is re-raised here when its future is collected.
+
+    Args:
+        args: Parsed arguments providing `root`, `max_interval` and `num_workers`.
+    """
+    root, max_interval, num_workers = args.root, args.max_interval, args.num_workers
+
+    # The scene names come from the 'scenes' entry of the global metadata file.
+    with np.load(osp.join(root, "all_metadata.npz"), allow_pickle=True) as data:
+        scenes = data["scenes"]
+
+    with ThreadPoolExecutor(max_workers=num_workers) as executor:
+        futures = [
+            executor.submit(process_scene, root, scene, max_interval)
+            for scene in scenes
+        ]
+        finished = tqdm(
+            as_completed(futures), total=len(futures), desc="Processing scenes"
+        )
+        for done in finished:
+            # Surfaces any exception raised inside process_scene.
+            done.result()
+
+
+if __name__ == "__main__":
+    # Command-line entry point: declare the options, parse them, run main().
+    parser = argparse.ArgumentParser(
+        description="Preprocess processed_scannetpp scenes to update scene metadata."
+    )
+    # (flag, add_argument keyword arguments), registered in this order.
+    option_specs = [
+        (
+            "--root",
+            dict(
+                type=str,
+                required=True,
+                help="Root directory containing processed_scannetpp scene folders.",
+            ),
+        ),
+        (
+            "--max_interval",
+            dict(
+                type=int,
+                default=150,
+                help="Maximum timestamp interval for grouping images (default: 150).",
+            ),
+        ),
+        (
+            "--num_workers",
+            dict(
+                type=int,
+                default=8,
+                help="Number of worker threads for parallel processing (default: 8).",
+            ),
+        ),
+    ]
+    for flag, spec in option_specs:
+        parser.add_argument(flag, **spec)
+    args = parser.parse_args()
+    main(args)
diff --git a/extern/CUT3R/datasets_preprocess/merge_dl3dv.py b/extern/CUT3R/datasets_preprocess/merge_dl3dv.py
new file mode 100644
index 0000000000000000000000000000000000000000..c53fb85ec072bb0a52236d003fa06c8a638c8128
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/merge_dl3dv.py
@@ -0,0 +1,85 @@
+import os
+import shutil
+from tqdm import tqdm
+
+# Set these paths to your original and moved locations.
+# The script moves/merges every dst_base/<f1>/<f2> folder back into
+# src_base/<f1>/<f2>. dst_base is a relative path, so it is resolved against
+# the current working directory.
+src_base = "/path/to/processed_dl3dv"  # original location
+dst_base = "processed_dl3dv_ours"  # current (moved) location
+
+# Set dry_run to True for testing (no changes made), and False to perform the actions.
+# With False, running this script moves and deletes files immediately.
+dry_run = False
+
+def merge_directories(source_dir, destination_dir, dry_run=False):
+    """
+    Move everything inside source_dir into destination_dir, merging on conflict.
+
+    - A file replaces any existing entry of the same name at the destination
+      (the existing entry is removed first, then the source file is moved).
+    - A directory that does not exist at the destination is moved as a whole.
+    - A directory that already exists at the destination is merged recursively,
+      and the source directory is removed afterwards if it ended up empty.
+
+    With dry_run=True nothing is changed; each intended action is printed.
+    """
+    for name in os.listdir(source_dir):
+        src_path = os.path.join(source_dir, name)
+        dst_path = os.path.join(destination_dir, name)
+        dst_taken = os.path.exists(dst_path)
+
+        if not os.path.isdir(src_path):
+            # Plain file: the source copy wins over whatever is at the destination.
+            if dry_run:
+                if dst_taken:
+                    print(f"[Dry-run] Would remove existing file: {dst_path}")
+                print(f"[Dry-run] Would move file: {src_path} -> {dst_path}")
+            else:
+                if dst_taken:
+                    os.remove(dst_path)
+                shutil.move(src_path, dst_path)
+            continue
+
+        if not dst_taken:
+            # Nothing to merge with: relocate the whole directory in one step.
+            if dry_run:
+                print(f"[Dry-run] Would move directory: {src_path} -> {dst_path}")
+            else:
+                shutil.move(src_path, dst_path)
+            continue
+
+        # Both sides have this directory: merge contents, then drop the
+        # source directory once nothing is left in it.
+        merge_directories(src_path, dst_path, dry_run=dry_run)
+        if not os.listdir(src_path):
+            if dry_run:
+                print(f"[Dry-run] Would remove empty directory: {src_path}")
+            else:
+                os.rmdir(src_path)
+
+# Collect every second-level entry under dst_base as a path relative to it.
+# The expected layout is dst_base/f1/f2; first-level entries that are not
+# directories are ignored.
+all_folders = [
+    os.path.join(f1, f2)
+    for f1 in os.listdir(dst_base)
+    if os.path.isdir(os.path.join(dst_base, f1))
+    for f2 in os.listdir(os.path.join(dst_base, f1))
+]
+
+# Move (or merge) each collected folder back to its place under src_base.
+for folder in tqdm(all_folders, desc="Moving folders back"):
+    moved_folder = os.path.join(dst_base, folder)  # where the data is now
+    original_folder = os.path.join(src_base, folder)  # where it should go
+
+    # The parent of the target must exist before anything can be moved into it.
+    parent_dir = os.path.dirname(original_folder)
+    if not dry_run:
+        os.makedirs(parent_dir, exist_ok=True)
+    elif not os.path.exists(parent_dir):
+        print(f"[Dry-run] Would create directory: {parent_dir}")
+
+    if os.path.exists(original_folder):
+        # Target already present: merge into it, then drop the emptied source.
+        merge_directories(moved_folder, original_folder, dry_run=dry_run)
+        if not os.listdir(moved_folder):
+            if dry_run:
+                print(f"[Dry-run] Would remove empty directory: {moved_folder}")
+            else:
+                os.rmdir(moved_folder)
+    elif dry_run:
+        print(f"[Dry-run] Would move folder: {moved_folder} -> {original_folder}")
+    else:
+        shutil.move(moved_folder, original_folder)
diff --git a/extern/CUT3R/datasets_preprocess/path_to_root.py b/extern/CUT3R/datasets_preprocess/path_to_root.py
new file mode 100644
index 0000000000000000000000000000000000000000..25e51d4ae6c09e2a3e1885d0cbc50422dd113f1a
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/path_to_root.py
@@ -0,0 +1,14 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# DUSt3R repo root import
+# --------------------------------------------------------
+
+import sys
+import os.path as path
+
+# Directory that contains this file.
+HERE_PATH = path.normpath(path.dirname(__file__))
+# Parent of that directory, used as the repository root for imports.
+DUST3R_REPO_PATH = path.normpath(path.join(HERE_PATH, "../"))
+# workaround for sibling import: importing this module puts the repository
+# root at the front of sys.path, ahead of every existing entry.
+sys.path.insert(0, DUST3R_REPO_PATH)
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_3dkb.py b/extern/CUT3R/datasets_preprocess/preprocess_3dkb.py
new file mode 100644
index 0000000000000000000000000000000000000000..27e0b9f77ada16748f038819c5fa9670ad863fab
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_3dkb.py
@@ -0,0 +1,220 @@
+#!/usr/bin/env python3
+"""
+Process 3D Ken Burns data by selecting random view types, copying images and depth files,
+and computing camera intrinsics from a field-of-view value. The output files are stored in an
+organized folder structure.
+
+Usage:
+    python preprocess_3dkb.py --root /path/to/data_3d_ken_burns \
+                           --out_dir /path/to/processed_3dkb \
+                           [--num_workers 4] [--seed 42]
+"""
+
+import os
+import json
+import random
+import shutil
+from functools import partial
+from pathlib import Path
+import argparse
+
+import cv2  # noqa: F401; cv2 is imported to ensure OpenEXR support.
+import numpy as np
+from PIL import Image
+from tqdm import tqdm
+from concurrent.futures import ProcessPoolExecutor, as_completed
+
+# Ensure OpenCV can read OpenEXR files.
+# NOTE(review): this assignment runs after `import cv2` above. Confirm that the
+# installed OpenCV reads the variable lazily; if it does not, the assignment
+# must move ahead of the import to have any effect.
+os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
+
+
+def fov_to_intrinsic_matrix(width, height, fov_deg, fov_type="horizontal"):
+    """
+    Build a 3x3 pinhole intrinsic matrix from an image size and a field of view.
+
+    The focal length along the axis named by `fov_type` is derived from the FOV;
+    the focal length along the other axis is that value scaled by the image
+    aspect ratio. The principal point is the image centre.
+
+    Args:
+        width (int): Image width in pixels.
+        height (int): Image height in pixels.
+        fov_deg (float): Field of view in degrees.
+        fov_type (str): 'horizontal' or 'vertical'; the axis the FOV refers to.
+
+    Returns:
+        np.ndarray: A 3x3 camera intrinsic matrix.
+
+    Raises:
+        ValueError: If width or height is non-positive, if fov_deg is not in
+            (0, 180), or if fov_type is not 'horizontal' or 'vertical'.
+    """
+    if width <= 0 or height <= 0:
+        raise ValueError("Image width and height must be positive numbers.")
+    if not (0 < fov_deg < 180):
+        raise ValueError("FOV must be between 0 and 180 degrees (non-inclusive).")
+    if fov_type not in ("horizontal", "vertical"):
+        raise ValueError("fov_type must be either 'horizontal' or 'vertical'.")
+
+    half_fov_tan = np.tan(np.deg2rad(fov_deg) / 2)
+
+    if fov_type == "horizontal":
+        f_x = width / (2 * half_fov_tan)
+        f_y = f_x * (height / width)
+    else:
+        f_y = height / (2 * half_fov_tan)
+        f_x = f_y * (width / height)
+
+    return np.array(
+        [
+            [f_x, 0, width / 2],
+            [0, f_y, height / 2],
+            [0, 0, 1],
+        ]
+    )
+
+
+def process_basename(root, seq, basename, view_types, out_dir):
+    """
+    Export one sample: image, depth map and intrinsics for a randomly chosen view.
+
+    A view type is drawn from `view_types`; the matching image is re-saved as
+    PNG, the matching depth file is copied, and a 3x3 intrinsic matrix computed
+    from the "fltFov" entry of the sample's JSON metadata is stored in an .npz
+    file. The output sub-directories must already exist.
+
+    Args:
+        root (str): Root directory of the raw data.
+        seq (str): Sequence directory name.
+        basename (str): Basename (common identifier) for the files.
+        view_types (list): View types to choose from (e.g. ['bl', 'br', 'tl', 'tr']).
+        out_dir (str): Output directory where processed data will be saved.
+
+    Returns:
+        str or None: An error message if any step raised; otherwise None.
+    """
+    chosen_view = random.choice(view_types)
+
+    # Inputs: image and metadata live in <root>/<seq>, depth in <root>/<seq>-depth.
+    src_seq_dir = os.path.join(root, seq)
+    src_image = os.path.join(src_seq_dir, f"{basename}-{chosen_view}-image.png")
+    src_meta = os.path.join(src_seq_dir, f"{basename}-meta.json")
+    src_depth = os.path.join(
+        root, f"{seq}-depth", f"{basename}-{chosen_view}-depth.exr"
+    )
+
+    # Outputs: <out_dir>/<seq>/{rgb,depth,cam}/<basename>.<ext>
+    dst_seq_dir = os.path.join(out_dir, seq)
+    dst_image = os.path.join(dst_seq_dir, "rgb", f"{basename}.png")
+    dst_depth = os.path.join(dst_seq_dir, "depth", f"{basename}.exr")
+    dst_cam = os.path.join(dst_seq_dir, "cam", f"{basename}.npz")
+
+    try:
+        # Re-save the image as PNG and remember its size for the intrinsics.
+        with Image.open(src_image) as img:
+            img_w, img_h = img.size
+            img.save(dst_image, format="PNG")
+
+        # The field of view comes from the per-sample JSON metadata.
+        with open(src_meta, "r") as f:
+            meta = json.load(f)
+        intrinsic_matrix = fov_to_intrinsic_matrix(img_w, img_h, meta["fltFov"])
+
+        shutil.copy(src_depth, dst_depth)
+        np.savez(dst_cam, intrinsics=intrinsic_matrix)
+    except Exception as e:
+        # Best effort: report the failure to the caller instead of raising.
+        return f"Error processing {seq}/{basename}: {e}"
+
+    return None  # Success indicator
+
+
+def main():
+    """
+    Parse the command line and process every sample of every sequence in parallel.
+
+    Sequence directories are the sub-directories of --root whose name does not
+    end with "-depth". Each "<basename>-meta.json" file inside a sequence
+    defines one task. The view type of every task is drawn here, in the parent
+    process, from the seeded generator, so a given --seed always yields the same
+    selection regardless of how tasks are scheduled on the workers or how the
+    worker processes are started. Error messages returned by the workers are
+    printed.
+    """
+    parser = argparse.ArgumentParser(
+        description="Process raw 3D Ken Burns video data and generate processed images, depth maps, and camera intrinsics."
+    )
+    parser.add_argument(
+        "--root", type=str, required=True, help="Root directory of the raw data."
+    )
+    parser.add_argument(
+        "--out_dir",
+        type=str,
+        required=True,
+        help="Output directory for processed data.",
+    )
+    parser.add_argument(
+        "--num_workers",
+        type=int,
+        default=None,
+        help="Number of worker processes to use (default: half of available CPUs).",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=42,
+        help="Random seed for reproducibility (default: 42).",
+    )
+    parser.add_argument(
+        "--view_types",
+        type=str,
+        nargs="+",
+        default=["bl", "br", "tl", "tr"],
+        help="List of view types to choose from (default: bl br tl tr).",
+    )
+    args = parser.parse_args()
+
+    # Set the random seed.
+    random.seed(args.seed)
+
+    root = args.root
+    out_dir = args.out_dir
+    view_types = args.view_types
+
+    # Determine number of worker processes. The default is half the CPUs but
+    # never less than one: ProcessPoolExecutor rejects max_workers=0, which the
+    # plain halving would produce on a single-CPU machine.
+    if args.num_workers is not None:
+        num_workers = args.num_workers
+    else:
+        num_workers = max(1, (os.cpu_count() or 4) // 2)
+
+    # Collect all sequence directories from root. Sorted, because os.listdir
+    # returns entries in arbitrary order and the order of the random draws
+    # below must be stable for --seed to be meaningful.
+    seq_dirs = sorted(
+        d
+        for d in os.listdir(root)
+        if os.path.isdir(os.path.join(root, d)) and not d.endswith("-depth")
+    )
+
+    # Pre-create output directory structure.
+    for seq in seq_dirs:
+        for subfolder in ["rgb", "depth", "cam"]:
+            (Path(out_dir) / seq / subfolder).mkdir(parents=True, exist_ok=True)
+
+    # Prepare list of tasks: one per "<basename>-meta.json" file. Matching on
+    # the full suffix keeps unrelated .json files from producing bogus
+    # basenames when the suffix is stripped.
+    meta_suffix = "-meta.json"
+    tasks = []
+    for seq in seq_dirs:
+        seq_path = os.path.join(root, seq)
+        basenames = sorted(
+            f[: -len(meta_suffix)]
+            for f in os.listdir(seq_path)
+            if f.endswith(meta_suffix)
+        )
+        for basename in basenames:
+            tasks.append((seq, basename))
+
+    # Process tasks in parallel using ProcessPoolExecutor.
+    with ProcessPoolExecutor(max_workers=num_workers) as executor:
+        futures = {}
+        for seq, basename in tasks:
+            # Draw the view here and hand the worker a single-element list:
+            # the worker's own random.choice then has exactly one option, so
+            # the outcome does not depend on the worker's RNG state.
+            view_type = random.choice(view_types)
+            future = executor.submit(
+                process_basename, root, seq, basename, [view_type], out_dir
+            )
+            futures[future] = (seq, basename)
+        for future in tqdm(
+            as_completed(futures), total=len(futures), desc="Processing"
+        ):
+            error = future.result()
+            if error:
+                print(error)
+
+# Run the preprocessing only when executed as a script, not on import.
+if __name__ == "__main__":
+    main()
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_arkitscenes.py b/extern/CUT3R/datasets_preprocess/preprocess_arkitscenes.py
new file mode 100644
index 0000000000000000000000000000000000000000..924fd00bff45334f26b50a95003b58404541415e
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_arkitscenes.py
@@ -0,0 +1,445 @@
+import os
+import json
+import os.path as osp
+import decimal
+import argparse
+import math
+from bisect import bisect_left
+from PIL import Image
+import numpy as np
+import quaternion
+from scipy import interpolate
+import cv2
+from tqdm import tqdm
+
+
+def get_parser():
+    """
+    Build the command-line parser for the ARKitScenes preprocessing script.
+
+    Returns:
+        argparse.ArgumentParser: Parser with the options --arkitscenes_dir,
+        --precomputed_pairs and --output_dir, each with a default path.
+    """
+    option_defaults = (
+        ("--arkitscenes_dir", "data/dust3r_data/data_arkitscenes/raw"),
+        ("--precomputed_pairs", "data/dust3r_data/data_arkitscenes/arkitscenes_pairs"),
+        ("--output_dir", "data/dust3r_data/processed_arkitscenes"),
+    )
+    parser = argparse.ArgumentParser()
+    for flag, default_path in option_defaults:
+        parser.add_argument(flag, default=default_path)
+    return parser
+
+
+def value_to_decimal(value, decimal_places):
+    """
+    Round `value` to `decimal_places` decimals, rounding halves up.
+
+    The value is converted through float and str before being quantized.
+    Note that this sets the rounding mode of the current decimal context to
+    ROUND_HALF_UP as a side effect.
+
+    Returns:
+        decimal.Decimal: The rounded value.
+    """
+    decimal.getcontext().rounding = decimal.ROUND_HALF_UP  # define rounding method
+    number = decimal.Decimal(str(float(value)))
+    quantum = decimal.Decimal(f"1e-{decimal_places}")
+    return number.quantize(quantum)
+
+
+def closest(value, sorted_list):
+    """
+    Return the element of `sorted_list` nearest to `value`.
+
+    `sorted_list` must be sorted in ascending order and non-empty. Values
+    outside the list's range map to the first or last element; on an exact tie
+    between two neighbours the smaller one is returned.
+    """
+    pos = bisect_left(sorted_list, value)
+    if pos == 0:
+        return sorted_list[0]
+    if pos == len(sorted_list):
+        return sorted_list[-1]
+    lower, upper = sorted_list[pos - 1], sorted_list[pos]
+    return upper if upper - value < value - lower else lower
+
+
+def get_up_vectors(pose_device_to_world):
+    """Return the pose applied to the homogeneous direction (0, -1, 0, 0) as a 4x1 column."""
+    device_up = np.array([[0.0], [-1.0], [0.0], [0.0]])
+    return np.matmul(pose_device_to_world, device_up)
+
+
+def get_right_vectors(pose_device_to_world):
+    """Return the pose applied to the homogeneous direction (1, 0, 0, 0) as a 4x1 column."""
+    device_right = np.array([[1.0], [0.0], [0.0], [0.0]])
+    return np.matmul(pose_device_to_world, device_right)
+
+
+def read_traj(traj_path):
+    """
+    Parse a trajectory file into timestamps and inverted poses.
+
+    Every line must hold exactly 7 whitespace-separated numbers: a timestamp,
+    a 3-component angle-axis rotation and a 3-component translation. Together
+    the rotation and translation form a 4x4 pose, which is inverted.
+
+    Args:
+        traj_path (str): Path of the trajectory file.
+
+    Returns:
+        tuple: Four lists with one entry per line:
+            - timestamps rounded to 3 decimals (float),
+            - translations of the inverted poses,
+            - rotations of the inverted poses as quaternions,
+            - the inverted 4x4 poses.
+    """
+    timestamps, poses, quaternions, poses_p_to_w = [], [], [], []
+    with open(traj_path) as f:
+        for line in f.readlines():
+            fields = line.split()
+            assert len(fields) == 7
+
+            # Rounded to milliseconds; kept as float for spline interpolation.
+            traj_timestamp = float(fields[0])
+            timestamps.append(float(value_to_decimal(traj_timestamp, 3)))
+
+            # Assemble the 4x4 pose stored in the file.
+            rotation_vec = np.asarray([float(tok) for tok in fields[1:4]])
+            r_w_to_p, _ = cv2.Rodrigues(rotation_vec)
+            pose_w_to_p = np.eye(4)
+            pose_w_to_p[:3, :3] = r_w_to_p
+            pose_w_to_p[:3, 3] = np.asarray([float(tok) for tok in fields[4:7]])
+
+            # Invert it and split the result into rotation and translation.
+            pose_p_to_w = np.linalg.inv(pose_w_to_p)
+            poses_p_to_w.append(pose_p_to_w)
+            poses.append(pose_p_to_w[:3, 3])
+            quaternions.append(quaternion.from_rotation_matrix(pose_p_to_w[:3, :3]))
+    return timestamps, poses, quaternions, poses_p_to_w
+
+
+def main(rootdir, pairsdir, outdir):
+    """
+    Convert raw ARKitScenes data into the processed layout, one split at a time.
+
+    For each split ("Test", "Training") this reads the scene list and every
+    scene's precomputed "selected_pairs.npz", exports the selected RGB frames
+    (rotated according to the scene's sky direction, saved as .jpg) and the
+    matching depth maps (rotated and resized to the RGB size), and writes a
+    per-scene "scene_metadata.npz". It then writes "scene_list.json" with the
+    scenes that have metadata and concatenates all per-scene metadata into
+    "all_metadata.npz".
+
+    Scenes whose "scene_metadata.npz" already exists are not re-exported.
+
+    Args:
+        rootdir (str): Raw dataset directory containing "Test" and "Training".
+        pairsdir (str): Directory with the precomputed pairs; per split it holds
+            "scene_list.json" and one folder per scene.
+        outdir (str): Output directory (created if missing).
+    """
+    os.makedirs(outdir, exist_ok=True)
+
+    subdirs = ["Test", "Training"]
+    for subdir in subdirs:
+        # STEP 1: list all scenes
+        outsubdir = osp.join(outdir, subdir)
+        os.makedirs(outsubdir, exist_ok=True)
+        listfile = osp.join(pairsdir, subdir, "scene_list.json")
+        with open(listfile, "r") as f:
+            scene_dirs = json.load(f)
+
+        valid_scenes = []
+        for scene_subdir in tqdm(scene_dirs):
+            # The raw scene may live under either split folder; "Test" wins.
+            if not os.path.isdir(osp.join(rootdir, "Test", scene_subdir)):
+                if not os.path.isdir(osp.join(rootdir, "Training", scene_subdir)):
+                    continue
+                else:
+                    root_subdir = "Training"
+            else:
+                root_subdir = "Test"
+            out_scene_subdir = osp.join(outsubdir, scene_subdir)
+            os.makedirs(out_scene_subdir, exist_ok=True)
+
+            scene_dir = osp.join(rootdir, root_subdir, scene_subdir)
+            depth_dir = osp.join(scene_dir, "lowres_depth")
+            rgb_dir = osp.join(scene_dir, "vga_wide")
+            intrinsics_dir = osp.join(scene_dir, "vga_wide_intrinsics")
+            traj_path = osp.join(scene_dir, "lowres_wide.traj")
+
+            # STEP 2: read selected_pairs.npz
+            selected_pairs_path = osp.join(
+                pairsdir, subdir, scene_subdir, "selected_pairs.npz"
+            )
+            # Read inside a context manager so the archive is closed again;
+            # one file handle per scene would otherwise stay open.
+            with np.load(selected_pairs_path) as selected_npz:
+                selection, pairs = selected_npz["selection"], selected_npz["pairs"]
+                selected_sky_direction_scene = str(
+                    selected_npz["sky_direction_scene"][0]
+                )
+            if len(selection) == 0 or len(pairs) == 0:
+                # not a valid scene
+                continue
+            valid_scenes.append(scene_subdir)
+
+            # STEP 3: parse the scene and export the list of valid (K, pose, rgb, depth) and convert images
+            scene_metadata_path = osp.join(out_scene_subdir, "scene_metadata.npz")
+            if osp.isfile(scene_metadata_path):
+                continue
+            else:
+                print(f"parsing {scene_subdir}")
+                # loads traj
+                timestamps, poses, quaternions, poses_cam_to_world = read_traj(
+                    traj_path
+                )
+
+                poses = np.array(poses)
+                quaternions = np.array(quaternions, dtype=np.quaternion)
+                quaternions = quaternion.unflip_rotors(quaternions)
+                timestamps = np.array(timestamps)
+
+                # Selected names look like "<prefix>_<frame id>.png"; the frame
+                # id doubles as the timestamp to interpolate the pose at.
+                selected_images = [
+                    (basename, basename.split(".png")[0].split("_")[1])
+                    for basename in selection
+                ]
+                timestamps_selected = [
+                    float(frame_id) for _, frame_id in selected_images
+                ]
+
+                sky_direction_scene, trajectories, intrinsics, images = (
+                    convert_scene_metadata(
+                        scene_subdir,
+                        intrinsics_dir,
+                        timestamps,
+                        quaternions,
+                        poses,
+                        poses_cam_to_world,
+                        selected_images,
+                        timestamps_selected,
+                    )
+                )
+                assert selected_sky_direction_scene == sky_direction_scene
+
+                os.makedirs(os.path.join(out_scene_subdir, "vga_wide"), exist_ok=True)
+                os.makedirs(
+                    os.path.join(out_scene_subdir, "lowres_depth"), exist_ok=True
+                )
+                assert isinstance(sky_direction_scene, str)
+                # Export a scene only if every selected frame has both its RGB
+                # image and its depth map; otherwise no metadata is written and
+                # the scene is dropped from the final list below.
+                all_exist = True
+                for basename in images:
+                    vga_wide_path = osp.join(rgb_dir, basename)
+                    depth_path = osp.join(depth_dir, basename)
+                    if not osp.isfile(vga_wide_path) or not osp.isfile(depth_path):
+                        all_exist = False
+                        break
+                if not all_exist:
+                    continue
+
+                for basename in images:
+                    img_out = os.path.join(
+                        out_scene_subdir, "vga_wide", basename.replace(".png", ".jpg")
+                    )
+                    depth_out = os.path.join(out_scene_subdir, "lowres_depth", basename)
+                    if osp.isfile(img_out) and osp.isfile(depth_out):
+                        continue
+
+                    vga_wide_path = osp.join(rgb_dir, basename)
+                    depth_path = osp.join(depth_dir, basename)
+
+                    img = Image.open(vga_wide_path)
+                    depth = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED)
+
+                    # rotate the image (the try/except falls back to the
+                    # constant names used when Image.Transpose is unavailable)
+                    if sky_direction_scene == "RIGHT":
+                        try:
+                            img = img.transpose(Image.Transpose.ROTATE_90)
+                        except Exception:
+                            img = img.transpose(Image.ROTATE_90)
+                        depth = cv2.rotate(depth, cv2.ROTATE_90_COUNTERCLOCKWISE)
+                    elif sky_direction_scene == "LEFT":
+                        try:
+                            img = img.transpose(Image.Transpose.ROTATE_270)
+                        except Exception:
+                            img = img.transpose(Image.ROTATE_270)
+                        depth = cv2.rotate(depth, cv2.ROTATE_90_CLOCKWISE)
+                    elif sky_direction_scene == "DOWN":
+                        try:
+                            img = img.transpose(Image.Transpose.ROTATE_180)
+                        except Exception:
+                            img = img.transpose(Image.ROTATE_180)
+                        depth = cv2.rotate(depth, cv2.ROTATE_180)
+
+                    W, H = img.size
+                    if not osp.isfile(img_out):
+                        img.save(img_out)
+
+                    # Bring the depth map to the RGB resolution without
+                    # blending neighbouring depth values.
+                    depth = cv2.resize(
+                        depth, (W, H), interpolation=cv2.INTER_NEAREST_EXACT
+                    )
+                    if not osp.isfile(
+                        depth_out
+                    ):  # avoid destroying the base dataset when you mess up the paths
+                        cv2.imwrite(depth_out, depth)
+
+                # save at the end
+                np.savez(
+                    scene_metadata_path,
+                    trajectories=trajectories,
+                    intrinsics=intrinsics,
+                    images=images,
+                    pairs=pairs,
+                )
+
+        # STEP 4: keep only the scenes whose metadata was actually written.
+        # A new list is built instead of calling remove() while iterating,
+        # which skips the element following each removal and could leave
+        # scenes without metadata in the list.
+        outlistfile = osp.join(outsubdir, "scene_list.json")
+        valid_scenes = [
+            scene_subdir
+            for scene_subdir in valid_scenes
+            if osp.isfile(osp.join(outsubdir, scene_subdir, "scene_metadata.npz"))
+        ]
+        with open(outlistfile, "w") as f:
+            json.dump(valid_scenes, f)
+
+        # STEP 5: concat all scene_metadata.npz into a single file
+        scene_data = {}
+        for scene_subdir in valid_scenes:
+            scene_metadata_path = osp.join(
+                outsubdir, scene_subdir, "scene_metadata.npz"
+            )
+            with np.load(scene_metadata_path) as data:
+                trajectories = data["trajectories"]
+                intrinsics = data["intrinsics"]
+                images = data["images"]
+                pairs = data["pairs"]
+            scene_data[scene_subdir] = {
+                "trajectories": trajectories,
+                "intrinsics": intrinsics,
+                "images": images,
+                "pairs": pairs,
+            }
+        offset = 0
+        counts = []
+        scenes = []
+        sceneids = []
+        images = []
+        intrinsics = []
+        trajectories = []
+        pairs = []
+        for scene_idx, (scene_subdir, data) in enumerate(scene_data.items()):
+            num_imgs = data["images"].shape[0]
+            img_pairs = data["pairs"]
+
+            scenes.append(scene_subdir)
+            sceneids.extend([scene_idx] * num_imgs)
+
+            images.append(data["images"])
+
+            # Per-image rows are (w, h, fx, fy, hw, hh); expand to 3x3 matrices.
+            K = np.expand_dims(np.eye(3), 0).repeat(num_imgs, 0)
+            K[:, 0, 0] = [fx for _, _, fx, _, _, _ in data["intrinsics"]]
+            K[:, 1, 1] = [fy for _, _, _, fy, _, _ in data["intrinsics"]]
+            K[:, 0, 2] = [hw for _, _, _, _, hw, _ in data["intrinsics"]]
+            K[:, 1, 2] = [hh for _, _, _, _, _, hh in data["intrinsics"]]
+
+            intrinsics.append(K)
+            trajectories.append(data["trajectories"])
+
+            # offset pairs so image indices refer to the concatenated arrays;
+            # counts records the index at which each scene starts
+            img_pairs[:, 0:2] += offset
+            pairs.append(img_pairs)
+            counts.append(offset)
+
+            offset += num_imgs
+
+        images = np.concatenate(images, axis=0)
+        intrinsics = np.concatenate(intrinsics, axis=0)
+        trajectories = np.concatenate(trajectories, axis=0)
+        pairs = np.concatenate(pairs, axis=0)
+        np.savez(
+            osp.join(outsubdir, "all_metadata.npz"),
+            counts=counts,
+            scenes=scenes,
+            sceneids=sceneids,
+            images=images,
+            intrinsics=intrinsics,
+            trajectories=trajectories,
+            pairs=pairs,
+        )
+
+
+def convert_scene_metadata(
+    scene_subdir,
+    intrinsics_dir,
+    timestamps,
+    quaternions,
+    poses,
+    poses_cam_to_world,
+    selected_images,
+    timestamps_selected,
+):
+    """
+    Compute pose and intrinsics for every selected image of a scene.
+
+    Positions are interpolated linearly and rotations with quaternion.squad at
+    the selected timestamps. Intrinsics are read from the ".pincam" file named
+    after the frame id, trying the id shifted by -0.001 and +0.001 when the
+    exact file is missing. When the scene's sky direction is "RIGHT" or "LEFT"
+    the x/y entries of the intrinsics are swapped.
+
+    Returns:
+        tuple: (sky_direction_scene, trajectories, intrinsics, images), where
+        the last three are lists with one entry per selected image:
+        a 4x4 pose, a 6-element intrinsics row and the image basename.
+    """
+    # find scene orientation
+    sky_direction_scene, rotated_to_cam = find_scene_orientation(poses_cam_to_world)
+    swap_axes = sky_direction_scene in ("RIGHT", "LEFT")
+
+    # find/compute pose for selected timestamps
+    # most images have a valid timestamp / exact pose associated
+    query_times = np.array(timestamps_selected)
+    position_interp = interpolate.interp1d(timestamps, poses, kind="linear", axis=0)
+    interpolated_rotations = quaternion.squad(quaternions, timestamps, query_times)
+    interpolated_positions = position_interp(query_times)
+
+    trajectories, intrinsics, images = [], [], []
+    for i, (basename, frame_id) in enumerate(selected_images):
+        # Try the exact frame id first, then one millisecond before and after.
+        intrinsic_fn = None
+        for shift in (None, -0.001, 0.001):
+            stem = frame_id if shift is None else f"{float(frame_id) + shift:.3f}"
+            intrinsic_fn = osp.join(intrinsics_dir, f"{scene_subdir}_{stem}.pincam")
+            if osp.exists(intrinsic_fn):
+                break
+        assert osp.exists(intrinsic_fn)
+        w, h, fx, fy, hw, hh = np.loadtxt(intrinsic_fn)  # PINHOLE
+
+        pose = np.eye(4)
+        pose[:3, :3] = quaternion.as_rotation_matrix(interpolated_rotations[i])
+        pose[:3, 3] = interpolated_positions[i]
+
+        images.append(basename)
+        if swap_axes:
+            intrinsics.append([h, w, fy, fx, hh, hw])  # swapped intrinsics
+        else:
+            intrinsics.append([w, h, fx, fy, hw, hh])
+        # pose_cam_to_world @ rotated_to_cam = rotated(cam) to world
+        trajectories.append(pose @ rotated_to_cam)
+
+    return sky_direction_scene, trajectories, intrinsics, images
+
+
+def find_scene_orientation(poses_cam_to_world):
+    """Decide which image side points at the sky and return the matching roll fix.
+
+    The device up and right directions (from ``get_up_vectors`` and
+    ``get_right_vectors``) are averaged over all poses; fixed defaults are
+    used when the list is empty.  Their angles to world +Z are compared:
+    if up is the more horizontal one the scene is ``"LEFT"`` or ``"RIGHT"``,
+    otherwise it is ``"UP"`` or ``"DOWN"``.
+
+    Returns ``(sky_direction_scene, rotated_to_cam)``; ``rotated_to_cam`` is the
+    4x4 inverse of the chosen rotation about the Z axis.
+    """
+    world_up = np.array([[0.0], [0.0], [1.0], [0.0]])
+    pose_count = len(poses_cam_to_world)
+    if pose_count > 0:
+        up_vector = sum(get_up_vectors(p) for p in poses_cam_to_world) / pose_count
+        right_vector = (
+            sum(get_right_vectors(p) for p in poses_cam_to_world) / pose_count
+        )
+    else:
+        up_vector = np.array([[0.0], [-1.0], [0.0], [0.0]])
+        right_vector = np.array([[1.0], [0.0], [0.0], [0.0]])
+
+    def angle_to_world_up(direction):
+        # value between 0, 180 (degrees)
+        cosine = np.clip(np.dot(np.transpose(world_up), direction), -1.0, 1.0)
+        return np.arccos(cosine).item() * 180.0 / np.pi
+
+    up_angle = angle_to_world_up(up_vector)
+    right_angle = angle_to_world_up(right_vector)
+    up_offset = abs(up_angle - 90.0)
+    right_offset = abs(right_angle - 90.0)
+
+    if up_offset < right_offset:
+        # device up is the (more) horizontal axis: the image lies on its side
+        assert up_offset < 45.0
+        if right_angle > 90.0:
+            sky_direction_scene = "LEFT"
+            roll = math.pi / 2.0
+        else:
+            # note that in metadata.csv RIGHT does not exist, but again it's not accurate...
+            # well, turns out there are scenes oriented like this
+            # for example Training/41124801
+            sky_direction_scene = "RIGHT"
+            roll = -math.pi / 2.0
+        cam_to_rotated_q = quaternion.from_rotation_vector([0.0, 0.0, roll])
+    else:
+        # right is close to 90
+        assert right_offset < 45.0
+        if up_angle > 90.0:
+            sky_direction_scene = "DOWN"
+            cam_to_rotated_q = quaternion.from_rotation_vector([0.0, 0.0, math.pi])
+        else:
+            sky_direction_scene = "UP"
+            cam_to_rotated_q = quaternion.quaternion(1, 0, 0, 0)
+
+    cam_to_rotated = np.eye(4)
+    cam_to_rotated[:3, :3] = quaternion.as_rotation_matrix(cam_to_rotated_q)
+    return sky_direction_scene, np.linalg.inv(cam_to_rotated)
+
+
+if __name__ == "__main__":
+    # Script entry point: parse the command-line options and pass the three
+    # of them that main() takes.
+    parser = get_parser()
+    args = parser.parse_args()
+    main(args.arkitscenes_dir, args.precomputed_pairs, args.output_dir)
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_arkitscenes_highres.py b/extern/CUT3R/datasets_preprocess/preprocess_arkitscenes_highres.py
new file mode 100644
index 0000000000000000000000000000000000000000..055c0e9c19cb9f52e148704bdfa1053b8ff45861
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_arkitscenes_highres.py
@@ -0,0 +1,409 @@
+import os
+import json
+import os.path as osp
+import decimal
+import argparse
+import math
+from bisect import bisect_left
+from PIL import Image
+import numpy as np
+import quaternion
+from scipy import interpolate
+import cv2
+from tqdm import tqdm
+from multiprocessing import Pool
+
+
+def get_parser():
+    """Create the CLI parser with ``--arkitscenes_dir`` and ``--output_dir``."""
+    option_defaults = {
+        "--arkitscenes_dir": "",
+        "--output_dir": "data/dust3r_data/processed_arkitscenes_highres",
+    }
+    cli = argparse.ArgumentParser()
+    for flag, default_value in option_defaults.items():
+        cli.add_argument(flag, default=default_value)
+    return cli
+
+
+def value_to_decimal(value, decimal_places):
+    """Round *value* half-up to *decimal_places* digits and return a ``Decimal``.
+
+    The value is passed through ``str(float(value))`` first, so the short
+    float repr (not the exact binary expansion) is what gets rounded.
+
+    The rounding mode is handed to ``quantize`` directly instead of being
+    written into ``decimal.getcontext()``, so calling this function no longer
+    changes how unrelated ``Decimal`` arithmetic in the same thread rounds.
+    """
+    step = decimal.Decimal(f"1e-{decimal_places}")
+    return decimal.Decimal(str(float(value))).quantize(
+        step, rounding=decimal.ROUND_HALF_UP
+    )
+
+
+def closest(value, sorted_list):
+    """Return the entry of *sorted_list* nearest to *value*.
+
+    *sorted_list* must be sorted ascending and non-empty.  On an exact tie
+    between the two neighbours the smaller one is returned.
+    """
+    pos = bisect_left(sorted_list, value)
+    if pos == 0:
+        return sorted_list[0]
+    if pos == len(sorted_list):
+        return sorted_list[-1]
+    lower, upper = sorted_list[pos - 1], sorted_list[pos]
+    return upper if upper - value < value - lower else lower
+
+
+def get_up_vectors(pose_device_to_world):
+    """Apply the pose to the device -Y direction (homogeneous, w = 0)."""
+    device_up = np.array([[0.0], [-1.0], [0.0], [0.0]])
+    return np.matmul(pose_device_to_world, device_up)
+
+
+def get_right_vectors(pose_device_to_world):
+    """Apply the pose to the device +X direction (homogeneous, w = 0)."""
+    device_right = np.array([[1.0], [0.0], [0.0], [0.0]])
+    return np.matmul(pose_device_to_world, device_right)
+
+
+def read_traj(traj_path):
+    """Parse a trajectory file into timestamps, positions, rotations and poses.
+
+    Every line must contain 7 whitespace-separated numbers: a timestamp, an
+    angle-axis rotation (3 values) and a translation (3 values).  The 4x4
+    transform built from them is inverted before anything is stored.
+
+    Returns ``(timestamps, poses, quaternions, poses_p_to_w)``: the timestamps
+    rounded to 3 decimals, the translation column of each inverted transform,
+    its rotation as a quaternion, and the inverted 4x4 matrices themselves.
+    """
+    timestamps, positions, rotations, poses_p_to_w = [], [], [], []
+    with open(traj_path) as f:
+        for line in f.readlines():
+            fields = line.split()
+            assert len(fields) == 7
+            stamp, *motion = (float(field) for field in fields)
+
+            # rounded through Decimal so the value is usable for spline interpolation
+            timestamps.append(float(value_to_decimal(stamp, 3)))
+
+            rotation_w_to_p, _ = cv2.Rodrigues(np.asarray(motion[:3]))
+            pose_w_to_p = np.eye(4)
+            pose_w_to_p[:3, :3] = rotation_w_to_p
+            pose_w_to_p[:3, 3] = np.asarray(motion[3:])
+
+            pose_p_to_w = np.linalg.inv(pose_w_to_p)
+            poses_p_to_w.append(pose_p_to_w)
+            positions.append(pose_p_to_w[:3, 3])
+            rotations.append(quaternion.from_rotation_matrix(pose_p_to_w[:3, :3]))
+    return timestamps, positions, rotations, poses_p_to_w
+
+
+def main(rootdir, outdir):
+    """Preprocess the ``Validation`` and ``Training`` splits under *rootdir*.
+
+    Each scene directory of a split is handed to ``process_scene`` through a
+    worker pool.  The non-``None`` results are then written, in scene order,
+    to ``<outdir>/<split>/scene_list.json``.
+    """
+    os.makedirs(outdir, exist_ok=True)
+    for subdir in ("Validation", "Training"):
+        outsubdir = osp.join(outdir, subdir)
+        # scene_list.json is written even when no scene produced any output,
+        # so the split directory must exist independently of process_scene.
+        os.makedirs(outsubdir, exist_ok=True)
+
+        split_root = osp.join(rootdir, subdir)
+        scene_dirs = sorted(
+            d for d in os.listdir(split_root) if osp.isdir(osp.join(split_root, d))
+        )
+        tasks = [(rootdir, outdir, subdir, scene_subdir) for scene_subdir in scene_dirs]
+
+        with Pool() as pool:
+            results = list(tqdm(pool.imap(process_scene, tasks), total=len(tasks)))
+
+        # Filter None results and other post-processing
+        valid_scenes = [result for result in results if result is not None]
+        with open(osp.join(outsubdir, "scene_list.json"), "w") as f:
+            json.dump(valid_scenes, f)
+
+
+def process_scene(args):
+    """Convert one scene and return its name, or ``None`` if it is unusable.
+
+    *args* is a ``(rootdir, outdir, subdir, scene_subdir)`` tuple (a single
+    argument so the function can be mapped with ``Pool.imap``).
+
+    A scene is unusable when one of ``highres_depth``, ``vga_wide``,
+    ``vga_wide_intrinsics`` or ``lowres_wide.traj`` is missing, or when no
+    frame survives ``convert_scene_metadata``.  A scene whose
+    ``scene_metadata.npz`` already exists is not reprocessed but is still
+    reported by name, so the caller's scene list stays complete on reruns.
+    (Previously every path returned ``None``, leaving that list empty.)
+
+    Side effects: writes rotated JPEG images, resized depth maps and
+    ``scene_metadata.npz`` under ``<outdir>/<subdir>/<scene_subdir>``.
+    """
+    rootdir, outdir, subdir, scene_subdir = args
+    scene_dir = osp.join(rootdir, subdir, scene_subdir)
+    out_scene_subdir = osp.join(outdir, subdir, scene_subdir)
+
+    depth_dir = osp.join(scene_dir, "highres_depth")
+    rgb_dir = osp.join(scene_dir, "vga_wide")
+    intrinsics_dir = osp.join(scene_dir, "vga_wide_intrinsics")
+    traj_path = osp.join(scene_dir, "lowres_wide.traj")
+
+    # Validation if necessary resources exist
+    required = (depth_dir, rgb_dir, intrinsics_dir, traj_path)
+    if not all(osp.exists(path) for path in required):
+        return None
+
+    scene_metadata_path = osp.join(out_scene_subdir, "scene_metadata.npz")
+    if osp.isfile(scene_metadata_path):
+        print(f"Skipping {scene_subdir}")
+        return scene_subdir
+
+    print(f"parsing {scene_subdir}")
+    depth_files = sorted(os.listdir(depth_dir))
+    img_files = sorted(os.listdir(rgb_dir))
+
+    # loads traj
+    timestamps, poses, quaternions, poses_cam_to_world = read_traj(traj_path)
+    poses = np.array(poses)
+    quaternions = np.array(quaternions, dtype=np.quaternion)
+    quaternions = quaternion.unflip_rotors(quaternions)
+    timestamps = np.array(timestamps)
+
+    # depth files are named "<scene>_<timestamp>.png"; order them by timestamp
+    all_depths = sorted(
+        (
+            (basename, basename.split(".png")[0].split("_")[1])
+            for basename in depth_files
+        ),
+        key=lambda x: float(x[1]),
+    )
+
+    # keep only frames whose timestamp can be interpolated from the trajectory
+    timestamp_min = timestamps.min()
+    timestamp_max = timestamps.max()
+    selected_depths = []
+    timestamps_selected = []
+    for basename, frame_id in all_depths:
+        frame_id = float(frame_id)
+        if frame_id < timestamp_min or frame_id > timestamp_max:
+            continue
+        selected_depths.append((basename, frame_id))
+        timestamps_selected.append(frame_id)
+
+    sky_direction_scene, trajectories, intrinsics, images, depths = (
+        convert_scene_metadata(
+            scene_subdir,
+            intrinsics_dir,
+            timestamps,
+            quaternions,
+            poses,
+            poses_cam_to_world,
+            img_files,
+            selected_depths,
+            timestamps_selected,
+        )
+    )
+
+    if len(images) == 0:
+        print(f"Skipping {scene_subdir}")
+        return None
+
+    os.makedirs(osp.join(out_scene_subdir, "vga_wide"), exist_ok=True)
+    os.makedirs(osp.join(out_scene_subdir, "highres_depth"), exist_ok=True)
+
+    # Pillow >= 9.1 exposes the constants on Image.Transpose; older releases
+    # only have them on the Image module itself.
+    transpose_ops = getattr(Image, "Transpose", Image)
+    # (PIL transpose op, matching cv2 rotation) per sky direction; "UP" needs none
+    rotations = {
+        "RIGHT": (transpose_ops.ROTATE_90, cv2.ROTATE_90_COUNTERCLOCKWISE),
+        "LEFT": (transpose_ops.ROTATE_270, cv2.ROTATE_90_CLOCKWISE),
+        "DOWN": (transpose_ops.ROTATE_180, cv2.ROTATE_180),
+    }
+    rotation = rotations.get(sky_direction_scene)
+
+    for image_name, depth_name in zip(images, depths):
+        img_out = osp.join(
+            out_scene_subdir, "vga_wide", image_name.replace(".png", ".jpg")
+        )
+        depth_out = osp.join(out_scene_subdir, "highres_depth", depth_name)
+        if osp.isfile(img_out) and osp.isfile(depth_out):
+            continue
+
+        vga_wide_path = osp.join(rgb_dir, image_name)
+        depth_path = osp.join(depth_dir, depth_name)
+        if not osp.isfile(vga_wide_path) or not osp.isfile(depth_path):
+            continue
+
+        depth = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED)
+        if rotation is not None:
+            depth = cv2.rotate(depth, rotation[1])
+
+        # context manager so the source file handle is released per frame
+        with Image.open(vga_wide_path) as img:
+            if rotation is not None:
+                img = img.transpose(rotation[0])
+            W, H = img.size
+            if not osp.isfile(img_out):
+                img.save(img_out)
+
+        depth = cv2.resize(depth, (W, H), interpolation=cv2.INTER_NEAREST)
+        # avoid destroying the base dataset when you mess up the paths
+        if not osp.isfile(depth_out):
+            cv2.imwrite(depth_out, depth)
+
+    # save at the end
+    np.savez(
+        scene_metadata_path,
+        trajectories=trajectories,
+        intrinsics=intrinsics,
+        images=images,
+    )
+    return scene_subdir
+
+
+def _nearest_first_offsets(search_interval):
+    """Yield 0, -1, +1, -2, +2, ... up to +/- *search_interval*."""
+    yield 0
+    for step in range(1, search_interval + 1):
+        yield -step
+        yield step
+
+
+def convert_scene_metadata(
+    scene_subdir,
+    intrinsics_dir,
+    timestamps,
+    quaternions,
+    poses,
+    poses_cam_to_world,
+    all_images,
+    selected_depths,
+    timestamps_selected,
+):
+    """Compute pose and intrinsics for every usable depth frame of a scene.
+
+    For each ``(basename, frame_id)`` in *selected_depths* the function needs
+    a ``.pincam`` file within 0.1 s of the frame (the nearest timestamp wins)
+    and an entry of *all_images* within 1 ms; frames lacking either are
+    skipped with a message.  File names are always built with three decimals,
+    so a float frame id such as ``3419.68`` matches ``<scene>_3419.680.png``.
+
+    Returns ``(sky_direction_scene, trajectories, intrinsics, images,
+    depths)``.  ``images`` and ``depths`` both hold the depth basename of each
+    kept frame.  Width/height, focal lengths and principal point are swapped
+    when the sky direction is ``"LEFT"`` or ``"RIGHT"``.
+    """
+    # find scene orientation
+    sky_direction_scene, rotated_to_cam = find_scene_orientation(poses_cam_to_world)
+    swap_axes = sky_direction_scene in ("RIGHT", "LEFT")
+
+    # find/compute pose for selected timestamps
+    # most images have a valid timestamp / exact pose associated
+    timestamps_selected = np.array(timestamps_selected)
+    spline = interpolate.interp1d(timestamps, poses, kind="linear", axis=0)
+    interpolated_rotations = quaternion.squad(
+        quaternions, timestamps, timestamps_selected
+    )
+    interpolated_positions = spline(timestamps_selected)
+
+    available_images = set(all_images)  # O(1) membership tests in the loop
+    intrinsics_window = int(0.1 / 0.001)  # +/- 0.1 s in 1 ms steps
+    image_window = int(0.001 / 0.001)  # +/- 1 ms
+
+    trajectories = []
+    intrinsics = []
+    images = []
+    depths = []
+    for i, (basename, frame_id) in enumerate(selected_depths):
+        frame_time = float(frame_id)
+
+        intrinsic_fn = None
+        for offset in _nearest_first_offsets(intrinsics_window):
+            candidate = osp.join(
+                intrinsics_dir,
+                f"{scene_subdir}_{frame_time + offset * 0.001:.3f}.pincam",
+            )
+            if osp.exists(candidate):
+                intrinsic_fn = candidate
+                break
+        if intrinsic_fn is None:
+            print(f"Skipping {scene_subdir} {frame_id} (no intrinsics)")
+            continue
+
+        has_image = any(
+            f"{scene_subdir}_{frame_time + offset * 0.001:.3f}.png"
+            in available_images
+            for offset in _nearest_first_offsets(image_window)
+        )
+        if not has_image:
+            print(f"Skipping {scene_subdir} {frame_id}")
+            continue
+
+        w, h, fx, fy, hw, hh = np.loadtxt(intrinsic_fn)  # PINHOLE
+
+        pose = np.eye(4)
+        pose[:3, :3] = quaternion.as_rotation_matrix(interpolated_rotations[i])
+        pose[:3, 3] = interpolated_positions[i]
+
+        images.append(basename)
+        depths.append(basename)
+        if swap_axes:
+            intrinsics.append([h, w, fy, fx, hh, hw])  # swapped intrinsics
+        else:
+            intrinsics.append([w, h, fx, fy, hw, hh])
+        trajectories.append(
+            pose @ rotated_to_cam
+        )  # pose_cam_to_world @ rotated_to_cam = rotated(cam) to world
+
+    return sky_direction_scene, trajectories, intrinsics, images, depths
+
+
+def find_scene_orientation(poses_cam_to_world):
+    """Decide which image side points at the sky and return the matching roll fix.
+
+    The device up and right directions (from ``get_up_vectors`` and
+    ``get_right_vectors``) are averaged over all poses; fixed defaults are
+    used when the list is empty.  Their angles to world +Z are compared:
+    if up is the more horizontal one the scene is ``"LEFT"`` or ``"RIGHT"``,
+    otherwise it is ``"UP"`` or ``"DOWN"``.
+
+    Returns ``(sky_direction_scene, rotated_to_cam)``; ``rotated_to_cam`` is the
+    4x4 inverse of the chosen rotation about the Z axis.
+    """
+    world_up = np.array([[0.0], [0.0], [1.0], [0.0]])
+    pose_count = len(poses_cam_to_world)
+    if pose_count > 0:
+        up_vector = sum(get_up_vectors(p) for p in poses_cam_to_world) / pose_count
+        right_vector = (
+            sum(get_right_vectors(p) for p in poses_cam_to_world) / pose_count
+        )
+    else:
+        up_vector = np.array([[0.0], [-1.0], [0.0], [0.0]])
+        right_vector = np.array([[1.0], [0.0], [0.0], [0.0]])
+
+    def angle_to_world_up(direction):
+        # value between 0, 180 (degrees)
+        cosine = np.clip(np.dot(np.transpose(world_up), direction), -1.0, 1.0)
+        return np.arccos(cosine).item() * 180.0 / np.pi
+
+    up_angle = angle_to_world_up(up_vector)
+    right_angle = angle_to_world_up(right_vector)
+    up_offset = abs(up_angle - 90.0)
+    right_offset = abs(right_angle - 90.0)
+
+    if up_offset < right_offset:
+        # device up is the (more) horizontal axis: the image lies on its side
+        assert up_offset < 45.0
+        if right_angle > 90.0:
+            sky_direction_scene = "LEFT"
+            roll = math.pi / 2.0
+        else:
+            # note that in metadata.csv RIGHT does not exist, but again it's not accurate...
+            # well, turns out there are scenes oriented like this
+            # for example Training/41124801
+            sky_direction_scene = "RIGHT"
+            roll = -math.pi / 2.0
+        cam_to_rotated_q = quaternion.from_rotation_vector([0.0, 0.0, roll])
+    else:
+        # right is close to 90
+        assert right_offset < 45.0
+        if up_angle > 90.0:
+            sky_direction_scene = "DOWN"
+            cam_to_rotated_q = quaternion.from_rotation_vector([0.0, 0.0, math.pi])
+        else:
+            sky_direction_scene = "UP"
+            cam_to_rotated_q = quaternion.quaternion(1, 0, 0, 0)
+
+    cam_to_rotated = np.eye(4)
+    cam_to_rotated[:3, :3] = quaternion.as_rotation_matrix(cam_to_rotated_q)
+    return sky_direction_scene, np.linalg.inv(cam_to_rotated)
+
+
+if __name__ == "__main__":
+    # Script entry point: forward the parsed command-line options to main().
+    cli_options = get_parser().parse_args()
+    main(cli_options.arkitscenes_dir, cli_options.output_dir)
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_bedlam.py b/extern/CUT3R/datasets_preprocess/preprocess_bedlam.py
new file mode 100644
index 0000000000000000000000000000000000000000..436fc3b30bbb17d348611c0770790958ca55be67
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_bedlam.py
@@ -0,0 +1,402 @@
+#!/usr/bin/env python3
+"""
+Process Bedlam scenes by computing camera intrinsics and extrinsics
+from extracted data. The script reads per-scene CSV and image/depth files,
+computes the necessary camera parameters, and saves the resulting camera
+files (as .npz files) in an output directory.
+
+Usage:
+    python preprocess_bedlam.py --root /path/to/extracted_data \
+                             --outdir /path/to/processed_bedlam \
+                             [--num_workers 4]
+"""
+
+import os
+import cv2
+import numpy as np
+import pandas as pd
+from glob import glob
+import shutil
+import OpenEXR  # Ensure OpenEXR is installed
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from tqdm import tqdm
+import argparse
+
+# Enable OpenEXR support in OpenCV.
+# NOTE(review): this runs after `import cv2` above; confirm OpenCV still
+# honours it when set this late. No cv2 EXR read is visible in this file
+# (process_seq reads depth through the OpenEXR package).
+os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
+
+# Global constants
+IMG_FORMAT = ".png"  # extension get_params globs for
+rotate_flag = False  # not referenced anywhere else in this file
+SENSOR_W = 36  # sensor width handed to get_cam_int (presumably mm; confirm)
+SENSOR_H = 20.25  # sensor height handed to get_cam_int (presumably mm; confirm)
+IMG_W = 1280  # image width in pixels; half of it is used as cx
+IMG_H = 720  # image height in pixels; half of it is used as cy
+
+# -----------------------------------------------------------------------------
+# Helper functions for camera parameter conversion
+# -----------------------------------------------------------------------------
+
+
+def focalLength_mm2px(focalLength, dslr_sens, focalPoint):
+    """Return ``focalLength / dslr_sens * focalPoint * 2`` (focal length in pixels)."""
+    sensor_fraction = focalLength / dslr_sens
+    return sensor_fraction * focalPoint * 2
+
+
+def get_cam_int(fl, sens_w, sens_h, cx, cy):
+    """Build the 3x3 intrinsics matrix ``[[fx, 0, cx], [0, fy, cy], [0, 0, 1]]``.
+
+    ``fx`` and ``fy`` come from ``focalLength_mm2px`` using the sensor width
+    with *cx* and the sensor height with *cy* respectively.
+    """
+    focal_x = focalLength_mm2px(fl, sens_w, cx)
+    focal_y = focalLength_mm2px(fl, sens_h, cy)
+    rows = [[focal_x, 0, cx], [0, focal_y, cy], [0, 0, 1]]
+    return np.array(rows)
+
+
+def unreal2cv2(points):
+    """Turn rows ``(x, y, z)`` of an (N, 3) array into ``(y, -z, x)``."""
+    # Rolling by two along axis 1 moves column 1 to the front: (x, y, z) -> (y, z, x).
+    permuted = np.roll(points, 2, axis=1)
+    # Flip the sign of the middle column.
+    return permuted * np.array([1.0, -1.0, 1.0])
+
+
+def get_cam_trans(body_trans, cam_trans):
+    """Return body position minus camera position as a (1, 3) array.
+
+    Both inputs are divided by 100 and passed through ``unreal2cv2`` before
+    the subtraction.
+    """
+
+    def to_cv(vec):
+        return unreal2cv2(np.reshape(np.array(vec) / 100, (1, 3)))
+
+    cam_cv = to_cv(cam_trans)
+    body_cv = to_cv(body_trans)
+    return body_cv - cam_cv
+
+
+def get_cam_rotmat(pitch, yaw, roll):
+    """Compose ``roll @ (pitch @ yaw)`` from angles given in degrees.
+
+    Yaw is a rotation vector along the second axis, pitch along the first
+    and roll along the third; each is converted with ``cv2.Rodrigues``.
+    """
+    yaw_vec = np.array([[0, (yaw / 180) * np.pi, 0]], dtype=float)
+    pitch_vec = np.array([pitch / 180 * np.pi, 0, 0]).reshape(3, 1)
+    roll_vec = np.array([0, 0, roll / 180 * np.pi]).reshape(3, 1)
+    rot_yaw, rot_pitch, rot_roll = (
+        cv2.Rodrigues(vec)[0] for vec in (yaw_vec, pitch_vec, roll_vec)
+    )
+    return rot_roll @ (rot_pitch @ rot_yaw)
+
+
+def get_global_orient(cam_pitch, cam_yaw, cam_roll):
+    """Compose ``roll @ pitch`` from angles given in degrees.
+
+    *cam_yaw* is accepted but does not enter the result.
+    """
+    pitch_vec = np.array([cam_pitch / 180 * np.pi, 0, 0]).reshape(3, 1)
+    roll_vec = np.array([0, 0, cam_roll / 180 * np.pi]).reshape(3, 1)
+    pitch_rotmat = cv2.Rodrigues(pitch_vec)[0]
+    roll_rotmat = cv2.Rodrigues(roll_vec)[0]
+    return roll_rotmat @ pitch_rotmat
+
+
+def convert_translation_to_opencv(x, y, z):
+    """Return the array ``[y, -z, x]``."""
+    return np.array([y, -z, x])
+
+
+def rotation_matrix_unreal(yaw, pitch, roll):
+    """Return ``R_roll @ R_pitch @ R_yaw`` for angles given in degrees.
+
+    Yaw rotates about the third axis with its angle negated, pitch about the
+    second axis and roll about the first.
+    """
+    # Yaw (left-handed): the angle is negated before use.
+    yaw_rad = -np.deg2rad(yaw)
+    cos_y, sin_y = np.cos(yaw_rad), np.sin(yaw_rad)
+    R_yaw = np.array([[cos_y, -sin_y, 0], [sin_y, cos_y, 0], [0, 0, 1]])
+
+    # Pitch (right-handed)
+    pitch_rad = np.deg2rad(pitch)
+    cos_p, sin_p = np.cos(pitch_rad), np.sin(pitch_rad)
+    R_pitch = np.array([[cos_p, 0, sin_p], [0, 1, 0], [-sin_p, 0, cos_p]])
+
+    # Roll (right-handed)
+    roll_rad = np.deg2rad(roll)
+    cos_r, sin_r = np.cos(roll_rad), np.sin(roll_rad)
+    R_roll = np.array([[1, 0, 0], [0, cos_r, -sin_r], [0, sin_r, cos_r]])
+
+    return R_roll @ R_pitch @ R_yaw
+
+
+def convert_rotation_to_opencv(R_unreal):
+    """Re-express a 3x3 rotation in the OpenCV axes: ``C @ R_unreal @ C.T``."""
+    # Transformation matrix from Unreal to OpenCV coordinate system.
+    axes_change = np.array([[0, 1, 0], [0, 0, -1], [1, 0, 0]])
+    return axes_change @ R_unreal @ axes_change.T
+
+
+def get_rot_unreal(yaw, pitch, roll):
+    """Return ``R_yaw @ R_pitch @ R_roll`` for angles given in degrees.
+
+    Yaw rotates about the third axis, pitch about the second and roll about
+    the first; the pitch and roll matrices carry the opposite sine signs to
+    the ones in ``rotation_matrix_unreal``.
+    """
+    yaw_rad, pitch_rad, roll_rad = np.deg2rad(yaw), np.deg2rad(pitch), np.deg2rad(roll)
+
+    cos_y, sin_y = np.cos(yaw_rad), np.sin(yaw_rad)
+    R_yaw = np.array([[cos_y, -sin_y, 0], [sin_y, cos_y, 0], [0, 0, 1]])
+
+    cos_p, sin_p = np.cos(pitch_rad), np.sin(pitch_rad)
+    R_pitch = np.array([[cos_p, 0, -sin_p], [0, 1, 0], [sin_p, 0, cos_p]])
+
+    cos_r, sin_r = np.cos(roll_rad), np.sin(roll_rad)
+    R_roll = np.array([[1, 0, 0], [0, cos_r, sin_r], [0, -sin_r, cos_r]])
+
+    return R_yaw @ R_pitch @ R_roll
+
+
+def get_extrinsics_unreal(R_unreal, t_unreal):
+    """Pack a 3x3 rotation and a 3-element translation into a 4x4 matrix."""
+    translation = np.array(t_unreal).reshape(1, 3)
+    ext = np.eye(4)
+    ext[:3, :3] = R_unreal
+    ext[:3, 3] = translation
+    return ext
+
+
+def get_extrinsics_opencv(yaw, pitch, roll, x, y, z):
+    """Return the inverse of ``T_opencv2unreal @ T_u2wu @ T_wu2ou`` (4x4).
+
+    ``T_u2wu`` is built by ``get_extrinsics_unreal`` from ``get_rot_unreal`` and
+    the position divided by 100; the other two factors are fixed axis
+    permutation matrices.
+    """
+    position = np.array([x / 100.0, y / 100.0, z / 100.0])
+    T_u2wu = get_extrinsics_unreal(get_rot_unreal(yaw, pitch, roll), position)
+
+    opencv_rows = [[0, 0, -1, 0], [1, 0, 0, 0], [0, -1, 0, 0], [0, 0, 0, 1]]
+    world_rows = [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]
+    T_opencv2unreal = np.array(opencv_rows, dtype=np.float32)
+    T_wu2ou = np.array(world_rows, dtype=np.float32)
+
+    return np.linalg.inv(T_opencv2unreal @ T_u2wu @ T_wu2ou)
+
+
+# -----------------------------------------------------------------------------
+# Get camera parameters from the extracted images and CSV data.
+# -----------------------------------------------------------------------------
+
+
+def get_params(
+    image_folder,
+    fl,
+    trans_body,
+    cam_x,
+    cam_y,
+    cam_z,
+    fps,
+    cam_pitch_,
+    cam_roll_,
+    cam_yaw_,
+):
+    """Compute intrinsics and extrinsics for every 5th image of a sequence.
+
+    The images are the sorted ``*.png`` files of *image_folder*; image index
+    ``i`` reads entry ``i`` of the per-frame lists (*fl*, *cam_x*, *cam_y*,
+    *cam_z*, *cam_pitch_*, *cam_roll_*, *cam_yaw_*).  Positions are divided
+    by 100 (presumably cm -> m; confirm against the CSV units).
+    *trans_body* and *fps* are accepted but not used.
+
+    Returns ``(imgnames, cam_ext, cam_int)``: for each kept image its
+    ``<parent folder>/<file name>`` relative path, its 4x4 extrinsic matrix
+    and its 3x3 intrinsic matrix.
+    """
+    all_images = sorted(glob(os.path.join(image_folder, "*" + IMG_FORMAT)))
+    imgnames, cam_ext, cam_int = [], [], []
+
+    for cam_ind, image_path in enumerate(all_images):
+        # Process every 5th frame.
+        if cam_ind % 5 != 0:
+            continue
+
+        CAM_INT = get_cam_int(fl[cam_ind], SENSOR_W, SENSOR_H, IMG_W / 2.0, IMG_H / 2.0)
+
+        rot_unreal = rotation_matrix_unreal(
+            cam_yaw_[cam_ind], cam_pitch_[cam_ind], cam_roll_[cam_ind]
+        )
+        rot_cv = convert_rotation_to_opencv(rot_unreal)
+        trans_cv = convert_translation_to_opencv(
+            cam_x[cam_ind] / 100.0, cam_y[cam_ind] / 100.0, cam_z[cam_ind] / 100.0
+        )
+        cam_ext_ = np.eye(4)
+        cam_ext_[:3, :3] = rot_cv
+        # The translation column is the rotated, negated camera position.
+        cam_ext_[:3, 3] = -rot_cv @ trans_cv
+
+        # os.path instead of splitting on "/" so the name is also correct
+        # where glob returns a different path separator.
+        folder_name = os.path.basename(os.path.dirname(image_path))
+        imgnames.append(os.path.join(folder_name, os.path.basename(image_path)))
+        cam_ext.append(cam_ext_)
+        cam_int.append(CAM_INT)
+    return imgnames, cam_ext, cam_int
+
+
+# -----------------------------------------------------------------------------
+# Processing per sequence.
+# -----------------------------------------------------------------------------
+
+
+def process_seq(args):
+    """
+    Export one sequence. *args* is the tuple ``(scene, seq_name, outdir,
+    image_folder_base, depth_folder_base, imgnames, cam_ext, cam_int)``.
+
+    For each image the matching ``*_depth.exr`` file is read and divided by
+    100, the image is copied to ``rgb/``, the depth is saved as ``.npy`` under
+    ``depth/`` and an ``.npz`` with the intrinsics and the inverse of the
+    extrinsic matrix is saved under ``cam/``, all inside
+    ``<outdir>/<scene>_<seq_name>``.  Always returns ``None``.
+    """
+    (
+        scene,
+        seq_name,
+        outdir,
+        image_folder_base,
+        depth_folder_base,
+        imgnames,
+        cam_ext,
+        cam_int,
+    ) = args
+
+    seq_out_dir = os.path.join(outdir, "_".join([scene, seq_name]))
+    out_rgb_dir = os.path.join(seq_out_dir, "rgb")
+    out_depth_dir = os.path.join(seq_out_dir, "depth")
+    out_cam_dir = os.path.join(seq_out_dir, "cam")
+    for directory in (out_rgb_dir, out_depth_dir, out_cam_dir):
+        os.makedirs(directory, exist_ok=True)
+
+    assert (
+        len(imgnames) == len(cam_ext) == len(cam_int)
+    ), f"Inconsistent lengths for {scene}_{seq_name}"
+    for imgname, ext, intr in zip(imgnames, cam_ext, cam_int):
+        imgpath = os.path.join(image_folder_base, imgname)
+        depthpath = os.path.join(
+            depth_folder_base, imgname.replace(".png", "_depth.exr")
+        )
+        # Read the depth first so nothing is written if the EXR is unreadable.
+        raw_depth = OpenEXR.File(depthpath).parts[0].channels["Depth"].pixels
+        depth = raw_depth.astype(np.float32) / 100.0
+
+        frame_file = os.path.basename(imgpath)
+        outimg_path = os.path.join(out_rgb_dir, frame_file)
+        outdepth_path = os.path.join(out_depth_dir, frame_file.replace(".png", ".npy"))
+        outcam_path = os.path.join(out_cam_dir, frame_file.replace(".png", ".npz"))
+
+        shutil.copy(imgpath, outimg_path)
+        np.save(outdepth_path, depth)
+        np.savez(outcam_path, intrinsics=intr, pose=np.linalg.inv(ext))
+    return None
+
+
+# -----------------------------------------------------------------------------
+# Main entry point.
+# -----------------------------------------------------------------------------
+
+
+def main():
+    """Collect one sequence per Bedlam scene and export them in parallel.
+
+    Scenes are the sub-directories of ``--root`` except the listed HDRI
+    scenes and any whose name contains ``closeup``.  For each scene the first
+    ``sequence_name`` entry of ``be_seq.csv`` that has a camera CSV is turned
+    into a task for ``process_seq``; tasks then run in a process pool.
+
+    The default worker count is half the CPU count but never less than 1
+    (``ProcessPoolExecutor`` rejects 0, which the plain halving produced on
+    single-core hosts).
+    """
+    parser = argparse.ArgumentParser(
+        description="Process Bedlam scenes: compute camera intrinsics and extrinsics, "
+        "and save processed camera files."
+    )
+    parser.add_argument(
+        "--root",
+        type=str,
+        required=True,
+        help="Root directory of the extracted data (scenes).",
+    )
+    parser.add_argument(
+        "--outdir", type=str, required=True, help="Output directory for processed data."
+    )
+    parser.add_argument(
+        "--num_workers",
+        type=int,
+        default=None,
+        help="Number of worker processes (default: os.cpu_count()//2).",
+    )
+    args = parser.parse_args()
+
+    root = args.root
+    outdir = args.outdir
+    if args.num_workers is not None:
+        num_workers = args.num_workers
+    else:
+        num_workers = max(1, (os.cpu_count() or 4) // 2)
+
+    # Exclude HDRI scenes.
+    hdri_scenes = {
+        "20221010_3_1000_batch01hand",
+        "20221017_3_1000_batch01hand",
+        "20221018_3-8_250_batch01hand",
+        "20221019_3_250_highbmihand",
+    }
+    # Get scene directories from the root folder.
+    scenes = sorted(
+        d
+        for d in os.listdir(root)
+        if os.path.isdir(os.path.join(root, d)) and d not in hdri_scenes
+    )
+
+    tasks = []
+    for scene in tqdm(scenes, desc="Collecting tasks"):
+        # Skip closeup scenes.
+        if "closeup" in scene:
+            continue
+        base_folder = os.path.join(root, scene)
+        image_folder_base = os.path.join(base_folder, "png")
+        depth_folder_base = os.path.join(base_folder, "depth")
+        csv_path = os.path.join(base_folder, "be_seq.csv")
+        if not os.path.exists(csv_path):
+            continue
+        csv_data = pd.read_csv(csv_path).to_dict("list")
+        cam_csv_base = os.path.join(base_folder, "ground_truth", "camera")
+
+        # Look for a row in the CSV with a "sequence_name" comment.
+        for comment in csv_data.get("Comment", []):
+            # Empty cells are read as NaN (a float), which has no `in`.
+            if not isinstance(comment, str) or "sequence_name" not in comment:
+                continue
+            seq_name = comment.split(";")[0].split("=")[-1]
+            cam_csv_path = os.path.join(cam_csv_base, seq_name + "_camera.csv")
+            if not os.path.exists(cam_csv_path):
+                continue
+            cam_csv_data = pd.read_csv(cam_csv_path).to_dict("list")
+            image_folder = os.path.join(image_folder_base, seq_name)
+            trans_body = None  # Not used here.
+            imgnames, cam_ext, cam_int = get_params(
+                image_folder,
+                cam_csv_data["focal_length"],
+                trans_body,
+                cam_csv_data["x"],
+                cam_csv_data["y"],
+                cam_csv_data["z"],
+                6,
+                cam_pitch_=cam_csv_data["pitch"],
+                cam_roll_=cam_csv_data["roll"],
+                cam_yaw_=cam_csv_data["yaw"],
+            )
+            tasks.append(
+                (
+                    scene,
+                    seq_name,
+                    outdir,
+                    image_folder_base,
+                    depth_folder_base,
+                    imgnames,
+                    cam_ext,
+                    cam_int,
+                )
+            )
+            # Process only the first valid sequence for this scene.
+            break
+
+    # Process each task in parallel.
+    with ProcessPoolExecutor(max_workers=num_workers) as executor:
+        futures = {executor.submit(process_seq, task): task for task in tasks}
+        for future in tqdm(
+            as_completed(futures), total=len(futures), desc="Processing sequences"
+        ):
+            # result() re-raises any exception from the worker.
+            error = future.result()
+            if error:
+                print(error)
+
+
+if __name__ == "__main__":
+    # Script entry point; main() parses the command line itself.
+    main()
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_blendedmvs.py b/extern/CUT3R/datasets_preprocess/preprocess_blendedmvs.py
new file mode 100644
index 0000000000000000000000000000000000000000..099f2d62ed909f177a468d977778569636164b28
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_blendedmvs.py
@@ -0,0 +1,168 @@
+#!/usr/bin/env python3
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# Preprocessing code for the BlendedMVS dataset
+# dataset at https://github.com/YoYo000/BlendedMVS
+# 1) Download BlendedMVS.zip
+# 2) Download BlendedMVS+.zip
+# 3) Download BlendedMVS++.zip
+# 4) Unzip everything in the same /path/to/tmp/blendedMVS/ directory
+# 5) python datasets_preprocess/preprocess_blendedmvs.py --blendedmvs_dir /path/to/tmp/blendedMVS/ --precomputed_pairs /path/to/pairs.npy
+# --------------------------------------------------------
+import os
+import os.path as osp
+import re
+from tqdm import tqdm
+import numpy as np
+
+os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
+import cv2
+
+import path_to_root  # noqa
+from datasets_preprocess.utils.parallel import parallel_threads
+from datasets_preprocess.utils import cropping  # noqa
+
+
+def get_parser():
+    """Build the command-line parser for the BlendedMVS preprocessing script.
+
+    Returns:
+        An ``argparse.ArgumentParser`` exposing ``--blendedmvs_dir`` and
+        ``--precomputed_pairs`` (both required) plus ``--output_dir``.
+    """
+    from argparse import ArgumentParser
+
+    cli = ArgumentParser()
+    for required_flag in ("--blendedmvs_dir", "--precomputed_pairs"):
+        cli.add_argument(required_flag, required=True)
+    cli.add_argument("--output_dir", default="data/blendedmvs_processed")
+    return cli
+
+
+def main(db_root, pairs_path, output_dir):
+    """Rescale every BlendedMVS view, then check the precomputed pairs.
+
+    Args:
+        db_root: directory holding one sub-directory per sequence; only
+            entries whose name is exactly 24 characters long are treated as
+            sequences.
+        pairs_path: file loaded with ``np.load``; each row is unpacked as
+            ``(seqh, seql, img1, img2, score)``.
+        output_dir: root directory that receives one folder per sequence.
+
+    Raises:
+        AssertionError: if no sequence is found under ``db_root``, or if an
+            image referenced by a pair is missing from ``output_dir``.
+    """
+    cam_suffix = "_cam.txt"
+
+    print(">> Listing all sequences")
+    sequences = [f for f in os.listdir(db_root) if len(f) == 24]
+    # should find 502 scenes
+    assert sequences, f"did not found any sequences at {db_root}"
+    print(f"   (found {len(sequences)} sequences)")
+
+    for seq in tqdm(sequences):
+        out_dir = osp.join(output_dir, seq)
+        os.makedirs(out_dir, exist_ok=True)
+
+        # generate the crops
+        root = osp.join(db_root, seq)
+        cam_dir = osp.join(root, "cams")
+        # Only "<view>_cam.txt" files name a view. Checking the suffix before
+        # stripping it keeps stray files in cams/ from producing bogus view
+        # names that would fail later inside load_crop_and_save.
+        func_args = [
+            (root, f[: -len(cam_suffix)], out_dir)
+            for f in os.listdir(cam_dir)
+            if f.endswith(cam_suffix) and not f.startswith("pair")
+        ]
+        parallel_threads(load_crop_and_save, func_args, star_args=True, leave=False)
+
+    # verify that all pairs are there
+    pairs = np.load(pairs_path)
+    for seqh, seql, img1, img2, score in tqdm(pairs):
+        for view_index in [img1, img2]:
+            # The sequence folder name is the two hex-formatted halves joined.
+            impath = osp.join(
+                output_dir, f"{seqh:08x}{seql:016x}", f"{view_index:08n}.jpg"
+            )
+            assert osp.isfile(impath), f"missing image at {impath=}"
+
+    print(f">> Done, saved everything in {output_dir}/")
+
+
+def load_crop_and_save(root, img, out_dir):
+    """Rescale one BlendedMVS view and write its image, depth and camera files.
+
+    Reads ``cams/<img>_cam.txt``, ``blended_images/<img>.jpg`` and
+    ``rendered_depth_maps/<img>.pfm`` under ``root``, rescales image and depth
+    to 512x384, and writes ``<img>.jpg``, ``<img>.exr`` and ``<img>.npz`` into
+    ``out_dir``. The ``.npz`` is written last and doubles as the "already
+    done" marker, so a view whose ``.npz`` exists is skipped.
+
+    Args:
+        root: directory of a single sequence.
+        img: view name without extension.
+        out_dir: destination directory for this sequence.
+
+    Raises:
+        FileNotFoundError: if the colour image cannot be read.
+        AssertionError: if the input image is not 4:3.
+    """
+    if osp.isfile(osp.join(out_dir, img + ".npz")):
+        return  # already done
+
+    # load everything
+    intrinsics_in, R_camin2world, t_camin2world = _load_pose(
+        osp.join(root, "cams", img + "_cam.txt")
+    )
+    image_path = osp.join(root, "blended_images", img + ".jpg")
+    bgr_image = cv2.imread(image_path, cv2.IMREAD_COLOR)
+    if bgr_image is None:
+        # cv2.imread reports a missing or undecodable file by returning None;
+        # fail here with the path instead of an opaque error from cvtColor.
+        raise FileNotFoundError(f"could not read image at {image_path}")
+    color_image_in = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
+    depthmap_in = load_pfm_file(osp.join(root, "rendered_depth_maps", img + ".pfm"))
+
+    # do the crop
+    H, W = color_image_in.shape[:2]
+    assert H * 4 == W * 3
+    image, depthmap, intrinsics_out, R_in2out = _crop_image(
+        intrinsics_in, color_image_in, depthmap_in, (512, 384)
+    )
+
+    # write everything
+    # NOTE(review): `image` is presumably a PIL image (it exposes .save) - confirm
+    # against cropping.rescale_image_depthmap.
+    image.save(osp.join(out_dir, img + ".jpg"), quality=80)
+    cv2.imwrite(osp.join(out_dir, img + ".exr"), depthmap)
+
+    # New camera parameters
+    R_camout2world = R_camin2world @ R_in2out.T
+    t_camout2world = t_camin2world
+    np.savez(
+        osp.join(out_dir, img + ".npz"),
+        intrinsics=intrinsics_out,
+        R_cam2world=R_camout2world,
+        t_cam2world=t_camout2world,
+    )
+
+
+def _crop_image(intrinsics_in, color_image_in, depthmap_in, resolution_out=(800, 800)):
+    """Rescale an image/depth pair to ``resolution_out`` without rotating the camera.
+
+    Returns:
+        ``(image, depthmap, intrinsics_out, R_in2out)``: the first three come
+        from ``cropping.rescale_image_depthmap``; ``R_in2out`` is the 3x3
+        identity.
+    """
+    rescaled = cropping.rescale_image_depthmap(
+        color_image_in, depthmap_in, intrinsics_in, resolution_out
+    )
+    image, depthmap, intrinsics_out = rescaled
+    return image, depthmap, intrinsics_out, np.eye(3)
+
+
+def _load_pose(path, ret_44=False):
+    """Read camera extrinsics and intrinsics from a ``*_cam.txt`` file.
+
+    The file is read sequentially from a single handle: one line is skipped,
+    a 4x4 matrix is read, two more lines are skipped, then a 3x3 matrix is
+    read. The 4x4 matrix is inverted (world-to-camera to camera-to-world).
+
+    Args:
+        path: path of the camera text file.
+        ret_44: when true, return the full 4x4 camera-to-world matrix instead
+            of its rotation and translation parts.
+
+    Returns:
+        ``(K, RT)`` if ``ret_44`` is true, otherwise ``(K, R, t)`` with
+        ``R = RT[:3, :3]`` and ``t = RT[:3, 3]``. All arrays are float32.
+
+    Raises:
+        AssertionError: if either matrix does not have the expected shape.
+    """
+    # Both loadtxt calls share one handle so the second continues where the
+    # first stopped; the context manager closes the handle (it used to leak).
+    with open(path) as f:
+        RT = np.loadtxt(f, skiprows=1, max_rows=4, dtype=np.float32)
+        assert RT.shape == (4, 4)
+        RT = np.linalg.inv(RT)  # world2cam to cam2world
+
+        K = np.loadtxt(f, skiprows=2, max_rows=3, dtype=np.float32)
+        assert K.shape == (3, 3)
+
+    if ret_44:
+        return K, RT
+    return K, RT[:3, :3], RT[:3, 3]  # , depth_uint8_to_f32
+
+
+def load_pfm_file(file_path):
+    """Load a PFM image and return it as a native float32 array, top row first.
+
+    The header consists of three text lines: the magic (``PF`` for three
+    channels, ``Pf`` for one), ``"<width> <height>"``, and a scale whose sign
+    selects the byte order (negative means little-endian). The remaining
+    bytes are the float samples, stored bottom row first, so the array is
+    flipped vertically before being returned.
+
+    Args:
+        file_path: path of the ``.pfm`` file.
+
+    Returns:
+        A writable, C-contiguous ``np.float32`` array of shape
+        ``(height, width)`` or ``(height, width, 3)``.
+
+    Raises:
+        ValueError: if the magic or the dimension line is malformed.
+    """
+    with open(file_path, "rb") as file:
+        header = file.readline().decode("UTF-8").strip()
+
+        if header == "PF":
+            is_color = True
+        elif header == "Pf":
+            is_color = False
+        else:
+            raise ValueError("The provided file is not a valid PFM file.")
+
+        dimensions = re.match(r"^(\d+)\s(\d+)\s$", file.readline().decode("UTF-8"))
+        if dimensions:
+            img_width, img_height = map(int, dimensions.groups())
+        else:
+            raise ValueError("Invalid PFM header format.")
+
+        endian_scale = float(file.readline().decode("UTF-8").strip())
+        if endian_scale < 0:
+            dtype = "<f"  # little-endian
+        else:
+            dtype = ">f"  # big-endian
+
+        data_buffer = file.read()
+
+    img_data = np.frombuffer(data_buffer, dtype=dtype)
+    if is_color:
+        img_data = np.reshape(img_data, (img_height, img_width, 3))
+    else:
+        img_data = np.reshape(img_data, (img_height, img_width))
+
+    # Flip in NumPy and copy into a native-endian float32 array. This gives the
+    # same result as a vertical cv2.flip for native-order data, and does not
+    # depend on how OpenCV treats byte-swapped (big-endian) input.
+    return np.array(np.flipud(img_data), dtype=np.float32, order="C")
+
+
+# Script entry point: parse the command-line flags and forward them to main().
+if __name__ == "__main__":
+    parser = get_parser()
+    args = parser.parse_args()
+    main(args.blendedmvs_dir, args.precomputed_pairs, args.output_dir)
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_co3d.py b/extern/CUT3R/datasets_preprocess/preprocess_co3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..15938d6b0ed276839d350389fe2075649ef7de46
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_co3d.py
@@ -0,0 +1,391 @@
+#!/usr/bin/env python3
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# Script to pre-process the CO3D dataset.
+# Usage:
+# python3 datasets_preprocess/preprocess_co3d.py --co3d_dir /path/to/co3d
+# --------------------------------------------------------
+
+import argparse
+import random
+import gzip
+import json
+import os
+import os.path as osp
+
+import torch
+import PIL.Image
+import numpy as np
+import cv2
+
+from tqdm.auto import tqdm
+import matplotlib.pyplot as plt
+
+import path_to_root  # noqa
+import datasets_preprocess.utils.cropping as cropping  # noqa
+
+
+# Object categories handled by this script, in a fixed order.
+CATEGORIES = [
+    "apple",
+    "backpack",
+    "ball",
+    "banana",
+    "baseballbat",
+    "baseballglove",
+    "bench",
+    "bicycle",
+    "book",
+    "bottle",
+    "bowl",
+    "broccoli",
+    "cake",
+    "car",
+    "carrot",
+    "cellphone",
+    "chair",
+    "couch",
+    "cup",
+    "donut",
+    "frisbee",
+    "hairdryer",
+    "handbag",
+    "hotdog",
+    "hydrant",
+    "keyboard",
+    "kite",
+    "laptop",
+    "microwave",
+    "motorcycle",
+    "mouse",
+    "orange",
+    "parkingmeter",
+    "pizza",
+    "plant",
+    "remote",
+    "sandwich",
+    "skateboard",
+    "stopsign",
+    "suitcase",
+    "teddybear",
+    "toaster",
+    "toilet",
+    "toybus",
+    "toyplane",
+    "toytrain",
+    "toytruck",
+    "tv",
+    "umbrella",
+    "vase",
+    "wineglass",
+]
+# Position of each category in CATEGORIES; added to the base seed per category.
+CATEGORIES_IDX = {name: position for position, name in enumerate(CATEGORIES)}
+
+# Every category except the three excluded from the single-sequence subset,
+# in alphabetical order.
+SINGLE_SEQUENCE_CATEGORIES = sorted(
+    name for name in CATEGORIES if name not in {"microwave", "stopsign", "tv"}
+)
+
+
+def get_parser():
+    """Create the argument parser for the CO3D preprocessing script.
+
+    Returns:
+        An ``argparse.ArgumentParser``; only ``--co3d_dir`` is required.
+    """
+    img_size_help = (
+        "lower dimension will be >= img_size * 3/4, and max dimension will be >= img_size"
+    )
+
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--category", type=str, default=None)
+    cli.add_argument(
+        "--single_sequence_subset",
+        action="store_true",
+        default=False,
+        help="prepare the single_sequence_subset instead.",
+    )
+    cli.add_argument("--output_dir", type=str, default="data/co3d_processed")
+    cli.add_argument("--co3d_dir", type=str, required=True)
+    cli.add_argument("--num_sequences_per_object", type=int, default=50)
+    cli.add_argument("--seed", type=int, default=42)
+    cli.add_argument(
+        "--min_quality",
+        type=float,
+        default=0.5,
+        help="Minimum viewpoint quality score.",
+    )
+    cli.add_argument("--img_size", type=int, default=512, help=img_size_help)
+    return cli
+
+
+def convert_ndc_to_pinhole(focal_length, principal_point, image_size):
+    """Build a 3x3 float32 pinhole intrinsic matrix from NDC camera parameters.
+
+    Args:
+        focal_length: two values ``(fx, fy)`` in NDC units.
+        principal_point: two values ``(px, py)`` in NDC units.
+        image_size: two values, read as ``(height, width)``.
+
+    Returns:
+        ``K`` with focal lengths scaled by half of the smaller image side and
+        the principal point measured from the image centre.
+    """
+    height, width = image_size[0], image_size[1]
+    half_wh = np.array([width, height]) / 2
+    ndc_scale = half_wh.min()
+
+    focal_px = np.array(focal_length) * ndc_scale
+    center_px = half_wh - np.array(principal_point) * ndc_scale
+
+    K = np.array(
+        [
+            [focal_px[0], 0.0, center_px[0]],
+            [0.0, focal_px[1], center_px[1]],
+            [0.0, 0.0, 1.0],
+        ],
+        dtype=np.float32,
+    )
+    return K
+
+
+def opencv_from_cameras_projection(R, T, focal, p0, image_size):
+    """Convert one camera's NDC-style parameters into rotation, translation and K.
+
+    All arguments are NumPy arrays; they are wrapped as tensors with a leading
+    batch dimension of one, which is stripped again on return. The inputs are
+    not modified (rotation and translation are cloned before being negated).
+
+    Args:
+        R: 3x3 rotation matrix.
+        T: translation vector of length 3.
+        focal: two focal lengths in NDC units.
+        p0: principal point in NDC units.
+        image_size: two values, read as ``(height, width)``.
+
+    Returns:
+        ``(R, tvec, camera_matrix)`` as torch tensors: ``R`` has its first two
+        columns negated and is then transposed, ``tvec`` has its first two
+        components negated, and ``camera_matrix`` is the 3x3 pixel-space
+        intrinsic matrix.
+    """
+    rotation = torch.from_numpy(R).unsqueeze(0).clone()
+    tvec = torch.from_numpy(T).unsqueeze(0).clone()
+    focal_ndc = torch.from_numpy(focal).unsqueeze(0)
+    principal_ndc = torch.from_numpy(p0).unsqueeze(0)
+    size_hw = torch.from_numpy(image_size).unsqueeze(0)
+
+    # Flip the sign of the x/y axes, then transpose the rotation.
+    tvec[:, :2] *= -1
+    rotation[:, :, :2] *= -1
+    rotation = rotation.permute(0, 2, 1)
+
+    # Retype the image_size correctly and flip to width, height.
+    size_wh = size_hw.to(rotation).flip(dims=(1,))
+
+    # NDC to screen conversion: one NDC unit is half of the smaller image side.
+    ndc_scale = (size_wh.to(rotation).min(dim=1, keepdim=True)[0] / 2.0).expand(-1, 2)
+    image_center = size_wh / 2.0
+    principal_px = -principal_ndc * ndc_scale + image_center
+    focal_px = focal_ndc * ndc_scale
+
+    camera_matrix = torch.zeros_like(rotation)
+    camera_matrix[:, 0, 0] = focal_px[:, 0]
+    camera_matrix[:, 1, 1] = focal_px[:, 1]
+    camera_matrix[:, :2, 2] = principal_px
+    camera_matrix[:, 2, 2] = 1.0
+    return rotation[0], tvec[0], camera_matrix[0]
+
+
+def get_set_list(category_dir, split, is_single_sequence_subset=False):
+    """Collect the ``split`` entries of the matching set-list files of a category.
+
+    Args:
+        category_dir: category directory containing a ``set_lists`` folder.
+        split: key looked up in every selected JSON file.
+        is_single_sequence_subset: select files whose name contains
+            ``manyview_dev`` instead of ``fewview_train``.
+
+    Returns:
+        The concatenation of ``data[split]`` over all selected files, in
+        directory-listing order.
+    """
+    set_lists_dir = osp.join(category_dir, "set_lists")
+    # not all objects have manyview_dev
+    wanted_tag = "manyview_dev" if is_single_sequence_subset else "fewview_train"
+
+    sequences_all = []
+    for list_name in os.listdir(set_lists_dir):
+        if wanted_tag not in list_name:
+            continue
+        with open(osp.join(set_lists_dir, list_name)) as f:
+            sequences_all.extend(json.load(f)[split])
+
+    return sequences_all
+
+
+def prepare_sequences(
+    category,
+    co3d_dir,
+    output_dir,
+    img_size,
+    split,
+    min_quality,
+    max_num_sequences_per_object,
+    seed,
+    is_single_sequence_subset=False,
+):
+    """Crop, rescale and save all selected frames of one category and split.
+
+    Sequences of ``category`` listed for ``split`` are kept when their
+    ``viewpoint_quality_score`` exceeds ``min_quality``. If at least
+    ``max_num_sequences_per_object`` remain, that many are drawn with
+    ``random.sample`` after seeding ``random`` with ``seed``. Every frame of
+    the selected sequences is cropped around its principal point, rescaled,
+    and written under ``output_dir`` together with its depth map, mask and an
+    ``.npz`` holding ``camera_intrinsics``, ``camera_pose`` and
+    ``maximum_depth``.
+
+    Args:
+        category: category name (sub-directory of ``co3d_dir``).
+        co3d_dir: root of the raw dataset.
+        output_dir: root of the processed dataset.
+        img_size: target size; see the rescaling comments in the loop.
+        split: split key looked up in the set-list files.
+        min_quality: exclusive lower bound on the viewpoint quality score.
+        max_num_sequences_per_object: maximum number of sequences to keep.
+        seed: seed passed to ``random.seed``.
+        is_single_sequence_subset: forwarded to ``get_set_list``.
+
+    Returns:
+        Dict mapping each selected sequence name to the list of frame indices
+        parsed from the processed file names.
+
+    Raises:
+        AssertionError: if a frame's depth ``scale_adjustment`` is not 1.0.
+    """
+    random.seed(seed)
+    category_dir = osp.join(co3d_dir, category)
+    sequences_all = get_set_list(category_dir, split, is_single_sequence_subset)
+    sequences_numbers = sorted(set(seq_name for seq_name, _, _ in sequences_all))
+
+    frame_file = osp.join(category_dir, "frame_annotations.jgz")
+    sequence_file = osp.join(category_dir, "sequence_annotations.jgz")
+
+    with gzip.open(frame_file, "r") as fin:
+        frame_annotations = json.loads(fin.read())
+    with gzip.open(sequence_file, "r") as fin:
+        sequence_data = json.loads(fin.read())
+
+    # Index the per-frame annotations as {sequence_name: {frame_number: annotation}}.
+    frame_data_processed = {}
+    for f_data in frame_annotations:
+        sequence_name = f_data["sequence_name"]
+        frame_data_processed.setdefault(sequence_name, {})[
+            f_data["frame_number"]
+        ] = f_data
+
+    good_quality_sequences = set()
+    for seq_data in sequence_data:
+        if seq_data["viewpoint_quality_score"] > min_quality:
+            good_quality_sequences.add(seq_data["sequence_name"])
+
+    sequences_numbers = [
+        seq_name for seq_name in sequences_numbers if seq_name in good_quality_sequences
+    ]
+    if len(sequences_numbers) < max_num_sequences_per_object:
+        selected_sequences_numbers = sequences_numbers
+    else:
+        selected_sequences_numbers = random.sample(
+            sequences_numbers, max_num_sequences_per_object
+        )
+
+    selected_sequences_numbers_dict = {
+        seq_name: [] for seq_name in selected_sequences_numbers
+    }
+    sequences_all = [
+        (seq_name, frame_number, filepath)
+        for seq_name, frame_number, filepath in sequences_all
+        if seq_name in selected_sequences_numbers_dict
+    ]
+
+    for seq_name, frame_number, filepath in tqdm(sequences_all):
+        # The frame index is the file name minus its first 5 characters and
+        # its 4-character extension.
+        frame_idx = int(filepath.split("/")[-1][5:-4])
+        selected_sequences_numbers_dict[seq_name].append(frame_idx)
+        mask_path = filepath.replace("images", "masks").replace(".jpg", ".png")
+        frame_data = frame_data_processed[seq_name][frame_number]
+        focal_length = frame_data["viewpoint"]["focal_length"]
+        principal_point = frame_data["viewpoint"]["principal_point"]
+        image_size = frame_data["image"]["size"]
+        R, tvec, camera_intrinsics = opencv_from_cameras_projection(
+            np.array(frame_data["viewpoint"]["R"]),
+            np.array(frame_data["viewpoint"]["T"]),
+            np.array(focal_length),
+            np.array(principal_point),
+            np.array(image_size),
+        )
+
+        depth_path = os.path.join(co3d_dir, frame_data["depth"]["path"])
+        assert frame_data["depth"]["scale_adjustment"] == 1.0
+        image_path = os.path.join(co3d_dir, filepath)
+        mask_path_full = os.path.join(co3d_dir, mask_path)
+
+        input_rgb_image = PIL.Image.open(image_path).convert("RGB")
+        input_mask = plt.imread(mask_path_full)
+
+        with PIL.Image.open(depth_path) as depth_pil:
+            # the image is stored with 16-bit depth but PIL reads it as I (32 bit).
+            # we cast it to uint16, then reinterpret as float16, then cast to float32
+            input_depthmap = (
+                np.frombuffer(np.array(depth_pil, dtype=np.uint16), dtype=np.float16)
+                .astype(np.float32)
+                .reshape((depth_pil.size[1], depth_pil.size[0]))
+            )
+        # Depth and mask travel together as two channels through crop/rescale.
+        depth_mask = np.stack((input_depthmap, input_mask), axis=-1)
+        H, W = input_depthmap.shape
+
+        camera_intrinsics = camera_intrinsics.numpy()
+        cx, cy = camera_intrinsics[:2, 2].round().astype(int)
+        min_margin_x = min(cx, W - cx)
+        min_margin_y = min(cy, H - cy)
+
+        # the new window will be a rectangle of size (2*min_margin_x, 2*min_margin_y) centered on (cx,cy)
+        l, t = cx - min_margin_x, cy - min_margin_y
+        r, b = cx + min_margin_x, cy + min_margin_y
+        crop_bbox = (l, t, r, b)
+        input_rgb_image, depth_mask, input_camera_intrinsics = (
+            cropping.crop_image_depthmap(
+                input_rgb_image, depth_mask, camera_intrinsics, crop_bbox
+            )
+        )
+
+        # try to set the lower dimension to img_size * 3/4 -> img_size=512 => 384
+        scale_final = ((img_size * 3 // 4) / min(H, W)) + 1e-8
+        output_resolution = np.floor(np.array([W, H]) * scale_final).astype(int)
+        if max(output_resolution) < img_size:
+            # let's put the max dimension to img_size
+            scale_final = (img_size / max(H, W)) + 1e-8
+            output_resolution = np.floor(np.array([W, H]) * scale_final).astype(int)
+
+        input_rgb_image, depth_mask, input_camera_intrinsics = (
+            cropping.rescale_image_depthmap(
+                input_rgb_image, depth_mask, input_camera_intrinsics, output_resolution
+            )
+        )
+        input_depthmap = depth_mask[:, :, 0]
+        input_mask = depth_mask[:, :, 1]
+
+        # generate and adjust camera pose
+        camera_pose = np.eye(4, dtype=np.float32)
+        camera_pose[:3, :3] = R
+        camera_pose[:3, 3] = tvec
+        camera_pose = np.linalg.inv(camera_pose)
+
+        # save crop images and depth, metadata
+        save_img_path = os.path.join(output_dir, filepath)
+        save_depth_path = os.path.join(output_dir, frame_data["depth"]["path"])
+        save_mask_path = os.path.join(output_dir, mask_path)
+        os.makedirs(os.path.split(save_img_path)[0], exist_ok=True)
+        os.makedirs(os.path.split(save_depth_path)[0], exist_ok=True)
+        os.makedirs(os.path.split(save_mask_path)[0], exist_ok=True)
+
+        input_rgb_image.save(save_img_path)
+        # Depth is stored as uint16 normalised by the frame's maximum depth;
+        # the maximum is saved in the .npz so the scale can be recovered.
+        scaled_depth_map = (input_depthmap / np.max(input_depthmap) * 65535).astype(
+            np.uint16
+        )
+        cv2.imwrite(save_depth_path, scaled_depth_map)
+        cv2.imwrite(save_mask_path, (input_mask * 255).astype(np.uint8))
+
+        # NOTE(review): replace() rewrites every "jpg" in the path, including
+        # directory names - confirm that output paths never contain "jpg"
+        # outside the extension.
+        save_meta_path = save_img_path.replace("jpg", "npz")
+        np.savez(
+            save_meta_path,
+            camera_intrinsics=input_camera_intrinsics,
+            camera_pose=camera_pose,
+            maximum_depth=np.max(input_depthmap),
+        )
+
+    return selected_sequences_numbers_dict
+
+
+# Script entry point. Work is resumable: a split whose summary file exists is
+# skipped, and a category whose per-category file exists is loaded, not redone.
+if __name__ == "__main__":
+    args = get_parser().parse_args()
+    assert args.co3d_dir != args.output_dir
+    if args.category is not None:
+        categories = [args.category]
+    elif args.single_sequence_subset:
+        categories = SINGLE_SEQUENCE_CATEGORIES
+    else:
+        categories = CATEGORIES
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    for split in ("train", "test"):
+        selected_sequences_path = os.path.join(
+            args.output_dir, f"selected_seqs_{split}.json"
+        )
+        if os.path.isfile(selected_sequences_path):
+            continue
+
+        all_selected_sequences = {}
+        for category in categories:
+            category_output_dir = osp.join(args.output_dir, category)
+            os.makedirs(category_output_dir, exist_ok=True)
+            category_selected_sequences_path = os.path.join(
+                category_output_dir, f"selected_seqs_{split}.json"
+            )
+            if not os.path.isfile(category_selected_sequences_path):
+                print(f"Processing {split} - category = {category}")
+                category_selected_sequences = prepare_sequences(
+                    category=category,
+                    co3d_dir=args.co3d_dir,
+                    output_dir=args.output_dir,
+                    img_size=args.img_size,
+                    split=split,
+                    min_quality=args.min_quality,
+                    max_num_sequences_per_object=args.num_sequences_per_object,
+                    # A distinct seed per category keeps the sampling
+                    # independent of which categories are processed.
+                    seed=args.seed + CATEGORIES_IDX[category],
+                    is_single_sequence_subset=args.single_sequence_subset,
+                )
+                with open(category_selected_sequences_path, "w") as file:
+                    json.dump(category_selected_sequences, file)
+            else:
+                with open(category_selected_sequences_path, "r") as fid:
+                    category_selected_sequences = json.load(fid)
+
+            all_selected_sequences[category] = category_selected_sequences
+        with open(selected_sequences_path, "w") as file:
+            json.dump(all_selected_sequences, file)
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_cop3d.py b/extern/CUT3R/datasets_preprocess/preprocess_cop3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9d3203ba85a2471accb0e85fdf4bd29660c67e4
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_cop3d.py
@@ -0,0 +1,322 @@
+#!/usr/bin/env python3
+
+# --------------------------------------------------------
+# Script to pre-process the COP3D dataset.
+# Usage:
+#   python3 preprocess_cop3d.py --cop3d_dir /path/to/cop3d \
+#       --output_dir /path/to/processed_cop3d
+# --------------------------------------------------------
+
+import argparse
+import random
+import gzip
+import json
+import os
+import os.path as osp
+
+import torch
+import PIL.Image
+import numpy as np
+import cv2
+
+from tqdm.auto import tqdm
+import matplotlib.pyplot as plt
+
+import src.dust3r.datasets.utils.cropping as cropping
+
+# Object categories handled by this script; a category's position in the list
+# is added to the base seed for that category.
+CATEGORIES = ["cat", "dog"]
+CATEGORIES_IDX = {name: position for position, name in enumerate(CATEGORIES)}
+
+
+def get_parser():
+    """Set up the argument parser for the COP3D preprocessing script.
+
+    Returns:
+        An ``argparse.ArgumentParser`` with ``--output_dir``, ``--cop3d_dir``,
+        ``--seed``, ``--min_quality`` and ``--img_size``. Both directory flags
+        default to the empty string.
+    """
+    # The description and help strings name COP3D: this script and its
+    # --cop3d_dir flag handle COP3D, not CO3D.
+    parser = argparse.ArgumentParser(
+        description="Preprocess the COP3D dataset and output processed images, masks, and metadata."
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="",
+        help="Output directory for processed COP3D data.",
+    )
+    parser.add_argument(
+        "--cop3d_dir",
+        type=str,
+        default="",
+        help="Directory containing the raw COP3D data.",
+    )
+    parser.add_argument(
+        "--seed", type=int, default=42, help="Random seed for reproducibility."
+    )
+    parser.add_argument(
+        "--min_quality",
+        type=float,
+        default=0.5,
+        help="Minimum viewpoint quality score.",
+    )
+    parser.add_argument(
+        "--img_size",
+        type=int,
+        default=512,
+        help=(
+            "Lower dimension will be >= img_size * 3/4, and max dimension will be >= img_size"
+        ),
+    )
+    return parser
+
+
+def convert_ndc_to_pinhole(focal_length, principal_point, image_size):
+    """Turn NDC focal length and principal point into a 3x3 float32 matrix.
+
+    ``image_size`` is read as ``(height, width)``. One NDC unit corresponds to
+    half of the smaller image side, and the principal point is measured from
+    the image centre.
+    """
+    half_wh = np.array([image_size[1], image_size[0]]) / 2
+    unit_px = half_wh.min()
+    fx, fy = np.array(focal_length) * unit_px
+    cx, cy = half_wh - np.array(principal_point) * unit_px
+
+    K = np.eye(3, dtype=np.float32)
+    K[0, 0], K[1, 1] = fx, fy
+    K[0, 2], K[1, 2] = cx, cy
+    return K
+
+
+def opencv_from_cameras_projection(R, T, focal, p0, image_size):
+    """
+    Convert one camera's NDC-style projection parameters to pixel-space ones.
+
+    The NumPy inputs are wrapped as tensors with a leading batch dimension of
+    one; rotation and translation are cloned, so the inputs are left intact.
+
+    Returns:
+        R, tvec, camera_matrix: the rotation with its first two columns negated
+        and then transposed, the translation with its first two components
+        negated, and the 3x3 intrinsic matrix in pixels.
+    """
+    rot = torch.from_numpy(R).unsqueeze(0).clone()
+    tvec = torch.from_numpy(T).unsqueeze(0).clone()
+    focal_ndc = torch.from_numpy(focal).unsqueeze(0)
+    pp_ndc = torch.from_numpy(p0).unsqueeze(0)
+    size_hw = torch.from_numpy(image_size).unsqueeze(0)
+
+    # Negate the x/y axes, then transpose the rotation.
+    tvec[:, :2] *= -1
+    rot[:, :, :2] *= -1
+    rot = rot.permute(0, 2, 1)
+
+    # Retype image_size (flip to width, height).
+    size_wh = size_hw.to(rot).flip(dims=(1,))
+
+    # One NDC unit is half of the smaller image side.
+    unit_px = (size_wh.to(rot).min(dim=1, keepdim=True)[0] / 2.0).expand(-1, 2)
+    centre = size_wh / 2.0
+    pp_px = -pp_ndc * unit_px + centre
+    focal_px = focal_ndc * unit_px
+
+    camera_matrix = torch.zeros_like(rot)
+    camera_matrix[:, 0, 0] = focal_px[:, 0]
+    camera_matrix[:, 1, 1] = focal_px[:, 1]
+    camera_matrix[:, :2, 2] = pp_px
+    camera_matrix[:, 2, 2] = 1.0
+    return rot[0], tvec[0], camera_matrix[0]
+
+
+def get_set_list(category_dir, split):
+    """Return the ``split`` entries of a category's set-list files.
+
+    Files whose name contains ``manyview`` are used; when there are none, files
+    whose name contains ``fewview`` are used instead. Entries are concatenated
+    in directory-listing order.
+    """
+    set_lists_dir = osp.join(category_dir, "set_lists")
+    available = os.listdir(set_lists_dir)
+    chosen = [name for name in available if "manyview" in name]
+    if not chosen:
+        chosen = [name for name in available if "fewview" in name]
+
+    sequences_all = []
+    for list_name in chosen:
+        with open(osp.join(set_lists_dir, list_name)) as f:
+            sequences_all.extend(json.load(f)[split])
+    return sequences_all
+
+
+def prepare_sequences(
+    category, cop3d_dir, output_dir, img_size, split, min_quality, seed
+):
+    """
+    Process sequences for a given category and split.
+
+    Loads the per-frame and per-sequence annotations, keeps the sequences
+    whose ``viewpoint_quality_score`` exceeds ``min_quality``, crops every
+    frame around its principal point, rescales it, and writes the image, the
+    mask and an ``.npz`` (``camera_intrinsics``, ``camera_pose``) under
+    ``output_dir``.
+
+    Args:
+        category: category name (sub-directory of ``cop3d_dir``).
+        cop3d_dir: root of the raw dataset.
+        output_dir: root of the processed dataset.
+        img_size: target size; see the rescaling step in the loop.
+        split: split key looked up in the set-list files.
+        min_quality: exclusive lower bound on the viewpoint quality score.
+        seed: seed passed to ``random.seed``.
+
+    Returns a dictionary mapping sequence names to lists of selected frame indices.
+    """
+    # Seeds the global RNG; no sampling happens in this function itself.
+    random.seed(seed)
+    category_dir = osp.join(cop3d_dir, category)
+    sequences_all = get_set_list(category_dir, split)
+
+    # Get unique sequence names.
+    sequences_numbers = sorted(set(seq_name for seq_name, _, _ in sequences_all))
+
+    # Load frame and sequence annotation files.
+    frame_file = osp.join(category_dir, "frame_annotations.jgz")
+    sequence_file = osp.join(category_dir, "sequence_annotations.jgz")
+
+    with gzip.open(frame_file, "r") as fin:
+        frame_data = json.loads(fin.read())
+    with gzip.open(sequence_file, "r") as fin:
+        sequence_data = json.loads(fin.read())
+
+    # Organize frame annotations per sequence.
+    frame_data_processed = {}
+    for f_data in frame_data:
+        sequence_name = f_data["sequence_name"]
+        frame_data_processed.setdefault(sequence_name, {})[
+            f_data["frame_number"]
+        ] = f_data
+
+    # Select sequences with quality above the threshold.
+    good_quality_sequences = set()
+    for seq_data in sequence_data:
+        if seq_data["viewpoint_quality_score"] > min_quality:
+            good_quality_sequences.add(seq_data["sequence_name"])
+    sequences_numbers = [
+        seq_name for seq_name in sequences_numbers if seq_name in good_quality_sequences
+    ]
+    selected_sequences_numbers = sequences_numbers
+    selected_sequences_numbers_dict = {
+        seq_name: [] for seq_name in selected_sequences_numbers
+    }
+
+    # Filter frames to only those from selected sequences.
+    sequences_all = [
+        (seq_name, frame_number, filepath)
+        for seq_name, frame_number, filepath in sequences_all
+        if seq_name in selected_sequences_numbers_dict
+    ]
+
+    # Process each frame.
+    for seq_name, frame_number, filepath in tqdm(
+        sequences_all, desc="Processing frames"
+    ):
+        # The frame index is the file name minus its first 5 characters and
+        # its 4-character extension.
+        frame_idx = int(filepath.split("/")[-1][5:-4])
+        selected_sequences_numbers_dict[seq_name].append(frame_idx)
+        mask_path = filepath.replace("images", "masks").replace(".jpg", ".png")
+        frame_data_entry = frame_data_processed[seq_name][frame_number]
+        focal_length = frame_data_entry["viewpoint"]["focal_length"]
+        principal_point = frame_data_entry["viewpoint"]["principal_point"]
+        image_size = frame_data_entry["image"]["size"]
+        R, tvec, camera_intrinsics = opencv_from_cameras_projection(
+            np.array(frame_data_entry["viewpoint"]["R"]),
+            np.array(frame_data_entry["viewpoint"]["T"]),
+            np.array(focal_length),
+            np.array(principal_point),
+            np.array(image_size),
+        )
+
+        # Load input image and mask.
+        image_path = osp.join(cop3d_dir, filepath)
+        mask_path_full = osp.join(cop3d_dir, mask_path)
+        input_rgb_image = PIL.Image.open(image_path).convert("RGB")
+        input_mask = plt.imread(mask_path_full)
+        # NOTE(review): this unpacking requires a 2-D (single-channel) mask - confirm.
+        H, W = input_mask.shape
+
+        # Largest window centred on the principal point that fits the image.
+        camera_intrinsics = camera_intrinsics.numpy()
+        cx, cy = camera_intrinsics[:2, 2].round().astype(int)
+        min_margin_x = min(cx, W - cx)
+        min_margin_y = min(cy, H - cy)
+        l, t = cx - min_margin_x, cy - min_margin_y
+        r, b = cx + min_margin_x, cy + min_margin_y
+        crop_bbox = (l, t, r, b)
+
+        # Crop the image, mask, and adjust intrinsics.
+        input_rgb_image, input_mask, input_camera_intrinsics = (
+            cropping.crop_image_depthmap(
+                input_rgb_image, input_mask, camera_intrinsics, crop_bbox
+            )
+        )
+        # Aim for a short side of img_size * 3/4; if the long side would then
+        # be below img_size, scale so the long side reaches img_size instead.
+        scale_final = ((img_size * 3 // 4) / min(H, W)) + 1e-8
+        output_resolution = np.floor(np.array([W, H]) * scale_final).astype(int)
+        if max(output_resolution) < img_size:
+            scale_final = (img_size / max(H, W)) + 1e-8
+            output_resolution = np.floor(np.array([W, H]) * scale_final).astype(int)
+        input_rgb_image, input_mask, input_camera_intrinsics = (
+            cropping.rescale_image_depthmap(
+                input_rgb_image, input_mask, input_camera_intrinsics, output_resolution
+            )
+        )
+
+        # Generate and adjust camera pose.
+        camera_pose = np.eye(4, dtype=np.float32)
+        camera_pose[:3, :3] = R
+        camera_pose[:3, 3] = tvec
+        camera_pose = np.linalg.inv(camera_pose)
+
+        # Save processed image and mask.
+        save_img_path = osp.join(output_dir, filepath)
+        save_mask_path = osp.join(output_dir, mask_path)
+        os.makedirs(osp.split(save_img_path)[0], exist_ok=True)
+        os.makedirs(osp.split(save_mask_path)[0], exist_ok=True)
+        input_rgb_image.save(save_img_path)
+        cv2.imwrite(save_mask_path, (input_mask * 255).astype(np.uint8))
+
+        # Save metadata (intrinsics and pose).
+        # NOTE(review): replace() rewrites every "jpg" in the path, including
+        # directory names - confirm that output paths never contain "jpg"
+        # outside the extension.
+        save_meta_path = save_img_path.replace("jpg", "npz")
+        np.savez(
+            save_meta_path,
+            camera_intrinsics=input_camera_intrinsics,
+            camera_pose=camera_pose,
+        )
+
+    return selected_sequences_numbers_dict
+
+
+def main():
+    """Preprocess every category for the train and test splits.
+
+    Work is resumable: a split whose summary file already exists is skipped,
+    and a category whose per-category file exists is loaded instead of being
+    reprocessed.
+    """
+    args = get_parser().parse_args()
+    assert (
+        args.cop3d_dir != args.output_dir
+    ), "Input and output directories must differ."
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    # Process each split separately.
+    for split in ("train", "test"):
+        summary_path = osp.join(args.output_dir, f"selected_seqs_{split}.json")
+        if os.path.isfile(summary_path):
+            continue
+
+        per_category = {}
+        for category in CATEGORIES:
+            category_output_dir = osp.join(args.output_dir, category)
+            os.makedirs(category_output_dir, exist_ok=True)
+            cache_path = osp.join(category_output_dir, f"selected_seqs_{split}.json")
+
+            if not os.path.isfile(cache_path):
+                print(f"Processing {split} - category = {category}")
+                selected = prepare_sequences(
+                    category=category,
+                    cop3d_dir=args.cop3d_dir,
+                    output_dir=args.output_dir,
+                    img_size=args.img_size,
+                    split=split,
+                    min_quality=args.min_quality,
+                    seed=args.seed + CATEGORIES_IDX[category],
+                )
+                with open(cache_path, "w") as file:
+                    json.dump(selected, file)
+            else:
+                with open(cache_path, "r") as fid:
+                    selected = json.load(fid)
+
+            per_category[category] = selected
+
+        with open(summary_path, "w") as file:
+            json.dump(per_category, file)
+
+
+# Run main() only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_dl3dv.py b/extern/CUT3R/datasets_preprocess/preprocess_dl3dv.py
new file mode 100644
index 0000000000000000000000000000000000000000..434c6ce19bc569662515969e5e389e43b5c8e73f
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_dl3dv.py
@@ -0,0 +1,188 @@
+import argparse
+import random
+import gzip
+import json
+import os
+import sys
+
+import os.path as osp
+
+import torch
+import PIL.Image
+from PIL import Image
+import numpy as np
+import cv2
+
+from tqdm import tqdm
+import matplotlib.pyplot as plt
+import shutil
+from read_write_model import run
+
+import torch
+import torchvision
+
+
+def get_parser():
+    """Build the command-line parser for the DL3DV preprocessing script.
+
+    Returns:
+        argparse.ArgumentParser: parser exposing ``--dl3dv_dir`` and
+        ``--output_dir``, both optional with relative-path defaults.
+    """
+    import argparse
+
+    cli = argparse.ArgumentParser()
+    # TODO (carried over from the original): both defaults are marked as
+    # still to be settled.
+    for flag, fallback in (
+        ("--dl3dv_dir", "../DL3DV-Dense/3K/"),
+        ("--output_dir", "../processed_dl3dv/3K/"),
+    ):
+        cli.add_argument(flag, default=fallback)
+    return cli
+
+
+from scipy.spatial.transform import Rotation as R
+
+
+def read_array(path):
+    """Read a float32 array stored with a ``width&height&channels&`` header.
+
+    The file starts with three ASCII integers, each followed by ``&``; the
+    rest of the file is raw float32 data laid out in Fortran (column-major)
+    order for a ``(width, height, channels)`` array.
+
+    Args:
+        path: Path of the file to read.
+
+    Returns:
+        numpy.ndarray: the data transposed to ``(height, width, channels)``
+        with singleton dimensions squeezed away.
+
+    Raises:
+        ValueError: if the file ends before the third ``&`` delimiter, if a
+            header field is not an integer, or if the payload size does not
+            match the header.
+    """
+    with open(path, "rb") as fid:
+        header = b""
+        num_delimiter = 0
+        while num_delimiter < 3:
+            byte = fid.read(1)
+            if not byte:
+                # read() returns b"" at EOF; without this check a truncated
+                # file would keep the scan spinning forever.
+                raise ValueError(f"Truncated array header in {path}")
+            if byte == b"&":
+                num_delimiter += 1
+            header += byte
+        width, height, channels = (int(v) for v in header.split(b"&")[:3])
+        # The stream is now positioned right after the third delimiter.
+        array = np.fromfile(fid, np.float32)
+    array = array.reshape((width, height, channels), order="F")
+    return np.transpose(array, (1, 0, 2)).squeeze()
+
+
+def main(rootdir, outdir):
+    """Export RGB frames and camera files for every DL3DV sub-sequence.
+
+    For each ``<rootdir>/<env>/dense*`` directory that contains both a
+    ``sparse`` and an ``images`` folder, ``read_write_model.run`` is called
+    on the sparse model, ``cameras.txt`` / ``images.txt`` are parsed, and
+    every frame is written to ``<outdir>/<env>/<subseq>/rgb/`` together
+    with a ``cam/<name>.npz`` holding ``intrinsic`` (3x3) and ``pose``
+    (4x4, the inverse of the matrix built from the images.txt entry).
+
+    Depth export is currently disabled, so only the image file is required
+    for a frame to be processed.
+
+    Args:
+        rootdir: Root directory of the raw DL3DV data.
+        outdir: Root directory for the processed output; created if missing.
+    """
+    os.makedirs(outdir, exist_ok=True)
+
+    envs = [f for f in os.listdir(rootdir) if os.path.isdir(osp.join(rootdir, f))]
+    for env in tqdm(envs):
+        subseqs = [
+            f
+            for f in os.listdir(osp.join(rootdir, env))
+            if os.path.isdir(osp.join(rootdir, env, f)) and f.startswith("dense")
+        ]
+        for subseq in subseqs:
+            sparse_dir = osp.join(rootdir, env, subseq, "sparse")
+            images_dir = osp.join(rootdir, env, subseq, "images")
+            if not os.path.exists(sparse_dir) or not os.path.exists(images_dir):
+                continue
+            intrins_file = sparse_dir + "/cameras.txt"
+            poses_file = sparse_dir + "/images.txt"
+            # NOTE(review): a sub-sequence whose text model already exists is
+            # skipped entirely, so a re-run never revisits it (the per-frame
+            # "Exist!" check below is then unreachable for it). Confirm this
+            # is the intended resume behaviour.
+            if os.path.exists(intrins_file) and os.path.exists(poses_file):
+                continue
+            # Presumably converts the model in sparse_dir to the text files
+            # read below -- verify against read_write_model.run.
+            run(sparse_dir, sparse_dir)
+
+            # cameras.txt: one camera per non-comment line. Tokens 4..7 are
+            # read as fx, fy, cx, cy (assumes a camera model with that
+            # parameter order -- TODO confirm).
+            cam_params = {}
+            with open(intrins_file, "r") as f:
+                for line in f:
+                    if line.startswith("#"):
+                        continue
+                    parts = line.strip().split()
+                    if len(parts) == 0:
+                        continue
+                    cam_id = int(parts[0])
+                    fx = float(parts[4])
+                    fy = float(parts[5])
+                    cx = float(parts[6])
+                    cy = float(parts[7])
+                    cam_params[cam_id] = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]])
+
+            poses = []
+            images = []
+            intrinsics = []
+
+            with open(poses_file, "r") as f:
+                for line in f:
+                    if line.startswith("#"):
+                        continue
+                    parts = line.strip().split()
+                    if len(parts) == 0:
+                        continue
+                    # Lines whose first token contains "." are not image
+                    # header lines (image ids are integers); skip them.
+                    if "." in parts[0]:
+                        continue
+
+                    img_name = parts[-1]
+                    w, x, y, z = map(float, parts[1:5])
+                    # Rotation matrix from the quaternion (w, x, y, z).
+                    rot = np.array(
+                        [
+                            [
+                                1 - 2 * y * y - 2 * z * z,
+                                2 * x * y - 2 * z * w,
+                                2 * x * z + 2 * y * w,
+                            ],
+                            [
+                                2 * x * y + 2 * z * w,
+                                1 - 2 * x * x - 2 * z * z,
+                                2 * y * z - 2 * x * w,
+                            ],
+                            [
+                                2 * x * z - 2 * y * w,
+                                2 * y * z + 2 * x * w,
+                                1 - 2 * x * x - 2 * y * y,
+                            ],
+                        ]
+                    )
+                    tx, ty, tz = map(float, parts[5:8])
+                    cam_id = int(parts[-2])
+                    pose = np.eye(4)
+                    pose[:3, :3] = rot
+                    pose[:3, 3] = [tx, ty, tz]
+                    # Store the inverse of the transform listed in the file.
+                    poses.append(np.linalg.inv(pose))
+                    images.append(img_name)
+                    intrinsics.append(cam_params[cam_id])
+
+            seq_outdir = osp.join(outdir, env, subseq)
+            os.makedirs(seq_outdir, exist_ok=True)
+            os.makedirs(osp.join(seq_outdir, "rgb"), exist_ok=True)
+            os.makedirs(osp.join(seq_outdir, "cam"), exist_ok=True)
+
+            for img_name, intrinsic, pose in zip(tqdm(images), intrinsics, poses):
+                basename = img_name.split("/")[-1]
+                cam_path = osp.join(
+                    seq_outdir, "cam", basename.replace(".png", ".npz")
+                )
+                if os.path.exists(cam_path):
+                    print("Exist!")
+                    continue
+                img_path = os.path.join(images_dir, img_name)
+                # Only the image is required: the depth path used to be
+                # checked here as well, but it is no longer defined because
+                # depth export is disabled (that check raised NameError).
+                if not os.path.exists(img_path):
+                    continue
+                try:
+                    rgb = Image.open(img_path)
+                except Exception as exc:
+                    # Best effort: report the unreadable image and move on.
+                    print(f"Skipping unreadable image {img_path}: {exc}")
+                    continue
+
+                # save all
+                with rgb:
+                    rgb.save(osp.join(seq_outdir, "rgb", basename))
+                np.savez(cam_path, intrinsic=intrinsic, pose=pose)
+
+
+# Script entry point: parse the CLI options and run the conversion.
+if __name__ == "__main__":
+    parser = get_parser()
+    args = parser.parse_args()
+    main(args.dl3dv_dir, args.output_dir)
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_dynamic_replica.py b/extern/CUT3R/datasets_preprocess/preprocess_dynamic_replica.py
new file mode 100644
index 0000000000000000000000000000000000000000..90b64133f41da31790b459499fec1de8398b737d
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_dynamic_replica.py
@@ -0,0 +1,344 @@
+#!/usr/bin/env python3
+"""
+Preprocess the Dynamic Replica dataset.
+
+This script reads frame annotations (stored in compressed JSON files),
+loads images, depth maps, optical flow, and camera parameters, and saves
+processed images, depth maps, flow files, and camera metadata (intrinsics and poses)
+to an output directory organized by split, sequence, and camera view.
+
+Usage:
+    python preprocess_dynamic_replica.py --root_dir /path/to/data_dynamic_replica \
+                                           --out_dir /path/to/processed_dynamic_replica \
+                                           [--splits train valid test] \
+                                           [--num_processes 8]
+"""
+
+import argparse
+import gzip
+import json
+import os
+import os.path as osp
+import re
+import shutil
+import time
+from collections import defaultdict
+from dataclasses import dataclass
+from multiprocessing import Pool, cpu_count
+from typing import List, Optional
+
+import cv2
+import matplotlib.pyplot as plt
+import numpy as np
+import PIL.Image
+import torch
+from PIL import Image
+from pytorch3d.implicitron.dataset.types import (
+    FrameAnnotation as ImplicitronFrameAnnotation,
+    load_dataclass,
+)
+from tqdm import tqdm
+import imageio
+
+# Enable OpenEXR support in OpenCV.
+# NOTE(review): cv2 is already imported above when this assignment runs;
+# confirm OpenCV still honours the flag when it is set after the import.
+os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
+
+# Magic number that opens a Middlebury .flo file. readFlow() compares against
+# the same value written as a literal rather than using this constant.
+TAG_CHAR = np.array([202021.25], np.float32)
+
+
+def readFlow(fn):
+    """Read .flo file in Middlebury format.
+
+    Layout: one float32 magic number (202021.25), then width and height as
+    int32, then ``2 * width * height`` float32 values.
+
+    Args:
+        fn: Path of the ``.flo`` file.
+
+    Returns:
+        numpy.ndarray of shape ``(height, width, 2)``, or ``None`` (after
+        printing a message) when the magic number is missing or wrong, or
+        the header is truncated.
+    """
+    with open(fn, "rb") as f:
+        magic = np.fromfile(f, np.float32, count=1)
+        # An empty file yields a zero-length array, whose truth value is
+        # ambiguous; test the size explicitly before comparing.
+        if magic.size != 1 or magic[0] != 202021.25:
+            print("Magic number incorrect. Invalid .flo file")
+            return None
+        dims = np.fromfile(f, np.int32, count=2)
+        if dims.size != 2:
+            print("Truncated header. Invalid .flo file")
+            return None
+        # Index the elements instead of calling int() on 1-element arrays,
+        # which recent NumPy versions deprecate.
+        w, h = int(dims[0]), int(dims[1])
+        data = np.fromfile(f, np.float32, count=2 * w * h)
+        # np.resize (not reshape) is kept from the original: it pads by
+        # repetition if fewer values than expected were read.
+        return np.resize(data, (h, w, 2))
+
+
+def readPFM(file):
+    """Load a PFM image into a NumPy array.
+
+    The header is three text lines: ``PF`` (3 channels) or ``Pf``
+    (1 channel), then ``<width> <height>``, then a scale whose sign selects
+    the byte order (negative means little-endian). The magnitude of the
+    scale is parsed but not applied to the data.
+
+    Args:
+        file: Path of the PFM file.
+
+    Returns:
+        numpy.ndarray of shape ``(height, width, 3)`` or ``(height, width)``,
+        flipped vertically relative to the order stored in the file.
+
+    Raises:
+        Exception: if the magic line or the dimension line is malformed.
+    """
+    with open(file, "rb") as stream:
+        magic = stream.readline().rstrip()
+        if magic not in (b"PF", b"Pf"):
+            raise Exception("Not a PFM file.")
+        is_color = magic == b"PF"
+
+        dims = re.match(rb"^(\d+)\s(\d+)\s$", stream.readline())
+        if dims is None:
+            raise Exception("Malformed PFM header.")
+        width, height = (int(group) for group in dims.groups())
+
+        # Only the sign of the scale line matters here.
+        scale = float(stream.readline().rstrip())
+        byte_order = "<" if scale < 0 else ">"
+
+        raw = np.fromfile(stream, byte_order + "f")
+        target_shape = (height, width, 3) if is_color else (height, width)
+        return np.flipud(np.reshape(raw, target_shape))
+
+
+def read_gen(file_name, pil=False):
+    """Load a file with a reader chosen from its (case-insensitive) extension.
+
+    Args:
+        file_name: Path of the file to load.
+        pil: Accepted for interface compatibility; not used by this function.
+
+    Returns:
+        - ``.png`` / ``.jpeg`` / ``.ppm`` / ``.jpg``: the opened PIL image.
+        - ``.bin`` / ``.raw``: the result of ``np.load``.
+        - ``.flo``: the flow field from ``readFlow`` as float32.
+        - ``.pfm``: the ``readPFM`` result as float32; for 3-D data the last
+          channel is dropped.
+        - anything else: an empty list.
+    """
+    suffix = osp.splitext(file_name)[-1].lower()
+
+    if suffix in (".png", ".jpeg", ".ppm", ".jpg"):
+        return Image.open(file_name)
+
+    if suffix in (".bin", ".raw"):
+        return np.load(file_name)
+
+    if suffix == ".flo":
+        return readFlow(file_name).astype(np.float32)
+
+    if suffix == ".pfm":
+        pfm = readPFM(file_name).astype(np.float32)
+        if len(pfm.shape) == 2:
+            return pfm
+        return pfm[:, :, :-1]
+
+    return []
+
+
+def _load_16big_png_depth(depth_png):
+    """Decode a depth map whose float16 values are stored in a 16-bit PNG.
+
+    Args:
+        depth_png: Path of the PNG file.
+
+    Returns:
+        numpy.ndarray (float32) of shape ``(size[1], size[0])`` of the image.
+    """
+    with Image.open(depth_png) as png:
+        cols, rows = png.size[0], png.size[1]
+        raw16 = np.array(png, dtype=np.uint16)
+        # Reinterpret the 16-bit integers bit-for-bit as float16 rather than
+        # converting their numeric value.
+        as_half = np.frombuffer(raw16, dtype=np.float16)
+        depth = as_half.astype(np.float32).reshape((rows, cols))
+    return depth
+
+
+@dataclass
+class DynamicReplicaFrameAnnotation(ImplicitronFrameAnnotation):
+    """A dataclass used to load annotations from .json for Dynamic Replica.
+
+    Extends the Implicitron frame annotation with the extra per-frame fields
+    below; each of them defaults to ``None``.
+    """
+
+    # Camera the frame belongs to; process_split_data groups frames by this
+    # value and looks up the "left" and "right" groups.
+    camera_name: Optional[str] = None
+    # Not read anywhere in this script.
+    instance_id_map_path: Optional[str] = None
+    # NOTE(review): annotated as Optional[str], but process_split_data indexes
+    # the four flow fields with ["path"], i.e. treats them as mappings --
+    # confirm the actual type produced by load_dataclass.
+    flow_forward: Optional[str] = None
+    flow_forward_mask: Optional[str] = None
+    flow_backward: Optional[str] = None
+    flow_backward_mask: Optional[str] = None
+    # Not read anywhere in this script.
+    trajectories: Optional[str] = None
+
+
+def _get_pytorch3d_camera(entry_viewpoint, image_size, scale: float):
+    """
+    Turn an annotation viewpoint into pixel-space intrinsics plus R and T.
+
+    Args:
+        entry_viewpoint: object with ``principal_point``, ``focal_length``,
+            ``intrinsics_format``, ``R`` and ``T`` attributes.
+        image_size: image size; it is reversed before use, so it is taken
+            in the opposite order to the returned (x, y) quantities.
+        scale: accepted but not used by this function.
+
+    Returns:
+        R, tvec, focal, principal_point -- ``R`` is the annotation rotation
+        as a float tensor (unchanged), ``tvec`` is ``T`` with its first two
+        components negated, and the last two are expressed in pixels.
+
+    Raises:
+        ValueError: if ``intrinsics_format`` is neither
+            ``ndc_norm_image_bounds`` nor ``ndc_isotropic`` (case-insensitive).
+    """
+    assert entry_viewpoint is not None
+    pp_ndc = torch.tensor(entry_viewpoint.principal_point, dtype=torch.float)
+    focal_ndc = torch.tensor(entry_viewpoint.focal_length, dtype=torch.float)
+    half_wh = torch.tensor(list(reversed(image_size)), dtype=torch.float) / 2.0
+
+    fmt = entry_viewpoint.intrinsics_format
+    layout = fmt.lower()
+    if layout not in ("ndc_norm_image_bounds", "ndc_isotropic"):
+        raise ValueError(f"Unknown intrinsics format: {fmt}")
+    # Per-axis half extent for image-bounds NDC, shorter half extent for
+    # isotropic NDC.
+    rescale = half_wh if layout == "ndc_norm_image_bounds" else half_wh.min()
+
+    pp_px = half_wh - pp_ndc * rescale
+    focal_px = focal_ndc * rescale
+
+    rotation = torch.tensor(entry_viewpoint.R, dtype=torch.float)
+    tvec = torch.tensor(entry_viewpoint.T, dtype=torch.float).clone()
+    tvec[..., :2] *= -1
+
+    # NOTE(review): a copy of the rotation with its first two columns negated
+    # is built here but never returned -- the caller receives the unmodified
+    # rotation together with the flipped translation. Confirm this is intended.
+    rotation_flipped = rotation.clone()
+    rotation_flipped[..., :, :2] *= -1
+
+    return rotation, tvec, focal_px, pp_px
+
+
+# Default list of dataset splits to process.
+SPLITS = ["train", "valid", "test"]
+# (Override it with --splits; --root_dir and --out_dir are required and have no default.)
+
+
+def process_split_data(args):
+    """
+    Process all frames for a given split.
+
+    Reads the frame annotation file for the given split, groups frames per sequence
+    and camera, and for each frame loads the image, depth map, optical flows (if available),
+    computes the camera intrinsics and pose (using _get_pytorch3d_camera), and saves the data.
+
+    Args:
+        args: tuple ``(split, root_dir, out_dir)``; packed into one argument
+            so the function can be mapped over a process pool.
+
+    Returns:
+        None. Output is written under ``<out_dir>/<split>/<sequence>/<camera>/``
+        in the sub-folders ``rgb``, ``depth``, ``flow_forward``,
+        ``flow_backward`` and ``cam``.
+
+    Raises:
+        AssertionError: if an image, depth or referenced flow file is missing.
+    """
+    split, root_dir, out_dir = args
+    split_dir = osp.join(root_dir, split)
+    # The frame annotations are stored in a compressed json file.
+    frame_annotations_file = osp.join(split_dir, f"frame_annotations_{split}.jgz")
+    with gzip.open(frame_annotations_file, "rt", encoding="utf8") as zipfile:
+        frame_annots_list = load_dataclass(zipfile, List[DynamicReplicaFrameAnnotation])
+
+    # Group frames by sequence and camera.
+    seq_annot = defaultdict(lambda: defaultdict(list))
+    for frame_annot in frame_annots_list:
+        seq_annot[frame_annot.sequence_name][frame_annot.camera_name].append(
+            frame_annot
+        )
+
+    # Process each sequence.
+    for seq_name in tqdm(seq_annot.keys(), desc=f"Processing split '{split}'"):
+        # For each camera (e.g., 'left', 'right'), create output directories.
+        # Only these two camera names are exported; frames grouped under any
+        # other camera_name are ignored.
+        for cam in ["left", "right"]:
+            out_img_dir = osp.join(out_dir, split, seq_name, cam, "rgb")
+            out_depth_dir = osp.join(out_dir, split, seq_name, cam, "depth")
+            out_fflow_dir = osp.join(out_dir, split, seq_name, cam, "flow_forward")
+            out_bflow_dir = osp.join(out_dir, split, seq_name, cam, "flow_backward")
+            out_cam_dir = osp.join(out_dir, split, seq_name, cam, "cam")
+            os.makedirs(out_img_dir, exist_ok=True)
+            os.makedirs(out_depth_dir, exist_ok=True)
+            os.makedirs(out_fflow_dir, exist_ok=True)
+            os.makedirs(out_bflow_dir, exist_ok=True)
+            os.makedirs(out_cam_dir, exist_ok=True)
+
+            for framedata in tqdm(
+                seq_annot[seq_name][cam], desc=f"Seq {seq_name} [{cam}]", leave=False
+            ):
+                # The frame timestamp is used as the stem of every output file.
+                timestamp = framedata.frame_timestamp
+                im_path = osp.join(split_dir, framedata.image.path)
+                depth_path = osp.join(split_dir, framedata.depth.path)
+                # Flow annotations are indexed like mappings; a falsy "path"
+                # means the frame has no flow in that direction, and the
+                # corresponding *_path variables are then left unset.
+                if framedata.flow_forward["path"]:
+                    flow_forward_path = osp.join(
+                        split_dir, framedata.flow_forward["path"]
+                    )
+                    flow_forward_mask_path = osp.join(
+                        split_dir, framedata.flow_forward_mask["path"]
+                    )
+                if framedata.flow_backward["path"]:
+                    flow_backward_path = osp.join(
+                        split_dir, framedata.flow_backward["path"]
+                    )
+                    flow_backward_mask_path = osp.join(
+                        split_dir, framedata.flow_backward_mask["path"]
+                    )
+
+                # Ensure required files exist.
+                assert os.path.isfile(im_path), im_path
+                assert os.path.isfile(depth_path), depth_path
+                if framedata.flow_forward["path"]:
+                    assert os.path.isfile(flow_forward_path), flow_forward_path
+                    assert os.path.isfile(
+                        flow_forward_mask_path
+                    ), flow_forward_mask_path
+                if framedata.flow_backward["path"]:
+                    assert os.path.isfile(flow_backward_path), flow_backward_path
+                    assert os.path.isfile(
+                        flow_backward_mask_path
+                    ), flow_backward_mask_path
+
+                viewpoint = framedata.viewpoint
+                # Load depth map.
+                depth = _load_16big_png_depth(depth_path)
+
+                # Process optical flow if available.
+                # Flow and mask are read unchanged with OpenCV and stored
+                # together in one .npz per frame and direction.
+                if framedata.flow_forward["path"]:
+                    flow_forward = cv2.imread(flow_forward_path, cv2.IMREAD_UNCHANGED)
+                    flow_forward_mask = cv2.imread(
+                        flow_forward_mask_path, cv2.IMREAD_UNCHANGED
+                    )
+                    np.savez(
+                        osp.join(out_fflow_dir, f"{timestamp}.npz"),
+                        flow=flow_forward,
+                        mask=flow_forward_mask,
+                    )
+                if framedata.flow_backward["path"]:
+                    flow_backward = cv2.imread(flow_backward_path, cv2.IMREAD_UNCHANGED)
+                    flow_backward_mask = cv2.imread(
+                        flow_backward_mask_path, cv2.IMREAD_UNCHANGED
+                    )
+                    np.savez(
+                        osp.join(out_bflow_dir, f"{timestamp}.npz"),
+                        flow=flow_backward,
+                        mask=flow_backward_mask,
+                    )
+
+                # Get camera parameters.
+                R, t, focal, pp = _get_pytorch3d_camera(
+                    viewpoint, framedata.image.size, scale=1.0
+                )
+                # 3x3 intrinsics: focal lengths on the diagonal, principal
+                # point in the last column.
+                intrinsics = np.eye(3)
+                intrinsics[0, 0] = focal[0].item()
+                intrinsics[1, 1] = focal[1].item()
+                intrinsics[0, 2] = pp[0].item()
+                intrinsics[1, 2] = pp[1].item()
+                pose = np.eye(4)
+                # Invert the camera pose.
+                # Closed-form inverse of a rigid transform: R^T and -R^T t.
+                pose[:3, :3] = R.numpy().T
+                pose[:3, 3] = -R.numpy().T @ t.numpy()
+
+                # Define output file paths.
+                out_img_path = osp.join(out_img_dir, f"{timestamp}.png")
+                out_depth_path = osp.join(out_depth_dir, f"{timestamp}.npy")
+                out_cam_path = osp.join(out_cam_dir, f"{timestamp}.npz")
+
+                # Copy RGB image.
+                shutil.copy(im_path, out_img_path)
+                # Save depth.
+                np.save(out_depth_path, depth)
+                # Save camera metadata.
+                np.savez(out_cam_path, intrinsics=intrinsics, pose=pose)
+    # (Optionally, you could return some summary information.)
+    return None
+
+
+def main():
+    """Parse the command line and process every requested split in a pool.
+
+    Each split becomes one ``(split, root_dir, out_dir)`` task handed to
+    ``process_split_data``; the output directory is created beforehand.
+    """
+    parser = argparse.ArgumentParser(
+        description="Preprocess Dynamic Replica dataset: convert raw annotations, images, "
+        "depth, and flow files to a processed format."
+    )
+    # The two directory options share the same shape: required strings.
+    for flag, tip in (
+        ("--root_dir", "Root directory of the Dynamic Replica data."),
+        ("--out_dir", "Output directory for processed data."),
+    ):
+        parser.add_argument(flag, type=str, required=True, help=tip)
+    parser.add_argument(
+        "--splits",
+        type=str,
+        nargs="+",
+        default=SPLITS,
+        help="List of splits to process (default: train valid test).",
+    )
+    parser.add_argument(
+        "--num_processes",
+        type=int,
+        default=cpu_count(),
+        help="Number of processes to use (default: number of CPU cores).",
+    )
+    args = parser.parse_args()
+
+    os.makedirs(args.out_dir, exist_ok=True)
+    tasks = []
+    for split in args.splits:
+        tasks.append((split, args.root_dir, args.out_dir))
+
+    print("Processing splits:", args.splits)
+    with Pool(processes=args.num_processes) as pool:
+        results = pool.imap(process_split_data, tasks)
+        # Drain the iterator so every task finishes before the pool closes.
+        for _ in tqdm(results, total=len(tasks), desc="Overall Progress"):
+            pass
+
+
+# Run the preprocessing only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_eden.py b/extern/CUT3R/datasets_preprocess/preprocess_eden.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2705c04ad9eb8dc54b81cb68722f5604984c327
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_eden.py
@@ -0,0 +1,181 @@
+#!/usr/bin/env python3
+"""
+Preprocess the Eden dataset.
+
+This script processes the Eden dataset by copying RGB images, converting depth
+data from .mat files to .npy format, and saving camera intrinsics from .mat files
+into a structured output directory. Files are processed in parallel using
+a ProcessPoolExecutor.
+
+Usage:
+    python preprocess_eden.py --root /path/to/data_raw_videos/data_eden \
+                              --out_dir /path/to/data_raw_videos/processed_eden \
+                              [--num_workers N]
+"""
+
+import os
+import shutil
+import scipy.io
+import numpy as np
+from tqdm import tqdm
+from concurrent.futures import ProcessPoolExecutor, as_completed
+import argparse
+
+
+def process_basename(args):
+    """
+    Export one Eden frame: copy its RGB image and save depth and intrinsics.
+
+    Only the left camera ("L") is handled. The frame is skipped when its
+    output camera file already exists, which makes re-runs resumable (the
+    camera file is the last of the three outputs to be written).
+
+    Parameters:
+        args (tuple): Contains (seq, basename, rgb_dir, depth_dir, cam_dir,
+                      out_rgb_dir, out_depth_dir, out_cam_dir)
+    Returns:
+        None on success (or when skipped), otherwise an error message string.
+    """
+    seq, basename = args[0], args[1]
+    (
+        _,
+        _,
+        rgb_dir,
+        depth_dir,
+        cam_dir,
+        out_rgb_dir,
+        out_depth_dir,
+        out_cam_dir,
+    ) = args
+
+    # Destination files for this frame.
+    out_img_path = os.path.join(out_rgb_dir, f"{basename}.png")
+    out_depth_path = os.path.join(out_depth_dir, f"{basename}.npy")
+    out_cam_path = os.path.join(out_cam_dir, f"{basename}.npz")
+
+    # Already exported on a previous run.
+    if os.path.exists(out_cam_path):
+        return None
+
+    try:
+        side = "L"
+        img_file = os.path.join(rgb_dir, f"{basename}_{side}.png")
+        depth_file = os.path.join(depth_dir, f"{basename}_{side}.mat")
+        cam_file = os.path.join(cam_dir, f"{basename}.mat")
+
+        # All three inputs must be present.
+        if not all(os.path.exists(p) for p in (img_file, depth_file, cam_file)):
+            return f"Missing files for {basename} in {seq}"
+
+        # Depth lives under the "Depth" key; keep its first channel only.
+        depth = scipy.io.loadmat(depth_file).get("Depth")
+        if depth is None:
+            return f"Depth data missing in {depth_file}"
+        depth = depth[..., 0]
+
+        # Intrinsics of the selected camera are stored as "K_<side>".
+        intrinsics = scipy.io.loadmat(cam_file).get(f"K_{side}")
+        if intrinsics is None:
+            return f"Intrinsics data missing in {cam_file}"
+
+        shutil.copyfile(img_file, out_img_path)
+        np.save(out_depth_path, depth)
+        np.savez(out_cam_path, intrinsics=intrinsics)
+
+    except Exception as e:
+        return f"Error processing {basename} in {seq}: {e}"
+
+    return None  # Indicate success.
+
+
+def main():
+    """Collect every Eden frame and export the frames with a process pool.
+
+    Sequences are the ``<scene>/<mode>`` directories found under
+    ``<root>/RGB``; frame names come from the ``.mat`` files in the matching
+    ``cam_matrix`` directory. Error messages returned by the workers are
+    printed as they arrive.
+    """
+    parser = argparse.ArgumentParser(
+        description="Preprocess Eden dataset: copy RGB images, process depth maps, and save camera intrinsics."
+    )
+    parser.add_argument(
+        "--root", type=str, default="", help="Root directory of the raw Eden data."
+    )
+    parser.add_argument(
+        "--out_dir",
+        type=str,
+        default="",
+        help="Output directory for processed Eden data.",
+    )
+    parser.add_argument(
+        "--num_workers",
+        type=int,
+        default=os.cpu_count(),
+        help="Number of worker processes to use.",
+    )
+    args = parser.parse_args()
+
+    # Modes typically found in the Eden dataset.
+    modes = ["clear", "cloudy", "overcast", "sunset", "twilight"]
+
+    rgb_root = os.path.join(args.root, "RGB")
+    depth_root = os.path.join(args.root, "Depth")
+    cam_root = os.path.join(args.root, "cam_matrix")
+
+    # Relative "<scene>/<mode>" paths of all sequences present under RGB.
+    seq_dirs = [
+        os.path.join(scene, mode)
+        for scene in os.listdir(rgb_root)
+        for mode in modes
+        if os.path.isdir(os.path.join(rgb_root, scene, mode))
+    ]
+
+    all_tasks = []
+    for seq in seq_dirs:
+        cam_dir = os.path.join(cam_root, seq)
+
+        # Flatten the sequence path into a single directory name.
+        flat_name = seq.replace(os.sep, "_")
+        out_dirs = tuple(
+            os.path.join(args.out_dir, flat_name, kind)
+            for kind in ("rgb", "depth", "cam")
+        )
+        for path in out_dirs:
+            os.makedirs(path, exist_ok=True)
+
+        # Frame names are the camera files without their ".mat" extension.
+        stems = sorted(
+            name[:-4] for name in os.listdir(cam_dir) if name.endswith(".mat")
+        )
+        all_tasks.extend(
+            (
+                seq,
+                stem,
+                os.path.join(rgb_root, seq),
+                os.path.join(depth_root, seq),
+                cam_dir,
+                *out_dirs,
+            )
+            for stem in stems
+        )
+
+    num_workers = args.num_workers
+    print(f"Processing {len(all_tasks)} tasks using {num_workers} workers...")
+    with ProcessPoolExecutor(max_workers=num_workers) as executor:
+        pending = {}
+        for task in all_tasks:
+            pending[executor.submit(process_basename, task)] = task[1]
+        progress = tqdm(
+            as_completed(pending), total=len(pending), desc="Processing tasks"
+        )
+        for done in progress:
+            message = done.result()
+            if message:
+                print(message)
+
+# Run the preprocessing only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_hoi4d.py b/extern/CUT3R/datasets_preprocess/preprocess_hoi4d.py
new file mode 100644
index 0000000000000000000000000000000000000000..480e3f822c766ed9c8af547c5b16d4e34ad66938
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_hoi4d.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python3
+"""
+HOI4D Preprocessing Script
+
+This script processes HOI4D data by:
+  1. Searching specific subdirectories for RGB and depth images.
+  2. Reading camera intrinsics from a .npy file (one per high-level scene).
+  3. Rescaling the RGB images and depth maps to a fixed output resolution
+     (e.g., 640x480) using the 'cropping' module.
+  4. Saving results (RGB, .npy depth, .npz camera intrinsics) in a new directory structure.
+
+Usage:
+    python preprocess_hoi4d.py \
+        --root_dir /path/to/HOI4D_release \
+        --cam_root /path/to/camera_params \
+        --out_dir /path/to/processed_hoi4d
+"""
+
+import os
+import glob
+import cv2
+import numpy as np
+from PIL import Image
+from tqdm import tqdm
+from concurrent.futures import ProcessPoolExecutor
+import argparse
+
+import src.dust3r.datasets.utils.cropping as cropping
+
+def parse_arguments():
+    """
+    Read the HOI4D preprocessing options from the command line.
+
+    ``--root_dir``, ``--cam_root`` and ``--out_dir`` are required;
+    ``--max_workers`` is an optional integer that defaults to ``None``.
+
+    Returns:
+        argparse.Namespace: The parsed arguments.
+    """
+    parser = argparse.ArgumentParser(
+        description="Preprocess HOI4D dataset by rescaling RGB and depth images."
+    )
+    required_paths = (
+        ("--root_dir", "Path to the HOI4D_release directory."),
+        ("--cam_root", "Path to the directory containing camera intrinsics."),
+        ("--out_dir", "Path to the directory where processed files will be saved."),
+    )
+    for flag, description in required_paths:
+        parser.add_argument(flag, required=True, help=description)
+    parser.add_argument(
+        "--max_workers",
+        type=int,
+        default=None,
+        help="Number of parallel workers. Default uses half of available CPU cores.",
+    )
+    return parser.parse_args()
+
+def process_image(args):
+    """
+    Rescale one RGB/depth pair and write the three output files.
+
+    The depth image is read with OpenCV, converted to float32 and divided
+    by 1000 before both images are passed to
+    ``cropping.rescale_image_depthmap`` with a target size of (640, 480).
+    The intrinsics are copied first, so the caller's array is not modified.
+
+    Args:
+        args (tuple): A tuple of:
+          (img_path, depth_path, out_img_path, out_depth_path, out_cam_path, intrinsics)
+
+    Returns:
+        None. Errors are printed to the console but do not stop the workflow.
+    """
+    img_path, depth_path, out_img_path, out_depth_path, out_cam_path, intrinsics = args
+
+    try:
+        rgb = Image.open(img_path)
+
+        raw_depth = cv2.imread(depth_path, cv2.IMREAD_ANYDEPTH)
+        if raw_depth is None:
+            raise ValueError(f"Could not read depth image: {depth_path}")
+        depth_m = raw_depth.astype(np.float32) / 1000.0
+
+        rescaled = cropping.rescale_image_depthmap(
+            rgb, depth_m, intrinsics.copy(), (640, 480)
+        )
+        rgb_out, depth_out, intrinsics_out = rescaled
+
+        rgb_out.save(out_img_path)
+        np.save(out_depth_path, depth_out)
+        np.savez(out_cam_path, intrinsics=intrinsics_out)
+
+    except Exception as e:
+        print(f"Error processing {img_path}: {e}")
+
+def main():
+    """Collect every HOI4D frame that still needs processing and rescale it.
+
+    Scenes are the directories matching
+    ``<root_dir>/ZY2021*/H*/C*/N*/S*/s*/T*``. For each scene the intrinsics
+    are loaded from ``<cam_root>/<first path component>/intrin.npy``; scenes
+    without that file are skipped with a warning. One task is created per
+    ``align_rgb/*.jpg`` frame whose three outputs do not all exist yet, and
+    the tasks are run through ``process_image`` in a process pool.
+    """
+    args = parse_arguments()
+
+    root = args.root_dir
+    cam_root = args.cam_root
+    out_dir = args.out_dir
+    # exist_ok makes a separate existence check unnecessary.
+    os.makedirs(out_dir, exist_ok=True)
+
+    # Collect a list of subdirectories using a glob pattern
+    # e.g.: root/ZY2021*/H*/C*/N*/S*/s*/T*
+    scene_dirs = glob.glob(os.path.join(root, "ZY2021*", "H*", "C*", "N*", "S*", "s*", "T*"))
+
+    # Build tasks
+    tasks = []
+    for scene_dir in tqdm(scene_dirs, desc="Collecting scenes"):
+        # Build an output sub-directory name
+        # Example: "ZY202101/H1/C1/N1/S1/s1/T1" -> "ZY202101_H1_C1_N1_S1_s1_T1"
+        scene_relpath = os.path.relpath(scene_dir, root)
+        scene_name = "_".join(scene_relpath.split(os.sep))
+
+        # Intrinsics are looked up by the first component of the scene path,
+        # e.g. "ZY202101" -> "cam_root/ZY202101/intrin.npy".
+        top_level = scene_relpath.split(os.sep)[0]
+        cam_file = os.path.join(cam_root, top_level, "intrin.npy")
+        if not os.path.isfile(cam_file):
+            print(f"Warning: Camera file not found: {cam_file}. Skipping {scene_dir}")
+            continue
+        intrinsics = np.load(cam_file)
+
+        # Directories for this sequence
+        rgb_dir = os.path.join(scene_dir, "align_rgb")
+        depth_dir = os.path.join(scene_dir, "align_depth")
+
+        # Output directories
+        out_rgb_dir = os.path.join(out_dir, scene_name, "rgb")
+        out_depth_dir = os.path.join(out_dir, scene_name, "depth")
+        out_cam_dir = os.path.join(out_dir, scene_name, "cam")
+        os.makedirs(out_rgb_dir, exist_ok=True)
+        os.makedirs(out_depth_dir, exist_ok=True)
+        os.makedirs(out_cam_dir, exist_ok=True)
+
+        # Find all image paths
+        img_paths = sorted(glob.glob(os.path.join(rgb_dir, "*.jpg")))
+
+        # Build tasks for each image
+        for img_path in img_paths:
+            basename = os.path.splitext(os.path.basename(img_path))[0]
+            depth_path = os.path.join(depth_dir, f"{basename}.png")
+
+            out_img_path = os.path.join(out_rgb_dir, f"{basename}.png")
+            out_depth_path = os.path.join(out_depth_dir, f"{basename}.npy")
+            out_cam_path = os.path.join(out_cam_dir, f"{basename}.npz")
+
+            # Skip if already processed
+            if (os.path.exists(out_img_path) and os.path.exists(out_depth_path) and
+                    os.path.exists(out_cam_path)):
+                continue
+
+            task = (
+                img_path,
+                depth_path,
+                out_img_path,
+                out_depth_path,
+                out_cam_path,
+                intrinsics
+            )
+            tasks.append(task)
+
+    # Process tasks in parallel
+    max_workers = args.max_workers
+    if max_workers is None:
+        # os.cpu_count() may return None when the count cannot be
+        # determined; fall back to a single core in that case.
+        max_workers = max(1, (os.cpu_count() or 1) // 2)
+
+    with ProcessPoolExecutor(max_workers=max_workers) as executor:
+        list(tqdm(
+            executor.map(process_image, tasks),
+            total=len(tasks),
+            desc="Processing images"
+        ))
+
+
+# Run the preprocessing only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_hypersim.py b/extern/CUT3R/datasets_preprocess/preprocess_hypersim.py
new file mode 100644
index 0000000000000000000000000000000000000000..c00fcd7fe903f74669c2d89f1d0828e19cb219b0
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_hypersim.py
@@ -0,0 +1,268 @@
+#!/usr/bin/env python3
+"""
+Preprocess the Hypersim dataset.
+
+This script reads camera parameters from a CSV file, converts an OpenGL-style
+projection matrix into a camera intrinsic matrix, applies tone mapping, and
+saves processed RGB images, depth maps, and camera metadata into an output
+directory. Processing is done per scene and per camera view.
+
+Usage:
+    python preprocess_hypersim.py --hypersim_dir /path/to/hypersim \
+                                  --output_dir /path/to/processed_hypersim
+"""
+
+import argparse
+import os
+import shutil
+import time
+
+import cv2
+import h5py
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from PIL import Image
+from tqdm import tqdm
+
+# Ensure OpenEXR support for OpenCV.
+os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
+
+
+def get_parser():
+    """Build the command-line parser (``--hypersim_dir``, ``--output_dir``)."""
+    summary = (
+        "Preprocess the Hypersim dataset by converting projection "
+        "matrices, applying tone mapping, and saving processed outputs."
+    )
+    cli = argparse.ArgumentParser(description=summary)
+    options = (
+        ("--hypersim_dir", "/path/to/hypersim",
+         "Root directory of the Hypersim dataset."),
+        ("--output_dir", "/path/to/processed_hypersim",
+         "Output directory for processed Hypersim data."),
+    )
+    for flag, default, help_text in options:
+        cli.add_argument(flag, default=default, help=help_text)
+    return cli
+
+
+def opengl_to_intrinsics(proj_matrix, width_pixels, height_pixels):
+    """Return the 3x3 pinhole matrix derived from an OpenGL-style projection matrix."""
+    fx = proj_matrix[0, 0] * width_pixels / 2.0
+    skew = -proj_matrix[0, 1] * width_pixels / 2.0  # entry [0, 1] enters with flipped sign
+    cx = (1.0 - proj_matrix[0, 2]) * width_pixels / 2.0
+    fy = proj_matrix[1, 1] * height_pixels / 2.0
+    cy = (1.0 + proj_matrix[1, 2]) * height_pixels / 2.0
+    return np.array([[fx, skew, cx], [0.0, fy, cy], [0.0, 0.0, 1.0]])
+
+
+def process_scene(args):  # Convert one scene: for every camera and frame, save tone-mapped RGB, a depth map and camera data.
+    rootdir, outdir, scene_name = args  # a single tuple argument: (rootdir, outdir, scene_name)
+    scene_outdir = os.path.join(outdir, scene_name)
+    os.makedirs(scene_outdir, exist_ok=True)
+    seq_dir = os.path.join(rootdir, scene_name)
+    seq_detail_dir = os.path.join(seq_dir, "_detail")
+    seq_images_dir = os.path.join(seq_dir, "images")
+
+    # Read this scene's row from the dataset-wide camera parameter CSV in rootdir.
+    all_metafile = os.path.join(rootdir, "metadata_camera_parameters.csv")
+    df_camera_parameters = pd.read_csv(all_metafile, index_col="scene_name")
+    df_ = df_camera_parameters.loc[scene_name]
+
+    width_pixels = int(df_["settings_output_img_width"])
+    height_pixels = int(df_["settings_output_img_height"])
+
+    M_proj = np.array(  # 4x4 projection matrix assembled from the M_proj_<row><col> columns
+        [
+            [df_["M_proj_00"], df_["M_proj_01"], df_["M_proj_02"], df_["M_proj_03"]],
+            [df_["M_proj_10"], df_["M_proj_11"], df_["M_proj_12"], df_["M_proj_13"]],
+            [df_["M_proj_20"], df_["M_proj_21"], df_["M_proj_22"], df_["M_proj_23"]],
+            [df_["M_proj_30"], df_["M_proj_31"], df_["M_proj_32"], df_["M_proj_33"]],
+        ]
+    )
+
+    camera_intrinsics = opengl_to_intrinsics(
+        M_proj, width_pixels, height_pixels
+    ).astype(np.float32)
+    if camera_intrinsics[0, 1] != 0:  # a non-zero skew term is not handled: the whole scene is skipped
+        print(f"camera_intrinsics[0, 1] != 0: {camera_intrinsics[0, 1]}")
+        return
+
+    # Read world scale (first value of metadata_scene.csv) and the camera IDs.
+    worldscale = (
+        pd.read_csv(
+            os.path.join(seq_detail_dir, "metadata_scene.csv"),
+            index_col="parameter_name",
+        )
+        .to_numpy()
+        .flatten()[0]
+        .astype(np.float32)
+    )
+    camera_ids = (
+        pd.read_csv(
+            os.path.join(seq_detail_dir, "metadata_cameras.csv"),
+            header=None,
+            skiprows=1,
+        )
+        .to_numpy()
+        .flatten()
+    )
+
+    # Tone mapping parameters.
+    gamma = 1.0 / 2.2  # Standard gamma correction exponent.
+    inv_gamma = 1.0 / gamma
+    percentile = 90  # Desired percentile brightness in the unmodified image.
+    brightness_nth_percentile_desired = 0.8  # Desired brightness after scaling.
+
+    for camera_id in camera_ids:
+        subscene_dir = os.path.join(scene_outdir, f"{camera_id}")
+        os.makedirs(subscene_dir, exist_ok=True)  # NOTE: created before the checks below, so skipped cameras leave an empty directory
+        camera_dir = os.path.join(seq_detail_dir, camera_id)
+        if not os.path.exists(camera_dir):
+            print(f"{camera_dir} does not exist.")
+            continue
+        color_dir = os.path.join(seq_images_dir, f"scene_{camera_id}_final_hdf5")
+        geometry_dir = os.path.join(seq_images_dir, f"scene_{camera_id}_geometry_hdf5")
+        if not (os.path.exists(color_dir) and os.path.exists(geometry_dir)):
+            print(f"{color_dir} or {geometry_dir} does not exist.")
+            continue
+
+        camera_positions_hdf5_file = os.path.join(
+            camera_dir, "camera_keyframe_positions.hdf5"
+        )
+        camera_orientations_hdf5_file = os.path.join(
+            camera_dir, "camera_keyframe_orientations.hdf5"
+        )
+
+        with h5py.File(camera_positions_hdf5_file, "r") as f:
+            camera_positions = f["dataset"][:]
+        with h5py.File(camera_orientations_hdf5_file, "r") as f:
+            camera_orientations = f["dataset"][:]
+
+        assert len(camera_positions) == len(
+            camera_orientations
+        ), f"len(camera_positions)={len(camera_positions)} != len(camera_orientations)={len(camera_orientations)}"
+
+        rgbs = sorted([f for f in os.listdir(color_dir) if f.endswith(".color.hdf5")])
+        depths = sorted(
+            [f for f in os.listdir(geometry_dir) if f.endswith(".depth_meters.hdf5")]
+        )
+        assert len(rgbs) == len(
+            depths
+        ), f"len(rgbs)={len(rgbs)} != len(depths)={len(depths)}"
+        exist_frame_ids = [int(f.split(".")[1]) for f in rgbs]  # frame id = second dot-separated token of the file name
+        valid_camera_positions = camera_positions[exist_frame_ids]  # keep only the keyframes that have a colour file
+        valid_camera_orientations = camera_orientations[exist_frame_ids]
+
+        for i, (rgb, depth) in enumerate(tqdm(zip(rgbs, depths), total=len(rgbs))):
+            frame_id = int(rgb.split(".")[1])
+            assert frame_id == int(
+                depth.split(".")[1]
+            ), f"frame_id={frame_id} != {int(depth.split('.')[1])}"
+            # Path of this frame's render_entity_id file; it provides the validity mask used for tone mapping.
+            render_entity = os.path.join(
+                geometry_dir,
+                depth.replace("depth_meters.hdf5", "render_entity_id.hdf5"),
+            )
+            with h5py.File(os.path.join(color_dir, rgb), "r") as f:
+                color = f["dataset"][:]
+            with h5py.File(os.path.join(geometry_dir, depth), "r") as f:
+                distance = f["dataset"][:]
+            R_cam2world = valid_camera_orientations[i]
+            R_cam2world = R_cam2world @ np.array([[1, 0, 0], [0, -1, 0], [0, 0, -1]])  # negate the camera's y and z axes
+            t_cam2world = valid_camera_positions[i] * worldscale  # positions are multiplied by the world scale
+            T_cam2world = np.eye(4)
+            T_cam2world[:3, :3] = R_cam2world
+            T_cam2world[:3, 3] = t_cam2world
+
+            if not np.isfinite(T_cam2world).all():
+                print(f"frame_id={frame_id} T_cam2world is not finite.")
+                continue
+
+            focal = (camera_intrinsics[0, 0] + camera_intrinsics[1, 1]) / 2.0  # mean of the two focal lengths
+            ImageplaneX = (  # per-pixel x offset from the image centre, shape (H, W, 1)
+                np.linspace(
+                    (-0.5 * width_pixels) + 0.5,
+                    (0.5 * width_pixels) - 0.5,
+                    width_pixels,
+                )
+                .reshape(1, width_pixels)
+                .repeat(height_pixels, 0)
+                .astype(np.float32)[:, :, None]
+            )
+            ImageplaneY = (  # per-pixel y offset from the image centre, shape (H, W, 1)
+                np.linspace(
+                    (-0.5 * height_pixels) + 0.5,
+                    (0.5 * height_pixels) - 0.5,
+                    height_pixels,
+                )
+                .reshape(height_pixels, 1)
+                .repeat(width_pixels, 1)
+                .astype(np.float32)[:, :, None]
+            )
+            ImageplaneZ = np.full([height_pixels, width_pixels, 1], focal, np.float32)
+            Imageplane = np.concatenate([ImageplaneX, ImageplaneY, ImageplaneZ], axis=2)
+
+            depth = distance / np.linalg.norm(Imageplane, axis=2) * focal  # presumably ray distance -> depth along the optical axis; NOTE: rebinds `depth` (was the file name)
+
+            with h5py.File(render_entity, "r") as f:
+                render_entity_id = f["dataset"][:].astype(np.int32)
+            assert (render_entity_id != 0).all()
+            valid_mask = render_entity_id != -1  # pixels with id -1 are excluded from the brightness statistics
+
+            if np.sum(valid_mask) == 0:
+                scale = 1.0  # If there are no valid pixels, set scale to 1.0.
+            else:
+                brightness = (
+                    0.3 * color[:, :, 0] + 0.59 * color[:, :, 1] + 0.11 * color[:, :, 2]
+                )
+                brightness_valid = brightness[valid_mask]
+                eps = 0.0001  # Avoid division by zero.
+                brightness_nth_percentile_current = np.percentile(
+                    brightness_valid, percentile
+                )
+                if brightness_nth_percentile_current < eps:
+                    scale = 0.0  # near-black frame: the saved RGB image becomes all zeros
+                else:
+                    scale = (
+                        np.power(brightness_nth_percentile_desired, inv_gamma)
+                        / brightness_nth_percentile_current
+                    )
+
+            color = np.power(np.maximum(scale * color, 0), gamma)  # scale, clamp negatives, then gamma-encode
+            color = np.clip(color, 0.0, 1.0)
+
+            out_rgb_path = os.path.join(subscene_dir, f"{frame_id:06d}_rgb.png")
+            Image.fromarray((color * 255).astype(np.uint8)).save(out_rgb_path)
+            out_depth_path = os.path.join(subscene_dir, f"{frame_id:06d}_depth.npy")
+            np.save(out_depth_path, depth.astype(np.float32))
+            out_cam_path = os.path.join(subscene_dir, f"{frame_id:06d}_cam.npz")
+            np.savez(  # camera file keys: "intrinsics" (3x3) and "pose" (4x4 camera-to-world)
+                out_cam_path,
+                intrinsics=camera_intrinsics,
+                pose=T_cam2world.astype(np.float32),
+            )
+
+
+def main():
+    """Run ``process_scene`` sequentially on every scene directory.
+
+    Scenes are the sub-directories of ``--hypersim_dir``, visited in sorted
+    order; ``--output_dir`` is created when missing.
+    """
+    args = get_parser().parse_args()
+    rootdir = args.hypersim_dir  # e.g., '/path/to/hypersim'
+    outdir = args.output_dir  # e.g., '/path/to/processed_hypersim'
+    os.makedirs(outdir, exist_ok=True)
+
+    # Only directories count as scenes; plain files in the root are ignored.
+    scenes = sorted(
+        f for f in os.listdir(rootdir) if os.path.isdir(os.path.join(rootdir, f))
+    )
+    for scene in scenes:
+        process_scene((rootdir, outdir, scene))
+
+
+if __name__ == "__main__":  # Script entry point: main() runs only on direct execution, not on import.
+    main()
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_irs.py b/extern/CUT3R/datasets_preprocess/preprocess_irs.py
new file mode 100644
index 0000000000000000000000000000000000000000..50cafb24679a9b86779aba8229823d8cde372e62
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_irs.py
@@ -0,0 +1,230 @@
+#!/usr/bin/env python3
+"""
+Preprocess the IRS dataset.
+
+This script converts disparity EXR files into depth maps, copies corresponding RGB images,
+and saves camera intrinsics computed from a given focal length and baseline. Processing is
+done per sequence directory using parallel processing.
+
+Usage:
+    python preprocess_irs.py
+       --root_dir /path/to/data_irs
+       --out_dir /path/to/processed_irs
+"""
+
+import os
+import shutil
+import re
+import glob
+import time
+from concurrent.futures import ProcessPoolExecutor, as_completed
+
+import numpy as np
+import OpenEXR
+import Imath
+import imageio
+from PIL import Image
+from tqdm import tqdm
+import argparse
+
+# Ensure OpenEXR support in OpenCV if needed.
+os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
+
+
+def exr2hdr(exrpath):
+    """
+    Read an OpenEXR file and return an HDR image as a NumPy array.
+
+    Files whose header lists more than one channel are read as "R", "G", "B";
+    otherwise only channel "G" is read. The result is a float32 array of
+    shape (height, width, 3) or (height, width, 1).
+    """
+    file = OpenEXR.InputFile(exrpath)
+    pixType = Imath.PixelType(Imath.PixelType.FLOAT)
+    header = file.header()
+    dw = header["dataWindow"]
+    channels = ["R", "G", "B"] if len(header["channels"]) > 1 else ["G"]
+
+    # Extent along each axis is max - min + 1 of the data window.
+    width = dw.max.x - dw.min.x + 1
+    height = dw.max.y - dw.min.y + 1
+
+    hdr = np.zeros((height, width, len(channels)), dtype=np.float32)
+    for idx, name in enumerate(channels):
+        # np.frombuffer replaces np.fromstring: fromstring's binary mode is
+        # deprecated, and the channel payload is raw bytes.
+        raw = file.channel(name, pixType)
+        plane = np.frombuffer(raw, dtype=np.float32)
+        hdr[:, :, idx] = np.reshape(plane, (height, width))
+    return hdr
+
+
+def writehdr(hdrpath, hdr):
+    """
+    Save ``hdr`` to ``hdrpath`` through imageio with ``format="hdr"``.
+    A single-channel input is first expanded to three identical channels.
+    """
+    _, _, depth = hdr.shape
+    if depth == 1:
+        # Append two channels, then broadcast channel 0 into both of them.
+        hdr = np.pad(hdr, ((0, 0), (0, 0), (0, 2)), "constant")
+        hdr[:, :, 1:] = hdr[:, :, :1]
+    imageio.imwrite(hdrpath, hdr, format="hdr")
+
+
+def load_exr(filename):
+    """
+    Load an EXR file via ``exr2hdr``; a single-channel result is returned as a
+    2-D (height, width) array, a multi-channel result is returned unchanged.
+    """
+    hdr = exr2hdr(filename)
+    # Drop only the channel axis so 1-pixel-high/wide images remain 2-D.
+    single_channel = hdr.shape[2] == 1
+    return np.squeeze(hdr, axis=2) if single_channel else hdr
+
+
+def process_basename(args):
+    """
+    Process a single basename:
+      - Read the size of the RGB image ``l_<basename>.png`` and load the
+        disparity map ``d_<basename>.exr``.
+      - Compute a depth map from disparity using: depth = (baseline * f) / disparity.
+      - Copy the RGB image and save the computed depth and camera intrinsics.
+
+    The task is skipped when the camera file already exists; that file is
+    written last, so its presence marks a finished basename.
+
+    Parameters:
+      args: tuple containing
+            (basename, seq_dir, out_rgb_dir, out_depth_dir, out_cam_dir, f, baseline)
+    Returns:
+      None on success (or skip), or an error string on failure.
+    """
+    basename, seq_dir, out_rgb_dir, out_depth_dir, out_cam_dir, f, baseline = args
+    out_img_path = os.path.join(out_rgb_dir, f"{basename}.png")
+    out_depth_path = os.path.join(out_depth_dir, f"{basename}.npy")
+    out_cam_path = os.path.join(out_cam_dir, f"{basename}.npz")
+    if os.path.exists(out_cam_path):
+        return None
+
+    try:
+        img_file = os.path.join(seq_dir, f"l_{basename}.png")
+        disp_file = os.path.join(seq_dir, f"d_{basename}.exr")
+
+        # Only the size is needed; the context manager closes the file handle
+        # instead of leaving it open until garbage collection.
+        with Image.open(img_file) as img:
+            img_size = img.size
+
+        disp = load_exr(disp_file).astype(np.float32)
+        H, W = disp.shape
+        if img_size != (W, H):
+            return f"Size mismatch for {basename}: Image size {img_size}, Disparity size {(W, H)}"
+
+        # Intrinsics: focal length f on both axes, principal point (W // 2, H // 2).
+        K = np.array(
+            [[f, 0, W // 2], [0, f, H // 2], [0, 0, 1]], dtype=np.float32
+        )
+
+        # NOTE(review): zero disparity yields inf depth here — confirm consumers mask it.
+        depth = baseline * f / disp
+
+        shutil.copyfile(img_file, out_img_path)
+        np.save(out_depth_path, depth)
+        # Written last: a missing camera file makes the next run redo this basename.
+        np.savez(out_cam_path, intrinsics=K)
+
+    except Exception as e:
+        # Failures are reported as a string rather than raised.
+        return f"Error processing {basename}: {e}"
+
+    return None
+
+
+def main():
+    """
+    Parse ``--root_dir``/``--out_dir``, collect the sequence directories and
+    convert every sequence with a pool of worker processes.
+
+    Sequences are the sub-directories of the root, except that "Store" is
+    expanded one level and "IRS_small" two levels. Error strings returned by
+    the workers are printed; the remaining frames are still processed.
+    """
+    parser = argparse.ArgumentParser(
+        description="Preprocess IRS dataset: convert EXR disparity to depth, "
+        "copy RGB images, and save camera intrinsics."
+    )
+    parser.add_argument(
+        "--root_dir",
+        type=str,
+        default="/path/to/data_raw_videos/data_irs",
+        help="Root directory of the raw IRS data.",
+    )
+    parser.add_argument(
+        "--out_dir",
+        type=str,
+        default="/path/to/data_raw_videos/processed_irs",
+        help="Output directory for processed IRS data.",
+    )
+    args = parser.parse_args()
+
+    # Example parameters (adjust as needed)
+    baseline = 0.1
+    f = 480
+
+    root = args.root_dir
+    out_dir = args.out_dir
+
+    def subdirs(*parts):
+        """Names of the directories directly inside root/parts."""
+        base = os.path.join(root, *parts)
+        return [n for n in os.listdir(base) if os.path.isdir(os.path.join(base, n))]
+
+    # Gather sequence directories (paths relative to root).
+    seq_dirs = []
+    for d in subdirs():
+        if d == "Store":
+            seq_dirs.extend(os.path.join(d, sub) for sub in subdirs(d))
+        elif d == "IRS_small":
+            for sub in subdirs(d):
+                seq_dirs.extend(
+                    os.path.join(d, sub, subsub) for subsub in subdirs(d, sub)
+                )
+        else:
+            seq_dirs.append(d)
+    seq_dirs.sort()
+
+    # os.cpu_count() may return None, and halving a single core gives 0, which
+    # ProcessPoolExecutor rejects; always keep at least one worker.
+    num_workers = max(1, (os.cpu_count() or 1) // 2)
+
+    # Process each sequence.
+    for seq in seq_dirs:
+        seq_dir = os.path.join(root, seq)
+        out_rgb_dir = os.path.join(out_dir, seq, "rgb")
+        out_depth_dir = os.path.join(out_dir, seq, "depth")
+        out_cam_dir = os.path.join(out_dir, seq, "cam")
+        for directory in (out_rgb_dir, out_depth_dir, out_cam_dir):
+            os.makedirs(directory, exist_ok=True)
+
+        # Basenames come from the .exr names minus 2 leading and 4 trailing characters.
+        basenames = sorted(d[2:-4] for d in os.listdir(seq_dir) if d.endswith(".exr"))
+        tasks = [
+            (basename, seq_dir, out_rgb_dir, out_depth_dir, out_cam_dir, f, baseline)
+            for basename in basenames
+        ]
+
+        with ProcessPoolExecutor(max_workers=num_workers) as executor:
+            futures = {
+                executor.submit(process_basename, task): task[0] for task in tasks
+            }
+            for future in tqdm(
+                as_completed(futures), total=len(futures), desc=f"Processing {seq}"
+            ):
+                error = future.result()
+                if error:
+                    print(error)
+
+
+if __name__ == "__main__":  # Script entry point: main() runs only on direct execution, not on import.
+    main()
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_mapfree.py b/extern/CUT3R/datasets_preprocess/preprocess_mapfree.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9d0829112cf35904c1eef14f4b0dce2014d7535
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_mapfree.py
@@ -0,0 +1,76 @@
+import subprocess
+import os
+import argparse
+import glob
+
+
+def get_parser():
+    """Return the parser for --mapfree_dir, --colmap_dir and --output_dir.
+
+    Every option is optional and falls back to the relative path listed in
+    ``defaults`` below.
+    """
+    defaults = {
+        "--mapfree_dir": "mapfree/train/",
+        "--colmap_dir": "mapfree/colmap",
+        "--output_dir": "processed_mapfree",
+    }
+    cli = argparse.ArgumentParser()
+    for flag, default in defaults.items():
+        cli.add_argument(flag, default=default)
+    return cli
+
+
+def run_patch_match_stereo(root_colmap_dir, root_img_dir):
+    """Undistort each scene sequence with COLMAP and stage its dense workspace.
+
+    For every scene in ``root_colmap_dir`` and each of "seq0"/"seq1", runs
+    ``colmap image_undistorter`` into ``dense<i>``, then replaces that
+    workspace's ``sparse`` folder and ``images/seq<i>`` with copies of the
+    inputs. A sequence is skipped when its ``*geometric.bin`` depth-map count
+    already equals its ``*.jpg`` image count. Command exit codes are ignored.
+    """
+    import shlex  # local import: shlex is not imported at the top of this file
+
+    for scene_name in sorted(os.listdir(root_colmap_dir)):
+        scene_dir = os.path.join(root_colmap_dir, scene_name)
+        img_dir = os.path.join(root_img_dir, scene_name)
+        for i, sub in enumerate(["seq0", "seq1"]):
+            sub_dir = os.path.join(scene_dir, sub)
+            out_dir = os.path.join(scene_dir, f"dense{i}")
+            if not os.path.exists(sub_dir):
+                continue
+            depth_dir = os.path.join(out_dir, f"stereo/depth_maps/{sub}")
+            if os.path.exists(depth_dir):
+                n_depth = len(glob.glob(os.path.join(depth_dir, "*geometric.bin")))
+                if n_depth == len(glob.glob(os.path.join(img_dir, sub, "*.jpg"))):
+                    print(f"depth maps already computed, skip {sub_dir}")
+                    continue
+
+            print(sub_dir)
+            # Quote every path so spaces or shell metacharacters in directory
+            # names cannot split or alter the shell commands below.
+            q_img, q_sub, q_out = map(shlex.quote, (img_dir, sub_dir, out_dir))
+            cmd = f"colmap image_undistorter \
+                    --image_path {q_img} \
+                    --input_path {q_sub} \
+                    --output_path {q_out} \
+                    --output_type COLMAP;"
+            subprocess.call(cmd, shell=True)
+            cmd = f"rm -rf {q_out}/images/seq{i}; rm -rf {q_out}/sparse;"
+            cmd += f"cp -r {q_sub} {q_out}/sparse;"
+            cmd += f"cp -r {q_img}/{sub} {q_out}/images;"
+            subprocess.call(cmd, shell=True)
+
+            # `colmap patch_match_stereo` is intentionally not run: the MVS
+            # results are released. Re-run it on out_dir if they must be rebuilt.
+
+
+if __name__ == "__main__":
+    # Entry point. Only --colmap_dir and --mapfree_dir are consumed here;
+    # --output_dir is parsed but not used by this script.
+    cli_args = get_parser().parse_args()
+    run_patch_match_stereo(
+        root_colmap_dir=cli_args.colmap_dir,
+        root_img_dir=cli_args.mapfree_dir,
+    )
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_mapfree2.py b/extern/CUT3R/datasets_preprocess/preprocess_mapfree2.py
new file mode 100644
index 0000000000000000000000000000000000000000..227b50d4ce43a656ee494c78262796e731c28670
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_mapfree2.py
@@ -0,0 +1,123 @@
+import os
+
+
+import os.path as osp
+
+from PIL import Image
+import numpy as np
+
+
+from tqdm import tqdm
+from read_write_model import run
+
+
+def get_parser():
+    """Return the parser for --mapfree_dir (default "") and --output_dir."""
+    import argparse
+    cli = argparse.ArgumentParser()
+    for flag, default in (("--mapfree_dir", ""), ("--output_dir", "test_preprocess")):  # TODO
+        cli.add_argument(flag, default=default)
+    return cli
+
+
+def main(rootdir, outdir):
+    """Export RGB frames and per-frame cameras for every env/sub-sequence.
+
+    For each ``rootdir/<env>/<subseq>``, calls ``run`` on its ``sparse``
+    folder, parses ``cameras.txt`` and ``images.txt`` there, and writes
+    ``rgb/<image>`` plus ``cam/<image>.npz`` (keys ``intrinsic`` and ``pose``)
+    below ``outdir/<env>/<subseq>``. The stored pose is the inverse of the
+    [R|t] transform listed in ``images.txt``.
+    """
+    os.makedirs(outdir, exist_ok=True)
+
+    envs = [f for f in os.listdir(rootdir) if os.path.isdir(osp.join(rootdir, f))]
+    for env in tqdm(envs):
+        subseqs = [
+            f
+            for f in os.listdir(osp.join(rootdir, env))
+            if os.path.isdir(osp.join(rootdir, env, f))
+        ]
+        for subseq in subseqs:
+            sparse_dir = osp.join(rootdir, env, subseq, "sparse")
+            images_dir = osp.join(rootdir, env, subseq, "images")
+            # NOTE(review): presumably writes the text model read below — confirm
+            # against read_write_model.run.
+            run(sparse_dir, sparse_dir)
+            intrins_file = sparse_dir + "/cameras.txt"
+            poses_file = sparse_dir + "/images.txt"
+
+            # Camera id -> 3x3 intrinsics; tokens 4-7 are read as fx, fy, cx, cy.
+            cam_params = {}
+            with open(intrins_file, "r") as f:
+                for line in f:
+                    if line.startswith("#"):
+                        continue
+                    parts = line.strip().split()
+                    if len(parts) == 0:
+                        continue
+                    cam_id = int(parts[0])
+                    fx = float(parts[4])
+                    fy = float(parts[5])
+                    cx = float(parts[6])
+                    cy = float(parts[7])
+                    cam_params[cam_id] = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]])
+
+            poses = []
+            images = []
+            intrinsics = []
+
+            with open(poses_file, "r") as f:
+                for line in f:
+                    if line.startswith("#"):
+                        continue
+                    parts = line.strip().split()
+                    if len(parts) == 0:
+                        continue
+                    # Presumably skips the 2D-point lines, whose first token is a float.
+                    if "." in parts[0]:
+                        continue
+
+                    img_name = parts[-1]
+                    # Quaternion (w, x, y, z) -> rotation matrix.
+                    w, x, y, z = map(float, parts[1:5])
+                    R = np.array(
+                        [
+                            [1 - 2 * y * y - 2 * z * z, 2 * x * y - 2 * z * w, 2 * x * z + 2 * y * w],
+                            [2 * x * y + 2 * z * w, 1 - 2 * x * x - 2 * z * z, 2 * y * z - 2 * x * w],
+                            [2 * x * z - 2 * y * w, 2 * y * z + 2 * x * w, 1 - 2 * x * x - 2 * y * y],
+                        ]
+                    )
+                    tx, ty, tz = map(float, parts[5:8])
+                    cam_id = int(parts[-2])
+                    pose = np.eye(4)
+                    pose[:3, :3] = R
+                    pose[:3, 3] = [tx, ty, tz]
+                    poses.append(np.linalg.inv(pose))
+                    images.append(img_name)
+                    intrinsics.append(cam_params[cam_id])
+
+            os.makedirs(osp.join(outdir, env, subseq), exist_ok=True)
+            os.makedirs(osp.join(outdir, env, subseq, "rgb"), exist_ok=True)
+            os.makedirs(osp.join(outdir, env, subseq, "cam"), exist_ok=True)
+
+            frames = zip(images, intrinsics, poses)
+            for img_name, intrinsic, pose in tqdm(frames, total=len(images)):
+                basename = img_name.split("/")[-1]
+                # The context manager closes each source image once it is
+                # re-saved, instead of leaking one open handle per frame.
+                with Image.open(os.path.join(images_dir, img_name)) as rgb:
+                    rgb.save(osp.join(outdir, env, subseq, "rgb", basename))
+                np.savez(
+                    osp.join(
+                        outdir, env, subseq, "cam", basename.replace(".jpg", ".npz")
+                    ),
+                    intrinsic=intrinsic,
+                    pose=pose,
+                )
+
+
+if __name__ == "__main__":  # Script entry point: runs only on direct execution, not on import.
+    parser = get_parser()
+    args = parser.parse_args()
+    main(args.mapfree_dir, args.output_dir)  # positional order is (rootdir, outdir)
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_megadepth.py b/extern/CUT3R/datasets_preprocess/preprocess_megadepth.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a7460b9d5ec5b77dd21e15b7797bebe28377aa6
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_megadepth.py
@@ -0,0 +1,229 @@
+# --------------------------------------------------------
+# Preprocessing code for the MegaDepth dataset
+# dataset at https://www.cs.cornell.edu/projects/megadepth/
+# --------------------------------------------------------
+import os
+import os.path as osp
+import collections
+from tqdm import tqdm
+import numpy as np
+
+os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
+import cv2
+import h5py
+
+import path_to_root  # noqa
+from datasets_preprocess.utils.parallel import parallel_threads
+from datasets_preprocess.utils import cropping  # noqa
+
+
+def get_parser():
+    """Return the CLI parser; --megadepth_dir and --precomputed_sets are required."""
+    import argparse
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--megadepth_dir", required=True)
+    cli.add_argument("--num_views", type=int, default=64)
+    cli.add_argument("--precomputed_sets", required=True)
+    cli.add_argument("--output_dir", default="data/dust3r_data/processed_megadepth")
+    return cli
+
+
+def main(db_root, pairs_path, output_dir, num_views):  # Preprocess every scene referenced by the precomputed sets file.
+    os.makedirs(output_dir, exist_ok=True)
+
+    # load the precomputed sets file (NOTE(review): allow_pickle=True unpickles arbitrary objects — use trusted files only)
+    data = np.load(pairs_path, allow_pickle=True)
+    scenes = data["scenes"]
+    images = data["images"]  # NOTE: not used by the code below
+    sets = data["sets"]
+
+    # group view indices by scene index: line[0] is the scene, line[1..num_views] are the views
+    todo = collections.defaultdict(set)
+    for line in sets:
+        for i in range(1, num_views + 1):
+            todo[line[0]].add(line[i])
+
+    # for each scene, load intrinsics and then parallel crops
+    for scene, im_idxs in tqdm(todo.items(), desc="Overall"):  # NOTE: im_idxs is unused; see the filter on `args` below
+        scene, subscene = scenes[scene].split()  # each entry splits into two whitespace-separated fields
+        out_dir = osp.join(output_dir, scene, subscene)
+        os.makedirs(out_dir, exist_ok=True)
+
+        # load all camera params
+        _, pose_w2cam, intrinsics = _load_kpts_and_poses(
+            db_root, scene, subscene, intrinsics=True
+        )
+
+        in_dir = osp.join(db_root, scene, "dense" + subscene)
+        # Earlier variant, restricted to the images listed in the sets file (kept for reference):
+        # args = [(in_dir, img, intrinsics[img], pose_w2cam[img], out_dir)
+        #         for img in [images[im_id] for im_id in im_idxs]]
+        args = [  # current behaviour: every image that has intrinsics and exists under in_dir/imgs
+            (in_dir, img, intrinsics[img], pose_w2cam[img], out_dir)
+            for img in intrinsics.keys()
+            if os.path.exists(osp.join(in_dir, "imgs", img))
+        ]
+        parallel_threads(
+            resize_one_image,
+            args,
+            star_args=True,
+            front_num=0,
+            leave=False,
+            desc=f"{scene}/{subscene}",
+        )
+
+    # nothing else is written here; only report completion
+    print("Done! prepared all images in", output_dir)
+
+
+def resize_one_image(root, tag, K_pre_rectif, pose_w2cam, out_dir):  # Rescale one image and its depth map, then save both with the camera parameters.
+    if osp.isfile(osp.join(out_dir, tag + ".npz")):  # the .npz is written last, so it marks an already finished image
+        return
+
+    # load image (read as BGR by OpenCV, converted to RGB)
+    img = cv2.cvtColor(
+        cv2.imread(osp.join(root, "imgs", tag), cv2.IMREAD_COLOR), cv2.COLOR_BGR2RGB
+    )
+    H, W = img.shape[:2]  # NOTE: H and W are not used below
+
+    # load depth (dataset "depth" of root/depths/<tag without extension>.h5)
+    with h5py.File(osp.join(root, "depths", osp.splitext(tag)[0] + ".h5"), "r") as hd5:
+        depthmap = np.asarray(hd5["depth"])
+
+    # rectify = undistort the intrinsics
+    imsize_pre, K_pre, distortion = K_pre_rectif  # ((width, height), K, distortion) as built by _load_kpts_and_poses
+    imsize_post = img.shape[1::-1]  # (width, height) of the loaded image
+    K_post = cv2.getOptimalNewCameraMatrix(
+        K_pre,
+        distortion,
+        imsize_pre,
+        alpha=0,
+        newImgSize=imsize_post,
+        centerPrincipalPoint=True,
+    )[0]  # [0] keeps the new camera matrix and drops the second return value
+
+    # downscale
+    img_out, depthmap_out, intrinsics_out, R_in2out = _downscale_image(
+        K_post, img, depthmap, resolution_out=(800, 600)
+    )
+
+    # write everything: <tag>.jpg, <tag>.exr and <tag>.npz in out_dir
+    img_out.save(osp.join(out_dir, tag + ".jpg"), quality=90)  # presumably a PIL image returned by the cropping helper — TODO confirm
+    cv2.imwrite(osp.join(out_dir, tag + ".exr"), depthmap_out)
+
+    camout2world = np.linalg.inv(pose_w2cam)  # world-to-camera -> camera-to-world
+    camout2world[:3, :3] = camout2world[:3, :3] @ R_in2out.T  # R_in2out is the identity in _downscale_image, so this leaves the rotation unchanged
+    np.savez(
+        osp.join(out_dir, tag + ".npz"),
+        intrinsics=intrinsics_out,
+        cam2world=camout2world,
+    )
+
+
+def _downscale_image(camera_intrinsics, image, depthmap, resolution_out=(512, 384)):
+    """Rescale image, depth and intrinsics; the 4th return value is always the identity."""
+    height, width = image.shape[:2]
+    ascending = sorted(resolution_out)
+    # Smaller extent first for portrait inputs (W < H), larger first otherwise.
+    target = ascending if width < height else ascending[::-1]
+    img_out, depth_out, K_out = cropping.rescale_image_depthmap(
+        image, depthmap, camera_intrinsics, target, force=False
+    )
+    return img_out, depth_out, K_out, np.eye(3)
+
+
+def _load_kpts_and_poses(root, scene_id, subscene, z_only=False, intrinsics=False):
+    """Parse ``cameras.txt``/``images.txt`` in root/scene_id/sparse/manhattan/subscene.
+
+    Returns ``(points3D_idxs, poses)`` keyed by the last token of each image
+    line, plus ``image_intrinsics`` when ``intrinsics`` is true; each entry of
+    the latter is ``((width, height), K, (k0, 0, 0, 0))``. ``z_only`` selects
+    ``colmap_raw_pose_to_principal_axis`` over ``colmap_raw_pose_to_RT``.
+    """
+    model_dir = os.path.join(root, scene_id, "sparse", "manhattan", subscene)
+
+    if intrinsics:
+        with open(os.path.join(model_dir, "cameras.txt"), "r") as f:
+            raw = f.readlines()[3:]  # skip the header
+
+        camera_intrinsics = {}
+        for camera in raw:
+            camera = camera.split(" ")
+            width, height, focal, cx, cy, k0 = [float(elem) for elem in camera[2:]]
+            # One focal length serves both axes.
+            K = np.eye(3)
+            K[0, 0] = focal
+            K[1, 1] = focal
+            K[0, 2] = cx
+            K[1, 2] = cy
+            camera_intrinsics[int(camera[0])] = (
+                (int(width), int(height)),
+                K,
+                (k0, 0, 0, 0),
+            )
+
+    with open(os.path.join(model_dir, "images.txt"), "r") as f:
+        raw = f.read().splitlines()[4:]  # skip the header
+
+    extract_pose = (
+        colmap_raw_pose_to_principal_axis if z_only else colmap_raw_pose_to_RT
+    )
+
+    poses = {}
+    points3D_idxs = {}
+    camera = []
+
+    # Lines alternate: an image line, then the line with that image's points.
+    for image, points in zip(raw[::2], raw[1::2]):
+        image = image.split(" ")
+        points = points.split(" ")
+        image_id = image[-1]
+        camera.append(int(image[-2]))
+        poses[image_id] = extract_pose([float(elem) for elem in image[1:-2]])
+
+        # Every third token (from index 2) is a 3D-point id; "-1" is dropped.
+        current_points3D_idxs = {int(i) for i in points[2::3] if i != "-1"}
+        # A plain message keeps a failure an AssertionError; calling an
+        # undefined helper here would surface as NameError instead.
+        assert -1 not in current_points3D_idxs, f"-1 point id left for {image_id}"
+        points3D_idxs[image_id] = current_points3D_idxs
+
+    if not intrinsics:
+        return points3D_idxs, poses
+    image_intrinsics = {im: camera_intrinsics[c] for im, c in zip(poses, camera)}
+    return points3D_idxs, poses, image_intrinsics
+
+
+def colmap_raw_pose_to_principal_axis(image_pose):
+    """Return the third row of the rotation matrix of the normalised quaternion.
+
+    The quaternion is ``image_pose[:4]`` in (w, x, y, z) order; result is float32.
+    """
+    w, x, y, z = image_pose[:4] / np.linalg.norm(image_pose[:4])
+    third_row = [2 * x * z - 2 * y * w, 2 * y * z + 2 * x * w, 1 - 2 * x * x - 2 * y * y]
+    return np.float32(third_row)
+
+
+def colmap_raw_pose_to_RT(image_pose):
+    """Build the 4x4 world-to-camera matrix from a raw pose vector.
+
+    ``image_pose[:4]`` is the quaternion (w, x, y, z), normalised here;
+    ``image_pose[4:7]`` is the translation.
+    """
+    w, x, y, z = image_pose[:4] / np.linalg.norm(image_pose[:4])
+    rotation = np.array(
+        [
+            [1 - 2 * y * y - 2 * z * z, 2 * x * y - 2 * z * w, 2 * x * z + 2 * y * w],
+            [2 * x * y + 2 * z * w, 1 - 2 * x * x - 2 * z * z, 2 * y * z - 2 * x * w],
+            [2 * x * z - 2 * y * w, 2 * y * z + 2 * x * w, 1 - 2 * x * x - 2 * y * y],
+        ]
+    )
+    world_to_cam = np.eye(4)
+    world_to_cam[:3, 3] = image_pose[4:7]
+    world_to_cam[:3, :3] = rotation
+    return world_to_cam
+
+
+if __name__ == "__main__":  # Script entry point: runs only on direct execution, not on import.
+    parser = get_parser()
+    args = parser.parse_args()
+    main(args.megadepth_dir, args.precomputed_sets, args.output_dir, args.num_views)  # order: (db_root, pairs_path, output_dir, num_views)
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_mp3d.py b/extern/CUT3R/datasets_preprocess/preprocess_mp3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..6176bafe83d609248a411a77ed07a93b550b9909
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_mp3d.py
@@ -0,0 +1,217 @@
+#!/usr/bin/env python3
+"""
+Preprocess the Matterport3D (MP3D) dataset.
+
+This script reads camera parameters and overlap data from a configuration file,
+processes RGB images and corresponding depth images, adjusts camera poses using a
+conversion matrix, and then saves the processed images, depth maps, and camera
+metadata into separate output directories.
+
+Usage:
+    python preprocess_mp3d.py --root_dir /path/to/data_mp3d/v1/scans \
+                              --out_dir /path/to/processed_mp3d
+"""
+
+import os
+import numpy as np
+import cv2
+import shutil
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from tqdm import tqdm
+import argparse
+
+
+def process_image(args):
+    """
+    Process a single frame: copies the RGB image unchanged, reads the depth image and
+    divides it by 4000, right-multiplies the pose by R_conv, and saves the outputs.
+
+    Parameters:
+      args: tuple containing
+         (i, paths, K, pose, img_dir, depth_dir, out_rgb_dir, out_depth_dir, out_cam_dir, R_conv)
+         where:
+           i             - the frame index
+           paths         - tuple of (depth filename, RGB filename)
+           K             - camera intrinsics matrix (3x3 NumPy array)
+           pose          - camera pose (4x4 NumPy array)
+           img_dir       - directory containing RGB images
+           depth_dir     - directory containing depth images
+           out_rgb_dir   - output directory for processed RGB images
+           out_depth_dir - output directory for processed depth maps
+           out_cam_dir   - output directory for processed camera metadata
+           R_conv        - a 4x4 conversion matrix (NumPy array)
+    Returns:
+      None if successful, or an error string if processing fails.
+    """
+    (
+        i,
+        paths,
+        K,
+        pose,
+        img_dir,
+        depth_dir,
+        out_rgb_dir,
+        out_depth_dir,
+        out_cam_dir,
+        R_conv,
+    ) = args
+    depth_path, img_path = paths
+    img_path_full = os.path.join(img_dir, img_path)
+    depth_path_full = os.path.join(depth_dir, depth_path)
+
+    try:
+        # cv2.imread returns None instead of raising when the file cannot be read.
+        depth = cv2.imread(depth_path_full, cv2.IMREAD_ANYDEPTH)
+        if depth is None:
+            raise FileNotFoundError(f"cannot read depth image {depth_path_full}")
+        depth = depth.astype(np.float32) / 4000.0  # Normalize depth (adjust this factor as needed)
+
+        # Adjust the camera pose with the conversion matrix
+        pose_adjusted = pose @ R_conv
+
+        # Generate output filenames using a zero-padded frame index.
+        basename = f"{i:06d}"
+        out_img_path = os.path.join(out_rgb_dir, basename + ".png")
+        out_depth_path = os.path.join(out_depth_dir, basename + ".npy")
+        out_cam_path = os.path.join(out_cam_dir, basename + ".npz")
+
+        # Copy the RGB image.
+        shutil.copyfile(img_path_full, out_img_path)
+
+        # Save the depth map.
+        np.save(out_depth_path, depth)
+
+        # Save the camera intrinsics and adjusted pose.
+        np.savez(out_cam_path, intrinsics=K, pose=pose_adjusted)
+    except Exception as e:
+        return f"Error processing image {img_path}: {e}"
+
+    return None
+
+
+def main():
+    """
+    Command-line entry point: for every scan directory under --root_dir, save the
+    overlap table, parse the .conf camera file, and convert all frames in parallel.
+    """
+    parser = argparse.ArgumentParser(
+        description="Preprocess MP3D scans: convert and save RGB images, depth maps, and camera metadata."
+    )
+    parser.add_argument(
+        "--root_dir",
+        type=str,
+        default="/path/to/data_mp3d/v1/scans",
+        help="Root directory of the raw MP3D data.",
+    )
+    parser.add_argument(
+        "--out_dir",
+        type=str,
+        default="/path/to/processed_mp3d",
+        help="Output directory for processed MP3D data.",
+    )
+    args = parser.parse_args()
+
+    root = args.root_dir
+    out_dir = args.out_dir
+
+    # List sequence directories (each scan is stored as a separate directory).
+    seqs = sorted([d for d in os.listdir(root) if os.path.isdir(os.path.join(root, d))])
+
+    # Define a conversion matrix from MP3D to the desired coordinate system.
+    R_conv = np.array(
+        [[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, 1]], dtype=np.float32
+    )
+
+    # os.cpu_count() may return None, and halving a single core gives 0, which
+    # ProcessPoolExecutor rejects with ValueError; always keep at least one worker.
+    num_workers = max(1, (os.cpu_count() or 1) // 2)
+
+    for seq in tqdm(seqs, desc="Sequences"):
+        # The sequence directory structure assumes that images and depth files are stored
+        # under a subdirectory with the same name as the sequence.
+        seq_dir = os.path.join(root, seq, seq)
+
+        img_dir = os.path.join(seq_dir, "undistorted_color_images")
+        depth_dir = os.path.join(seq_dir, "undistorted_depth_images")
+        cam_file = os.path.join(seq_dir, "undistorted_camera_parameters", f"{seq}.conf")
+        overlap_file = os.path.join(seq_dir, "image_overlap_data", f"{seq}_iis.txt")
+
+        # Read overlap data and save it (optional).
+        overlap = []
+        with open(overlap_file, "r") as f:
+            for line in f:
+                parts = line.split()
+                if not parts:
+                    continue  # tolerate blank lines, as the .conf parser below does
+                overlap.append([int(parts[1]), int(parts[2]), float(parts[3])])
+        overlap = np.array(overlap)
+        os.makedirs(os.path.join(out_dir, seq), exist_ok=True)
+        np.save(os.path.join(out_dir, seq, "overlap.npy"), overlap)
+
+        # Read camera parameters from a configuration file.
+        intrinsics = []
+        camera_poses = []
+        image_files = []
+
+        with open(cam_file, "r") as file:
+            lines = file.readlines()
+        current_intrinsics = None
+        for line in lines:
+            parts = line.split()
+            if not parts:
+                continue
+            if parts[0] == "intrinsics_matrix":
+                # Extract intrinsic parameters.
+                fx, cx, fy, cy = (
+                    float(parts[1]),
+                    float(parts[3]),
+                    float(parts[5]),
+                    float(parts[6]),
+                )
+                current_intrinsics = np.array(
+                    [[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32
+                )
+            elif parts[0] == "scan":
+                # Read the image filenames and camera pose.
+                depth_image = parts[1]
+                color_image = parts[2]
+                image_files.append((depth_image, color_image))
+                matrix_values = list(map(float, parts[3:]))
+                camera_pose = np.array(matrix_values).reshape(4, 4)
+                camera_poses.append(camera_pose)
+                if current_intrinsics is not None:
+                    intrinsics.append(current_intrinsics.copy())
+
+        if not (len(image_files) == len(intrinsics) == len(camera_poses)):
+            print(f"Inconsistent data in sequence {seq}")
+            continue
+
+        # Prepare output directories.
+        out_rgb_dir = os.path.join(out_dir, seq, "rgb")
+        out_depth_dir = os.path.join(out_dir, seq, "depth")
+        out_cam_dir = os.path.join(out_dir, seq, "cam")
+        os.makedirs(out_rgb_dir, exist_ok=True)
+        os.makedirs(out_depth_dir, exist_ok=True)
+        os.makedirs(out_cam_dir, exist_ok=True)
+
+        # One argument tuple per frame, in the order the frames appear in the .conf file.
+        tasks = [
+            (i, paths, K, pose, img_dir, depth_dir,
+             out_rgb_dir, out_depth_dir, out_cam_dir, R_conv)
+            for i, (paths, K, pose) in enumerate(
+                zip(image_files, intrinsics, camera_poses)
+            )
+        ]
+
+        with ProcessPoolExecutor(max_workers=num_workers) as executor:
+            futures = {executor.submit(process_image, task): task[0] for task in tasks}
+            for future in tqdm(
+                as_completed(futures), total=len(futures), desc=f"Processing {seq}"
+            ):
+                error = future.result()
+                if error:
+                    print(error)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_mvimgnet.py b/extern/CUT3R/datasets_preprocess/preprocess_mvimgnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d5a688b4edbdbdcaf3169927f71cc2b9afca095
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_mvimgnet.py
@@ -0,0 +1,323 @@
+#!/usr/bin/env python3
+"""
+Preprocess the MVImgNet dataset.
+
+This script processes MVImgNet sequences by:
+  - Loading a sparse SFM reconstruction.
+  - Undistorting and rescaling RGB images.
+  - Converting COLMAP intrinsics between conventions.
+  - Saving the processed images and camera metadata.
+
+Usage:
+  python preprocess_mvimgnet.py --data_dir /path/to/MVImgNet_data \
+                                --output_dir /path/to/processed_mvimgnet
+  (only --data_dir and --output_dir are defined; passing --pcd_dir is rejected by argparse)
+"""
+
+import os
+import os.path as osp
+import argparse
+import numpy as np
+import open3d as o3d
+import pyrender
+import PIL.Image as Image
+import cv2
+import shutil
+from tqdm import tqdm
+import matplotlib.pyplot as plt
+
+# Import your custom SFM processing function.
+from read_write_model import run  # Assumed to be available
+
+# Pick the resampling constants from wherever this Pillow version exposes them.
+try:
+    lanczos = Image.Resampling.LANCZOS
+    bicubic = Image.Resampling.BICUBIC
+except AttributeError:
+    lanczos = Image.LANCZOS
+    bicubic = Image.BICUBIC
+
+# Flips the Y and Z axes (OpenGL/OpenCV camera-axis change); not referenced elsewhere in this script.
+OPENGL_TO_OPENCV = np.float32(
+    [[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, 1]]
+)
+
+
+# -----------------------------------------------------------------------------
+# Helper Classes and Functions
+# -----------------------------------------------------------------------------
+class ImageList:
+    """Thin wrapper that applies the same PIL operation to every image it holds."""
+
+    def __init__(self, images):
+        # A lone image is promoted to a one-element list; non-PIL inputs go through fromarray.
+        candidates = images if isinstance(images, (list, tuple)) else [images]
+        self.images = [
+            im if isinstance(im, Image.Image) else Image.fromarray(im)
+            for im in candidates
+        ]
+
+    def __len__(self):
+        return len(self.images)
+
+    def to_pil(self):
+        # A single image is returned bare, several as a tuple.
+        return self.images[0] if len(self.images) <= 1 else tuple(self.images)
+
+    @property
+    def size(self):
+        dims = [im.size for im in self.images]
+        assert dims.count(dims[0]) == len(dims)
+        return dims[0]
+
+    def resize(self, *args, **kwargs):
+        return ImageList([im.resize(*args, **kwargs) for im in self.images])
+
+    def crop(self, *args, **kwargs):
+        return ImageList([im.crop(*args, **kwargs) for im in self.images])
+
+
+def colmap_to_opencv_intrinsics(K):
+    """
+    Copy K and subtract 0.5 from its principal point (COLMAP to OpenCV pixel centres).
+    """
+    shifted = K.copy()
+    for row in (0, 1):
+        shifted[row, 2] -= 0.5
+    return shifted
+
+
+def opencv_to_colmap_intrinsics(K):
+    """
+    Copy K and add 0.5 to its principal point (OpenCV to COLMAP pixel centres).
+    """
+    shifted = K.copy()
+    for row in (0, 1):
+        shifted[row, 2] += 0.5
+    return shifted
+
+
+def rescale_image_depthmap(
+    image, depthmap, camera_intrinsics, output_resolution, force=True
+):
+    """
+    Resize an image (and optional depthmap) by one scale factor and update the intrinsics.
+
+    The factor is the larger of the two target/input ratios, so both output sides
+    are at least as large as requested and the aspect ratio is kept.
+
+    Args:
+      image: PIL image, array, or list/tuple of them (wrapped in ImageList).
+      depthmap: Array whose first two dims are the image's (H, W), or None.
+      camera_intrinsics: A 3x3 NumPy array of intrinsics.
+      output_resolution: (width, height) desired resolution.
+      force: If False, nothing is resized when the scale factor is >= 1.
+
+    Returns:
+      Tuple of (rescaled image, rescaled depthmap, updated intrinsics).
+    """
+    images = ImageList(image)
+    src_wh = np.array(images.size)  # (W, H)
+    target_wh = np.array(output_resolution)
+    if depthmap is not None:
+        assert tuple(depthmap.shape[:2]) == images.size[::-1]
+    # Larger of the two per-axis ratios, nudged up by 1e-8 before the floor() below.
+    scale = max(target_wh / images.size) + 1e-8
+    if not force and scale >= 1:
+        return images.to_pil(), depthmap, camera_intrinsics
+    new_wh = np.floor(src_wh * scale).astype(int)
+    # Lanczos when shrinking, bicubic when enlarging.
+    resample = lanczos if scale < 1 else bicubic
+    images = images.resize(tuple(new_wh), resample=resample)
+    if depthmap is not None:
+        depthmap = cv2.resize(depthmap, tuple(new_wh), interpolation=cv2.INTER_NEAREST)
+    camera_intrinsics = camera_matrix_of_crop(camera_intrinsics, src_wh, new_wh, scaling=scale)
+    return images.to_pil(), depthmap, camera_intrinsics
+
+
+def camera_matrix_of_crop(
+    input_camera_matrix,
+    input_resolution,
+    output_resolution,
+    scaling=1,
+    offset_factor=0.5,
+    offset=None,
+):
+    """
+    Return the intrinsics after scaling the image by `scaling` and cropping it to
+    `output_resolution`; unless `offset` is given, the crop offset is `offset_factor` times the margin.
+    """
+    margins = np.asarray(input_resolution) * scaling - output_resolution
+    assert np.all(margins >= 0.0)
+    crop_offset = offset_factor * margins if offset is None else offset
+    # Scale and shift in the +0.5 (COLMAP) convention, then convert back.
+    K_colmap = opencv_to_colmap_intrinsics(input_camera_matrix)
+    K_colmap[:2, :] *= scaling
+    K_colmap[:2, 2] -= crop_offset
+    return colmap_to_opencv_intrinsics(K_colmap)
+
+
+def pose_from_qwxyz_txyz(elems):
+    """
+    Invert the 4x4 [R(q) | t] matrix built from (qw, qx, qy, qz, tx, ty, tz).
+    """
+    from scipy.spatial.transform import Rotation
+
+    qw, qx, qy, qz, tx, ty, tz = [float(e) for e in elems]
+    world_to_cam = np.eye(4)
+    # SciPy takes scalar-last quaternions, hence the (x, y, z, w) reordering.
+    world_to_cam[:3, :3] = Rotation.from_quat([qx, qy, qz, qw]).as_matrix()
+    world_to_cam[:3, 3] = [tx, ty, tz]
+    return np.linalg.inv(world_to_cam)
+
+
+def load_sfm(sfm_dir):
+    """
+    Parse COLMAP's text export (cameras.txt and images.txt) found in `sfm_dir`.
+
+    Returns a tuple (img_idx, img_infos) where:
+      - img_idx: dict mapping image filename to its image id (kept as a string).
+      - img_infos: dict keyed by that id holding the camera entry, file path, frame id,
+        cam-to-world pose and the 2D observations whose point id is not "-1".
+    """
+    with open(osp.join(sfm_dir, "cameras.txt"), "r") as f:
+        camera_lines = f.read().splitlines()[3:]  # skip header
+    intrinsics = {}
+    for entry in camera_lines:
+        fields = entry.split(" ")
+        intrinsics[int(fields[0])] = [fields[1], *map(float, fields[2:])]
+    with open(osp.join(sfm_dir, "images.txt"), "r") as f:
+        image_lines = [ln for ln in f.read().splitlines() if not ln.startswith("#")]
+    img_idx = {}
+    img_infos = {}
+    # Records come in pairs: an image line followed by its 2D-point line.
+    for header, observations in zip(image_lines[0::2], image_lines[1::2]):
+        header = header.split(" ")
+        observations = observations.split(" ")
+        idx, img_name = header[0], header[-1]
+        assert img_name not in img_idx, f"Duplicate image: {img_name}"
+        img_idx[img_name] = idx
+        # The point line is a flat run of (x, y, point id) triples.
+        xs, ys, ids = observations[0::3], observations[1::3], observations[2::3]
+        pts2d = {
+            int(i): (float(x), float(y)) for i, x, y in zip(ids, xs, ys) if i != "-1"
+        }
+        img_infos[idx] = dict(
+            intrinsics=intrinsics[int(header[-2])],
+            path=img_name,
+            frame_id=img_name,
+            cam_to_world=pose_from_qwxyz_txyz(header[1:-2]),
+            sparse_pts2d=pts2d,
+        )
+    return img_idx, img_infos
+
+
+def undistort_images(intrinsics, rgb):
+    """
+    Unpack one parsed cameras.txt entry into (width, height, K, rgb).
+
+    `intrinsics` is read as [_, width, height, fx, fy, cx, cy]. Despite the name,
+    no undistortion is performed: `rgb` is returned untouched and K is the plain
+    3x3 matrix assembled from those values.
+    """
+    width = int(intrinsics[1])
+    height = int(intrinsics[2])
+    fx, fy = intrinsics[3], intrinsics[4]
+    cx, cy = intrinsics[5], intrinsics[6]
+    # Start from the identity so K[2, 2] is already 1 and the off-diagonals are 0.
+    K = np.eye(3)
+    K[0, 0] = fx
+    K[1, 1] = fy
+    K[0, 2], K[1, 2] = cx, cy
+    return width, height, K, rgb
+
+
+# -----------------------------------------------------------------------------
+# Processing Functions
+# -----------------------------------------------------------------------------
+def process_sequence(category, obj, data_dir, output_dir):
+    """
+    Process a single sequence from MVImgNet.
+
+    Steps:
+      1. Run the SFM conversion (`run`) on the sequence's sparse/0 directory.
+      2. Load the SFM reconstruction from the COLMAP text files.
+      3. For each image in the SFM output that exists on disk:
+         a. Load the image.
+         b. Rescale it so that it covers 640x480 and update the intrinsics.
+         c. Save the image as {stem}.jpg and the camera data as {stem}.npz.
+    The last four characters of `obj` are stripped to obtain the sequence name.
+    """
+    # Define directories.
+    seq_name = obj[:-4]
+    seq_dir = osp.join(data_dir, "MVImgNet_by_categories", category, seq_name)
+    rgb_dir = osp.join(seq_dir, "images")
+    sfm_dir = osp.join(seq_dir, "sparse", "0")
+
+    output_scene_dir = osp.join(output_dir, f"{category}_{seq_name}")
+    output_rgb_dir = osp.join(output_scene_dir, "rgb")
+    output_cam_dir = osp.join(output_scene_dir, "cam")
+    os.makedirs(output_rgb_dir, exist_ok=True)
+    os.makedirs(output_cam_dir, exist_ok=True)
+
+    # Run custom SFM processing.
+    run(sfm_dir, sfm_dir)
+    img_idx, img_infos = load_sfm(sfm_dir)
+
+    for idx in img_idx.values():
+        info = img_infos[idx]
+        rgb_path = osp.join(rgb_dir, info["path"])
+        if not osp.exists(rgb_path):
+            continue
+        rgb = np.array(Image.open(rgb_path))
+        _, _, K, rgb = undistort_images(info["intrinsics"], rgb)
+        intrinsics = colmap_to_opencv_intrinsics(K)
+        # Rescale image to a target resolution (e.g., 640x480) preserving aspect ratio.
+        image, _, intrinsics = rescale_image_depthmap(
+            rgb, None, intrinsics, (640, int(640 * 3.0 / 4))
+        )
+        intrinsics = opencv_to_colmap_intrinsics(intrinsics)
+        # splitext keeps the whole stem whatever the extension length (".jpeg", ".png", ...).
+        stem = osp.splitext(info["path"])[0]
+        image.save(osp.join(output_rgb_dir, stem + ".jpg"))
+        out_cam_path = osp.join(output_cam_dir, stem + ".npz")
+        np.savez(out_cam_path, intrinsics=intrinsics, pose=info["cam_to_world"])
+
+
+def main():
+    """
+    Parse the CLI flags, then process every entry of each category folder.
+    """
+    parser = argparse.ArgumentParser(
+        description="Preprocess MVImgNet dataset: undistort, rescale images, and save camera parameters."
+    )
+    parser.add_argument(
+        "--data_dir",
+        type=str,
+        default="/path/to/MVImgNet_data",
+        help="Directory containing MVImgNet data (images and point clouds).",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="/path/to/processed_mvimgnet",
+        help="Directory where processed data will be saved.",
+    )
+    cli = parser.parse_args()
+
+    categories_root = osp.join(cli.data_dir, "MVImgNet_by_categories")
+
+    # Only sub-directories count as categories; they are visited in sorted order.
+    categories = sorted(
+        name
+        for name in os.listdir(categories_root)
+        if osp.isdir(osp.join(categories_root, name))
+    )
+    for cat in categories:
+        # Every entry of the category folder is handed over, plain files included.
+        for obj in sorted(os.listdir(osp.join(categories_root, cat))):
+            process_sequence(cat, obj, cli.data_dir, cli.output_dir)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_mvs_synth.py b/extern/CUT3R/datasets_preprocess/preprocess_mvs_synth.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3b953b0e3a25e8d96038e29e0e21d5257c9ef7d
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_mvs_synth.py
@@ -0,0 +1,173 @@
+#!/usr/bin/env python3
+"""
+Preprocess the MVS Synth dataset.
+
+This script processes each sequence in a given dataset directory by:
+  - Reading the RGB image, EXR depth image, and JSON camera parameters.
+  - Computing the camera pose from the extrinsic matrix (with a conversion matrix applied).
+  - Creating a simple camera intrinsics matrix from the provided focal lengths and principal point.
+  - Copying the RGB image byte-for-byte to a file named *.jpg (no re-encoding), saving the depth (as a NumPy array), and saving the camera data (as a NPZ file).
+
+Usage:
+    python preprocess_mvs_synth.py --root_dir /path/to/data_mvs_synth/GTAV_720/ \
+                                   --out_dir /path/to/processed_mvs_synth \
+                                   --num_workers 32
+"""
+
+import os
+import shutil
+import json
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from tqdm import tqdm
+import numpy as np
+import cv2
+import argparse
+
+# Enable OpenEXR decoding in OpenCV. NOTE(review): set after `import cv2`; confirm it is still honoured.
+os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
+
+# Permutation that swaps the first two axes; applied as R_conv @ pose in process_basename.
+R_conv = np.array(
+    [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]], dtype=np.float32
+)
+
+
+def process_basename(seq, basename, root_dir, out_dir):
+    """
+    Process a single frame identified by 'basename' within a given sequence.
+
+    Reads the depth (EXR) file and camera parameters (JSON file), computes the
+    adjusted camera pose, builds the camera intrinsics matrix, copies the RGB
+    image, and saves the processed outputs.
+
+    Parameters:
+      seq (str): The sequence (subdirectory) name.
+      basename (str): The basename of the file (without extension).
+      root_dir (str): Root directory containing the raw data.
+      out_dir (str): Output directory where processed data will be saved.
+
+    Returns:
+      None on success, or an error string on failure.
+    """
+    try:
+        # Define input directories.
+        seq_dir = os.path.join(root_dir, seq)
+        img_dir = os.path.join(seq_dir, "images")
+        depth_dir = os.path.join(seq_dir, "depths")
+        cam_dir = os.path.join(seq_dir, "poses")
+
+        # Define input file paths.
+        img_path = os.path.join(img_dir, basename + ".png")
+        depth_path = os.path.join(depth_dir, basename + ".exr")
+        cam_path = os.path.join(cam_dir, basename + ".json")
+
+        # Define output directories.
+        out_seq_dir = os.path.join(out_dir, seq)
+        out_img_dir = os.path.join(out_seq_dir, "rgb")
+        out_depth_dir = os.path.join(out_seq_dir, "depth")
+        out_cam_dir = os.path.join(out_seq_dir, "cam")
+        os.makedirs(out_img_dir, exist_ok=True)
+        os.makedirs(out_depth_dir, exist_ok=True)
+        os.makedirs(out_cam_dir, exist_ok=True)
+
+        # Define output file paths.
+        out_img_path = os.path.join(out_img_dir, basename + ".jpg")
+        out_depth_path = os.path.join(out_depth_dir, basename + ".npy")
+        out_cam_path = os.path.join(out_cam_dir, basename + ".npz")
+
+        # Read and process camera parameters.
+        with open(cam_path, "r") as f:
+            cam_data = json.load(f)
+        c_x = cam_data["c_x"]
+        c_y = cam_data["c_y"]
+        f_x = cam_data["f_x"]
+        f_y = cam_data["f_y"]
+        extrinsic = np.array(cam_data["extrinsic"])
+        # Invert extrinsic matrix to obtain camera-to-world pose, then apply the conversion matrix.
+        pose = R_conv @ np.linalg.inv(extrinsic)
+
+        # Build a simple intrinsics matrix.
+        intrinsics = np.array(
+            [[f_x, 0, c_x], [0, f_y, c_y], [0, 0, 1]], dtype=np.float32
+        )
+
+        if np.any(np.isinf(pose)) or np.any(np.isnan(pose)):
+            raise ValueError(f"Invalid pose for {basename}")
+
+        # Read depth image; cv2.imread returns None (it does not raise) when it cannot read the file.
+        depth = cv2.imread(depth_path, cv2.IMREAD_ANYDEPTH)
+        if depth is None:
+            raise FileNotFoundError(f"cannot read depth file {depth_path}")
+        depth = depth.astype(np.float32)
+        depth[np.isinf(depth)] = 0.0  # Clean up any infinite values
+
+        # Save the processed data (the PNG bytes are copied verbatim under a .jpg name).
+        shutil.copyfile(img_path, out_img_path)
+        np.save(out_depth_path, depth)
+        np.savez(out_cam_path, intrinsics=intrinsics, pose=pose)
+    except Exception as e:
+        return f"Error processing {seq}/{basename}: {e}"
+
+    return None
+
+
+def main():
+    """
+    Collect every (sequence, frame) pair under --root_dir and convert them with
+    process_basename in a process pool, printing any error string that comes back.
+    """
+    parser = argparse.ArgumentParser(
+        description="Preprocess MVS Synth dataset: convert images, depth, and camera data."
+    )
+    parser.add_argument(
+        "--root_dir",
+        type=str,
+        default="/path/to/data_mvs_synth/GTAV_720/",
+        help="Root directory of the raw MVS Synth data.",
+    )
+    parser.add_argument(
+        "--out_dir",
+        type=str,
+        default="/path/to/processed_mvs_synth",
+        help="Output directory for processed data.",
+    )
+    parser.add_argument(
+        "--num_workers", type=int, default=32, help="Number of parallel workers."
+    )
+    cli = parser.parse_args()
+
+    root_dir, out_dir, num_workers = cli.root_dir, cli.out_dir, cli.num_workers
+
+    # Every sub-directory of the root is one sequence.
+    seqs = sorted(
+        entry for entry in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, entry))
+    )
+
+    # Create the rgb/depth/cam output folders of each sequence up front.
+    for seq in seqs:
+        for leaf in ("rgb", "depth", "cam"):
+            os.makedirs(os.path.join(out_dir, seq, leaf), exist_ok=True)
+
+    # One task per .png frame, listed in sorted order within each sequence.
+    tasks = []
+    for seq in seqs:
+        frame_names = os.listdir(os.path.join(root_dir, seq, "images"))
+        stems = sorted(name[:-4] for name in frame_names if name.endswith(".png"))
+        tasks.extend((seq, stem, root_dir, out_dir) for stem in stems)
+
+    print(f"Processing {len(tasks)} tasks using {num_workers} workers...")
+
+    with ProcessPoolExecutor(max_workers=num_workers) as executor:
+        # Map each future to the basename of the frame it handles.
+        futures = {}
+        for task in tasks:
+            futures[executor.submit(process_basename, *task)] = task[1]
+        progress = tqdm(as_completed(futures), total=len(futures), desc="Processing")
+        for future in progress:
+            error = future.result()
+            if error:
+                print(error)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_omniobject3d.py b/extern/CUT3R/datasets_preprocess/preprocess_omniobject3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..596e467a962307cc47d6830e6d05875df6687f98
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_omniobject3d.py
@@ -0,0 +1,226 @@
+#!/usr/bin/env python3
+"""
+This script processes scene data by reading images, depth maps, and camera poses,
+computing camera intrinsics, and saving the results in a structured format.
+
+Usage:
+    python preprocess_omniobject3d.py --input_dir /path/to/input_root --output_dir /path/to/output_root
+"""
+
+import os
+import os.path as osp
+import json
+import argparse
+from concurrent.futures import ProcessPoolExecutor
+
+import numpy as np
+import cv2
+import imageio.v2 as imageio
+from tqdm import tqdm
+import math
+
+# Enable OpenEXR support in OpenCV. NOTE(review): set after `import cv2`; confirm it is still honoured.
+os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
+
+
+def prepare_scene_args(scene, input_root, output_root):
+    """
+    Build the per-frame argument tuples for one scene and create its output folders.
+
+    Args:
+        scene (str): Scene directory name.
+        input_root (str): Root directory for input data.
+        output_root (str): Root directory for output data.
+
+    Returns:
+        list or None: One (frame, camera_angle_x, rgb_dir, depth_dir, out_seq_dir)
+        tuple per entry of the JSON "frames" list, or None when the scene has no
+        render/transforms.json file.
+    """
+    render_dir = osp.join(input_root, scene, "render")
+    pose_file = osp.join(render_dir, "transforms.json")
+    out_seq_dir = osp.join(output_root, scene)
+
+    # Without the transforms file there is nothing to process for this scene.
+    if not osp.exists(pose_file):
+        print(f"Pose file not found: {pose_file}")
+        return None
+
+    # Load metadata from JSON
+    with open(pose_file, "r") as fp:
+        meta = json.load(fp)
+
+    fov_x = float(meta["camera_angle_x"])
+
+    # Output layout: one rgb, depth and cam folder per scene.
+    for leaf in ("rgb", "depth", "cam"):
+        os.makedirs(osp.join(out_seq_dir, leaf), exist_ok=True)
+
+    # Every frame shares the same camera angle and the same input/output folders.
+    rgb_dir = osp.join(render_dir, "images")
+    depth_dir = osp.join(render_dir, "depths")
+    frame_args = []
+    for frame in meta.get("frames", []):
+        frame_args.append((frame, fov_x, rgb_dir, depth_dir, out_seq_dir))
+
+    return frame_args
+
+
+def process_frame(args):
+    """
+    Convert one rendered frame into the rgb/depth/cam output files.
+
+      - Pixels whose alpha is exactly 0 are painted white; the alpha channel is dropped.
+      - Columns 1 and 2 of the transform matrix are negated.
+      - Intrinsics use one focal length (from camera_angle_x) and the image centre.
+      - Depth is the last channel of the EXR file, with values >= 65504 reset to 0.
+
+    Args:
+        args (tuple): A tuple containing:
+            - frame (dict): Frame metadata; "file_path" and "transform_matrix" are read.
+            - camera_angle_x (float): Camera field-of-view.
+            - rgb_dir (str): Directory containing RGB images.
+            - depth_dir (str): Directory containing depth maps.
+            - out_seq_dir (str): Output directory for the processed scene.
+
+    Returns:
+        None. Nothing is written when all three outputs already exist or when the
+        depth file cannot be read.
+    """
+    frame, camera_angle_x, rgb_dir, depth_dir, out_seq_dir = args
+
+    # Derive the base name from the frame's file path
+    stem = osp.basename(frame["file_path"])
+
+    # Input and output file paths
+    image_path = osp.join(rgb_dir, stem + ".png")
+    depth_path = osp.join(depth_dir, stem + "_depth.exr")
+    dst_rgb = osp.join(out_seq_dir, "rgb", stem + ".png")
+    dst_depth = osp.join(out_seq_dir, "depth", stem + ".npy")
+    dst_cam = osp.join(out_seq_dir, "cam", stem + ".npz")
+
+    # Resume support: a frame with all three outputs on disk is left alone.
+    if all(osp.exists(p) for p in (dst_rgb, dst_depth, dst_cam)):
+        return
+
+    # Read image using imageio
+    img = imageio.imread(image_path)
+    if img.shape[-1] == 4:
+        # Hard mask, not a blend: only fully transparent pixels become white.
+        transparent = img[..., 3:4] == 0
+        colour = img[..., :3]
+        img = np.where(transparent, np.full_like(colour, 255), colour)
+    else:
+        img = img[..., :3]
+
+    H, W, _ = img.shape
+
+    # Process the camera pose
+    pose = np.array(frame["transform_matrix"], dtype=np.float32)
+    pose[:, 1:3] *= -1  # Invert Y and Z axes if necessary
+
+    # Focal length from camera_angle_x and the image width; the principal point
+    # is placed at the image centre.
+    focal = 0.5 * W / np.tan(0.5 * camera_angle_x)
+    intrinsics = np.array(
+        [[focal, 0, W / 2], [0, focal, H / 2], [0, 0, 1]], dtype=np.float32
+    )
+
+    # OpenCV reads the EXR; None means the file is missing or unreadable.
+    depth = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED)
+    if depth is None:
+        print(f"Warning: Depth file not found or failed to read: {depth_path}")
+        return
+
+    # Use the last channel of the depth data and convert to float32
+    depth = depth[..., -1].astype(np.float32)
+    depth[depth >= 65504.0] = 0.0  # Set invalid depth values to 0
+
+    # Save the processed outputs
+    imageio.imwrite(dst_rgb, img.astype(np.uint8))
+    np.save(dst_depth, depth)
+    np.savez_compressed(dst_cam, intrinsics=intrinsics, pose=pose)
+
+
+def process_scene(frame_args):
+    """
+    Run process_frame on every frame tuple of one scene, in order.
+
+    Args:
+        frame_args (list): Per-frame argument tuples for the scene;
+            None is accepted and ignored.
+    """
+    # None stands for a scene whose preparation returned nothing.
+    if frame_args is not None:
+        for one_frame in frame_args:
+            process_frame(one_frame)
+
+
+def main():
+    """Parse the CLI flags, prepare every scene, then process the scenes in a process pool."""
+    from functools import partial
+
+    parser = argparse.ArgumentParser(
+        description="Preprocess scene data by extracting RGB images, depth maps, and camera parameters."
+    )
+    parser.add_argument(
+        "--input_dir",
+        type=str,
+        required=True,
+        help="Path to the root input directory containing scene data.",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        required=True,
+        help="Path to the directory where processed data will be saved.",
+    )
+    parser.add_argument(
+        "--max_workers",
+        type=int,
+        default=None,
+        help="Maximum number of worker processes. Defaults to the number of CPU cores.",
+    )
+    args = parser.parse_args()
+    input_root = args.input_dir
+    output_root = args.output_dir
+    # Ensure the output root directory exists
+    os.makedirs(output_root, exist_ok=True)
+
+    # List all scene directories in the input root
+    scenes = sorted(
+        [d for d in os.listdir(input_root) if osp.isdir(osp.join(input_root, d))]
+    )
+
+    # Determine the number of workers to use
+    max_workers = args.max_workers if args.max_workers is not None else os.cpu_count() or 1
+
+    # ProcessPoolExecutor pickles the callable it is given and a lambda cannot be
+    # pickled; a partial over the module-level function can.
+    prepare = partial(prepare_scene_args, input_root=input_root, output_root=output_root)
+
+    # Prepare processing arguments for each scene in parallel
+    with ProcessPoolExecutor(max_workers=max_workers) as executor:
+        scene_args_list = list(
+            tqdm(
+                executor.map(prepare, scenes),
+                total=len(scenes),
+                desc="Preparing scenes",
+            )
+        )
+
+    # Filter out scenes where preparation failed
+    scene_frame_args = [fa for fa in scene_args_list if fa is not None]
+
+    # Process each scene in parallel
+    with ProcessPoolExecutor(max_workers=max_workers) as executor:
+        progress = tqdm(
+            executor.map(process_scene, scene_frame_args),
+            total=len(scene_frame_args),
+            desc="Processing scenes",
+        )
+        list(progress)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_point_odyssey.py b/extern/CUT3R/datasets_preprocess/preprocess_point_odyssey.py
new file mode 100644
index 0000000000000000000000000000000000000000..b665d31f87c351862e10b99230a35ea049fa65f9
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_point_odyssey.py
@@ -0,0 +1,171 @@
+#!/usr/bin/env python3
+"""
+Preprocess Script for Point Odyssey Dataset
+
+This script processes the Point Odyssey dataset by:
+  - Copying RGB images.
+  - Converting 16-bit depth images to float32 depth maps (raw value / 65535 * 1000).
+  - Inverting camera extrinsic matrices to obtain poses.
+  - Saving intrinsics and computed poses in a structured output directory.
+
+The dataset is expected to have subdirectories for each split (e.g., train, test, val),
+with each split containing multiple sequence directories. Each sequence directory must
+contain the following:
+  - An 'rgbs' folder with .jpg images.
+  - A 'depths' folder with .png depth images.
+  - An 'anno.npz' file with 'intrinsics' and 'extrinsics' arrays.
+
+Usage:
+    python preprocess_point_odyssey.py --input_dir /path/to/input_dataset --output_dir /path/to/output_dataset
+"""
+
+import os
+import argparse
+import shutil
+import numpy as np
+import cv2
+from tqdm import tqdm
+
+
+def process_sequence(seq_dir, out_seq_dir):
+    """
+    Process a single Point Odyssey sequence.
+
+    The sequence layout is validated, the camera annotations are loaded and
+    every frame is exported: the RGB image is copied, the 16-bit depth image
+    is rescaled to float32, and the intrinsics plus the inverted extrinsic
+    matrix are written to a per-frame .npz file.
+
+    Args:
+        seq_dir (str): Directory of the sequence (should contain 'rgbs', 'depths', and 'anno.npz').
+        out_seq_dir (str): Output directory where processed files will be saved.
+
+    Raises:
+        FileNotFoundError: If 'rgbs', 'depths' or 'anno.npz' is missing.
+        ValueError: If the annotation and file counts differ, a filename index
+            does not match its position, an inverted pose contains inf/NaN, or
+            a depth image cannot be decoded.
+    """
+    # Define input subdirectories and annotation file
+    img_dir = os.path.join(seq_dir, "rgbs")
+    depth_dir = os.path.join(seq_dir, "depths")
+    cam_file = os.path.join(seq_dir, "anno.npz")
+
+    # Ensure all necessary files/folders exist
+    if not all(os.path.exists(p) for p in (img_dir, depth_dir, cam_file)):
+        raise FileNotFoundError(f"Missing required data in {seq_dir}")
+
+    # Create output subdirectories for images, depth maps, and camera parameters
+    out_img_dir = os.path.join(out_seq_dir, "rgb")
+    out_depth_dir = os.path.join(out_seq_dir, "depth")
+    out_cam_dir = os.path.join(out_seq_dir, "cam")
+    os.makedirs(out_img_dir, exist_ok=True)
+    os.makedirs(out_depth_dir, exist_ok=True)
+    os.makedirs(out_cam_dir, exist_ok=True)
+
+    # Load camera annotations; the context manager closes the archive handle
+    # once both arrays have been materialised.
+    with np.load(cam_file) as annotations:
+        cam_ints = annotations["intrinsics"].astype(np.float32)
+        cam_exts = annotations["extrinsics"].astype(np.float32)
+
+    # List and sort image and depth filenames
+    rgbs = sorted(f for f in os.listdir(img_dir) if f.endswith(".jpg"))
+    depths = sorted(f for f in os.listdir(depth_dir) if f.endswith(".png"))
+
+    # Ensure that the number of intrinsics, extrinsics, RGB images, and depth images match
+    if not (len(cam_ints) == len(cam_exts) == len(rgbs) == len(depths)):
+        raise ValueError(
+            f"Mismatch in sequence {seq_dir}: "
+            f"{len(cam_ints)} intrinsics, {len(cam_exts)} extrinsics, {len(rgbs)} images, {len(depths)} depths."
+        )
+
+    # Skip sequence if it has already been processed. The camera file is the
+    # last artifact written for each frame, so counting those (rather than the
+    # RGB copies, which are written first) avoids treating a run that was
+    # interrupted mid-frame as complete.
+    if len(os.listdir(out_cam_dir)) == len(rgbs):
+        return
+
+    # Process each frame in the sequence
+    for i in tqdm(range(len(cam_exts)), desc="Processing frames", leave=False):
+        # Extract frame index from filenames (text after the last underscore)
+        basename_img = rgbs[i].split(".")[0].split("_")[-1]
+        basename_depth = depths[i].split(".")[0].split("_")[-1]
+        if int(basename_img) != i or int(basename_depth) != i:
+            raise ValueError(
+                f"Frame index mismatch in sequence {seq_dir} for frame {i}"
+            )
+
+        img_path = os.path.join(img_dir, rgbs[i])
+        depth_path = os.path.join(depth_dir, depths[i])
+
+        # Retrieve intrinsics and compute camera pose by inverting the extrinsic matrix
+        intrins = cam_ints[i]
+        pose = np.linalg.inv(cam_exts[i])
+        if np.any(np.isinf(pose)) or np.any(np.isnan(pose)):
+            raise ValueError(
+                f"Invalid pose computed from extrinsics for frame {i} in {seq_dir}"
+            )
+
+        # Read and process depth image; cv2.imread signals failure by
+        # returning None instead of raising, so check explicitly.
+        depth_16bit = cv2.imread(depth_path, cv2.IMREAD_ANYDEPTH)
+        if depth_16bit is None:
+            raise ValueError(f"Could not read depth image {depth_path}")
+        depth = depth_16bit.astype(np.float32) / 65535.0 * 1000.0
+
+        # Save processed files: copy the RGB image and save depth and camera parameters
+        basename = basename_img
+        shutil.copyfile(img_path, os.path.join(out_img_dir, basename + ".jpg"))
+        np.save(os.path.join(out_depth_dir, basename + ".npy"), depth)
+        np.savez(
+            os.path.join(out_cam_dir, basename + ".npz"), intrinsics=intrins, pose=pose
+        )
+
+
+def process_split(split_dir, out_split_dir):
+    """
+    Run process_sequence on every sub-directory of one data split.
+
+    Sub-directories are visited in sorted name order and each one is written
+    to a directory of the same name below ``out_split_dir``.
+
+    Args:
+        split_dir (str): Directory for the split.
+        out_split_dir (str): Output directory for the processed split.
+    """
+    seq_names = [
+        entry
+        for entry in os.listdir(split_dir)
+        if os.path.isdir(os.path.join(split_dir, entry))
+    ]
+    seq_names.sort()
+
+    split_name = os.path.basename(split_dir)
+    progress = tqdm(seq_names, desc=f"Processing sequences in {split_name}")
+    for name in progress:
+        process_sequence(
+            os.path.join(split_dir, name), os.path.join(out_split_dir, name)
+        )
+
+
+def main():
+    """Parse the command line and preprocess each dataset split that exists on disk."""
+    cli = argparse.ArgumentParser(
+        description="Preprocess Point Odyssey dataset by processing images, depth maps, and camera parameters."
+    )
+    cli.add_argument(
+        "--input_dir",
+        required=True,
+        type=str,
+        help="Path to the root input dataset directory.",
+    )
+    cli.add_argument(
+        "--output_dir",
+        required=True,
+        type=str,
+        help="Path to the root output directory where processed data will be stored.",
+    )
+    opts = cli.parse_args()
+
+    # Expected dataset splits; a missing split only produces a warning.
+    for split in ("train", "test", "val"):
+        split_dir = os.path.join(opts.input_dir, split)
+        out_split_dir = os.path.join(opts.output_dir, split)
+        if os.path.exists(split_dir):
+            os.makedirs(out_split_dir, exist_ok=True)
+            process_split(split_dir, out_split_dir)
+        else:
+            print(
+                f"Warning: Split directory {split_dir} does not exist. Skipping this split."
+            )
+
+
+# Script entry point: run main() only when the file is executed directly.
+if __name__ == "__main__":
+    main()
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_re10k.py b/extern/CUT3R/datasets_preprocess/preprocess_re10k.py
new file mode 100644
index 0000000000000000000000000000000000000000..95edecf07e314049a38f8450481861664b671223
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_re10k.py
@@ -0,0 +1,210 @@
+#!/usr/bin/env python3
+"""
+Usage:
+    python preprocess_re10k.py --root_dir /path/to/train \
+                             --info_dir /path/to/RealEstate10K/train \
+                             --out_dir /path/to/processed_re10k
+"""
+
+import os
+import shutil
+import argparse
+import numpy as np
+from PIL import Image
+from tqdm import tqdm
+from concurrent.futures import ProcessPoolExecutor, as_completed
+
+
+def build_intrinsics(intrinsics_array, image_size):
+    """
+    Assemble a 3x3 pinhole intrinsics matrix from relative camera parameters.
+
+    The first four entries of ``intrinsics_array`` are read as
+    (fx_rel, fy_rel, cx_rel, cy_rel); the x components are multiplied by the
+    image width and the y components by the image height.
+
+    Args:
+        intrinsics_array (np.ndarray): Sequence whose first four values are the
+                                       relative focal lengths and principal point.
+        image_size (tuple): The (width, height) of the image.
+
+    Returns:
+        np.ndarray: A 3x3 float64 matrix [[fx, 0, cx], [0, fy, cy], [0, 0, 1]].
+    """
+    img_w, img_h = image_size
+    rel_fx, rel_fy, rel_cx, rel_cy = intrinsics_array[:4]
+
+    return np.array(
+        [
+            [rel_fx * img_w, 0.0, rel_cx * img_w],
+            [0.0, rel_fy * img_h, rel_cy * img_h],
+            [0.0, 0.0, 1.0],
+        ],
+        dtype=np.float64,
+    )
+
+
+def compute_pose(extrinsics_array):
+    """
+    Invert the homogeneous transform described by a flat 12-element array.
+
+    The input is reshaped to (3, 4), the row [0, 0, 0, 1] is appended to make
+    it 4x4, and the matrix inverse is returned.
+
+    Args:
+        extrinsics_array (np.ndarray): A 12-element array holding a 3x4
+                                       transform in row-major order.
+
+    Returns:
+        np.ndarray: The 4x4 inverse of the extended extrinsic matrix.
+    """
+    top_rows = extrinsics_array.reshape(3, 4)
+    homogeneous = np.concatenate([top_rows, [[0, 0, 0, 1]]], axis=0)
+    return np.linalg.inv(homogeneous)
+
+
+def process_frame(task):
+    """
+    Process a single frame:
+      - Reads the timestamp, intrinsics, and extrinsics.
+      - Copies the image to the output directory.
+      - Creates a .npz file containing camera intrinsics and the computed pose.
+
+    Args:
+        task (tuple): A tuple that contains:
+          (seq_dir, out_rgb_dir, out_cam_dir, raw_line).
+
+    Returns:
+        str or None:
+            A string with an error message if something fails; otherwise None on success
+            (including the case where the output camera file already exists).
+    """
+    seq_dir, out_rgb_dir, out_cam_dir, raw_line = task
+
+    # Bound before the try block so the error message below can always be
+    # formatted, even when parsing the timestamp itself is what failed.
+    timestamp = "unknown"
+
+    try:
+        # Unpack the raw metadata line
+        # Format (assuming): [timestamp, fx_rel, fy_rel, cx_rel, cy_rel, <2 unused>, extrinsics...]
+        # Adjust as needed based on the real format of 'raw_line'.
+        timestamp = int(raw_line[0])
+        intrinsics_array = raw_line[1:7]
+        extrinsics_array = raw_line[7:]
+
+        img_name = f"{timestamp}.png"
+        src_img_path = os.path.join(seq_dir, img_name)
+        if not os.path.isfile(src_img_path):
+            return f"Image file not found: {src_img_path}"
+
+        # Derive output paths
+        out_img_path = os.path.join(out_rgb_dir, img_name)
+        out_cam_path = os.path.join(out_cam_dir, f"{timestamp}.npz")
+
+        # Skip if the camera file already exists (it is written last, so its
+        # presence means the frame was fully exported before)
+        if os.path.isfile(out_cam_path):
+            return None
+
+        # Read only the image size; the file handle is closed on exit
+        with Image.open(src_img_path) as img:
+            width, height = img.size
+
+        # Build the intrinsics matrix (K)
+        K = build_intrinsics(intrinsics_array, (width, height))
+
+        # Compute the pose matrix
+        pose = compute_pose(extrinsics_array)
+
+        # Copy the image to the output directory
+        shutil.copyfile(src_img_path, out_img_path)
+
+        # Save intrinsics and pose
+        np.savez(out_cam_path, intrinsics=K, pose=pose)
+
+    except Exception as e:
+        # Deliberately broad: this runs in a worker process and failures are
+        # reported back to the parent as a message rather than raised.
+        return f"Error processing frame for {seq_dir} at timestamp {timestamp}: {e}"
+
+    return None  # Success indicator
+
+
+def process_sequence(seq, root_dir, info_dir, out_dir):
+    """
+    Process a single sequence:
+      - Reads a metadata .txt file containing intrinsics and extrinsics for each frame.
+      - Prepares a list of tasks and runs process_frame on them in a process pool.
+
+    A missing or unreadable metadata file is reported through tqdm.write and
+    the sequence is skipped. Per-frame error messages returned by
+    process_frame are reported the same way.
+
+    Args:
+        seq (str): Name of the sequence.
+        root_dir (str): Directory where the original sequence images (e.g., .png) are stored.
+        info_dir (str): Directory containing the .txt file with camera metadata for this sequence.
+        out_dir (str): Output directory where processed frames will be stored.
+    """
+    seq_dir = os.path.join(root_dir, seq)
+    scene_info_path = os.path.join(info_dir, f"{seq}.txt")
+
+    if not os.path.isfile(scene_info_path):
+        tqdm.write(f"Metadata file not found for sequence {seq} - skipping.")
+        return
+
+    # Load scene information
+    try:
+        # skiprows=1 if there's a header line in the .txt, adjust as needed.
+        # ndmin=2 keeps a single-frame file as one row; without it loadtxt
+        # returns a 1-D array and iterating it would yield scalars, not rows.
+        scene_info = np.loadtxt(
+            scene_info_path, delimiter=" ", dtype=np.float64, skiprows=1, ndmin=2
+        )
+    except Exception as e:
+        tqdm.write(f"Error reading scene info for {seq}: {e}")
+        return
+
+    # Create output subdirectories
+    out_seq_dir = os.path.join(out_dir, seq)
+    out_rgb_dir = os.path.join(out_seq_dir, "rgb")
+    out_cam_dir = os.path.join(out_seq_dir, "cam")
+    os.makedirs(out_rgb_dir, exist_ok=True)
+    os.makedirs(out_cam_dir, exist_ok=True)
+
+    # Nothing to do for a metadata file without frame rows; avoid spawning
+    # a worker pool for it.
+    if scene_info.size == 0:
+        return
+
+    # Build tasks
+    tasks = [(seq_dir, out_rgb_dir, out_cam_dir, line) for line in scene_info]
+
+    # Process frames in parallel
+    with ProcessPoolExecutor(max_workers=os.cpu_count() // 2 or 1) as executor:
+        futures = [executor.submit(process_frame, t) for t in tasks]
+        for future in as_completed(futures):
+            error_msg = future.result()
+            if error_msg:
+                tqdm.write(error_msg)
+
+
+def main():
+    """Parse the command line and preprocess every sequence folder below --root_dir."""
+    cli = argparse.ArgumentParser(
+        description="Process video frames and associated camera metadata."
+    )
+    cli.add_argument(
+        "--root_dir",
+        required=True,
+        help="Directory containing sequence folders with .png images.",
+    )
+    cli.add_argument(
+        "--info_dir", required=True, help="Directory containing metadata .txt files."
+    )
+    cli.add_argument(
+        "--out_dir", required=True, help="Output directory for processed data."
+    )
+    opts = cli.parse_args()
+
+    root = opts.root_dir
+    if not os.path.isdir(root):
+        raise FileNotFoundError(f"Root directory not found: {root}")
+
+    # Each sub-directory of the root is treated as one sequence
+    seq_names = []
+    for entry in os.listdir(root):
+        if os.path.isdir(os.path.join(root, entry)):
+            seq_names.append(entry)
+    if len(seq_names) == 0:
+        raise ValueError(f"No sequence folders found in {root}.")
+
+    for name in tqdm(seq_names, desc="Sequences"):
+        process_sequence(name, root, opts.info_dir, opts.out_dir)
+
+
+# Script entry point: run main() only when the file is executed directly.
+if __name__ == "__main__":
+    main()
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_scannet.py b/extern/CUT3R/datasets_preprocess/preprocess_scannet.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd8af83640975e39d0bdddd26d41c42be3934a99
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_scannet.py
@@ -0,0 +1,91 @@
+import argparse
+import random
+import gzip
+import json
+import os
+import os.path as osp
+
+import torch
+import PIL.Image
+from PIL import Image
+import numpy as np
+import cv2
+import multiprocessing
+from tqdm import tqdm
+import matplotlib.pyplot as plt
+import shutil
+import path_to_root  # noqa
+import datasets_preprocess.utils.cropping as cropping  # noqa
+
+
+def get_parser():
+    """Return the argument parser with the optional --scannet_dir and --output_dir paths."""
+    cli = argparse.ArgumentParser()
+    path_defaults = (
+        ("--scannet_dir", "data/data_scannet"),
+        ("--output_dir", "data/dust3r_data/processed_scannet"),
+    )
+    for flag, default in path_defaults:
+        cli.add_argument(flag, default=default)
+    return cli
+
+
+def process_scene(args):
+    """
+    Export one ScanNet scene: resized RGB, depth and per-frame camera files.
+
+    Frames are expected to be named ``{i}.jpg`` / ``{i}.png`` / ``{i}.txt``
+    for i in 0..N-1 inside the scene's 'color', 'depth' and 'pose' folders.
+    Each RGB image is resized to the resolution of its depth image, and the
+    depth intrinsics are stored with the pose. The scene is skipped entirely
+    when either intrinsics matrix contains non-finite values; single frames
+    with a non-finite pose are skipped.
+
+    Args:
+        args (tuple): (rootdir, outdir, split, scene), packed into one
+            argument so the function can be used with ``Pool.map``.
+
+    Raises:
+        ValueError: If the 'color', 'depth' and 'pose' folders do not hold
+            the same number of entries.
+        FileNotFoundError: If a depth image cannot be read.
+    """
+    rootdir, outdir, split, scene = args
+    frame_dir = osp.join(rootdir, split, scene)
+    rgb_dir = osp.join(frame_dir, "color")
+    depth_dir = osp.join(frame_dir, "depth")
+    pose_dir = osp.join(frame_dir, "pose")
+    depth_intrinsic = np.loadtxt(
+        osp.join(frame_dir, "intrinsic", "intrinsic_depth.txt")
+    )[:3, :3].astype(np.float32)
+    # The color intrinsics are only validated here; the saved camera files
+    # use the depth intrinsics because RGB is resized to the depth resolution.
+    color_intrinsic = np.loadtxt(
+        osp.join(frame_dir, "intrinsic", "intrinsic_color.txt")
+    )[:3, :3].astype(np.float32)
+    if not np.isfinite(depth_intrinsic).all() or not np.isfinite(color_intrinsic).all():
+        return
+    os.makedirs(osp.join(outdir, split, scene), exist_ok=True)
+    frame_num = len(os.listdir(rgb_dir))
+    # Explicit check instead of `assert`, which is stripped under `python -O`
+    if not (frame_num == len(os.listdir(depth_dir)) == len(os.listdir(pose_dir))):
+        raise ValueError(
+            f"Frame count mismatch between color/depth/pose in {frame_dir}"
+        )
+    out_rgb_dir = osp.join(outdir, split, scene, "color")
+    out_depth_dir = osp.join(outdir, split, scene, "depth")
+    out_cam_dir = osp.join(outdir, split, scene, "cam")
+
+    os.makedirs(out_rgb_dir, exist_ok=True)
+    os.makedirs(out_depth_dir, exist_ok=True)
+    os.makedirs(out_cam_dir, exist_ok=True)
+    for i in tqdm(range(frame_num)):
+        rgb_path = osp.join(rgb_dir, f"{i}.jpg")
+        depth_path = osp.join(depth_dir, f"{i}.png")
+        pose_path = osp.join(pose_dir, f"{i}.txt")
+
+        # Validate the pose first so frames that will be skipped do not pay
+        # for image decoding and resampling.
+        pose = np.loadtxt(pose_path).reshape(4, 4).astype(np.float32)
+        if not np.isfinite(pose).all():
+            continue
+
+        # cv2.imread returns None on failure instead of raising
+        depth = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED)
+        if depth is None:
+            raise FileNotFoundError(f"Could not read depth image {depth_path}")
+
+        # resize() returns a new, fully loaded image, so the source file can
+        # be closed as soon as the block exits.
+        with Image.open(rgb_path) as rgb_src:
+            rgb = rgb_src.resize(depth.shape[::-1], resample=Image.Resampling.LANCZOS)
+
+        out_rgb_path = osp.join(out_rgb_dir, f"{i:05d}.jpg")
+        out_depth_path = osp.join(out_depth_dir, f"{i:05d}.png")
+        out_cam_path = osp.join(out_cam_dir, f"{i:05d}.npz")
+        np.savez(out_cam_path, intrinsics=depth_intrinsic, pose=pose)
+        rgb.save(out_rgb_path)
+        cv2.imwrite(out_depth_path, depth)
+
+
+def main(rootdir, outdir):
+    """
+    Preprocess every scene of the 'scans_test' and 'scans_train' splits in parallel.
+
+    Args:
+        rootdir (str): Dataset root containing one folder per split.
+        outdir (str): Root directory for the processed output; created if missing.
+    """
+    os.makedirs(outdir, exist_ok=True)
+    splits = ["scans_test", "scans_train"]
+
+    # The context manager guarantees the worker processes are torn down even
+    # if listing a split or processing a scene raises.
+    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
+        for split in splits:
+            scenes = [
+                f
+                for f in os.listdir(os.path.join(rootdir, split))
+                if os.path.isdir(osp.join(rootdir, split, f))
+            ]
+            pool.map(
+                process_scene, [(rootdir, outdir, split, scene) for scene in scenes]
+            )
+        # Normal path: let the workers exit gracefully before leaving the block
+        pool.close()
+        pool.join()
+
+
+# Script entry point: parse the CLI arguments and hand both paths to main().
+if __name__ == "__main__":
+    parser = get_parser()
+    args = parser.parse_args()
+    main(args.scannet_dir, args.output_dir)
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_scannetpp.py b/extern/CUT3R/datasets_preprocess/preprocess_scannetpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf24c795c25621ffbb6daaf69474763aefc32863
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_scannetpp.py
@@ -0,0 +1,477 @@
+#!/usr/bin/env python3
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# Script to pre-process the scannet++ dataset.
+# Usage:
+# python3 datasets_preprocess/preprocess_scannetpp.py --scannetpp_dir /path/to/scannetpp --precomputed_pairs /path/to/scannetpp_pairs --pyopengl-platform egl
+# --------------------------------------------------------
+import os
+import argparse
+import os.path as osp
+import re
+from tqdm import tqdm
+import json
+from scipy.spatial.transform import Rotation
+import pyrender
+import trimesh
+import trimesh.exchange.ply
+import numpy as np
+import cv2
+import PIL.Image as Image
+
+from datasets_preprocess.utils.cropping import rescale_image_depthmap
+import dust3r.utils.geometry as geometry
+
+# Short aliases for the NumPy linear-algebra helpers.
+inv = np.linalg.inv
+norm = np.linalg.norm
+# Image filename patterns capturing the numeric frame id. The dot before the
+# extension is escaped so that it matches a literal "." only, not any character.
+REGEXPR_DSLR = re.compile(r"^DSC(?P<frameid>\d+)\.JPG$")
+REGEXPR_IPHONE = re.compile(r"frame_(?P<frameid>\d+)\.jpg$")
+
+# Set to a mode name (e.g. 'iou') to enable debug plotting; matplotlib is only
+# imported in that case.
+DEBUG_VIZ = None  # 'iou'
+if DEBUG_VIZ is not None:
+    import matplotlib.pyplot as plt  # noqa
+
+
+# Homogeneous transform that negates the Y and Z axes; it is right-multiplied
+# onto camera-to-world poses before they are handed to the renderer.
+OPENGL_TO_OPENCV = np.float32(
+    [[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, 1]]
+)
+
+
+def get_parser():
+    """
+    Build the command-line parser for the ScanNet++ preprocessing script.
+
+    ``--scannetpp_dir`` and ``--precomputed_pairs`` are mandatory; the output
+    directory, target resolution and PyOpenGL platform have defaults.
+    """
+    cli = argparse.ArgumentParser()
+    for required_flag in ("--scannetpp_dir", "--precomputed_pairs"):
+        cli.add_argument(required_flag, required=True)
+    cli.add_argument("--output_dir", default="data/scannetpp_processed")
+    cli.add_argument(
+        "--target_resolution", type=int, default=920, help="images resolution"
+    )
+    cli.add_argument(
+        "--pyopengl-platform", default="", type=str, help="PyOpenGL env variable"
+    )
+    return cli
+
+
+def pose_from_qwxyz_txyz(elems):
+    """
+    Convert (qw, qx, qy, qz, tx, ty, tz) into an inverted 4x4 pose matrix.
+
+    The seven values are parsed as floats, assembled into a homogeneous
+    transform (rotation from the quaternion, translation from t) and that
+    transform is inverted.
+
+    Args:
+        elems: Iterable of exactly seven numbers or numeric strings.
+
+    Returns:
+        np.ndarray: The 4x4 inverse of the assembled transform (cam2world).
+    """
+    qw, qx, qy, qz, tx, ty, tz = (float(e) for e in elems)
+    world_to_cam = np.identity(4)
+    world_to_cam[:3, 3] = (tx, ty, tz)
+    # scipy expects the scalar component last
+    world_to_cam[:3, :3] = Rotation.from_quat((qx, qy, qz, qw)).as_matrix()
+    return np.linalg.inv(world_to_cam)
+
+
+def get_frame_number(name, cam_type="dslr"):
+    """
+    Extract the numeric frame id (as a string) from an image filename.
+
+    Args:
+        name (str): Image filename, matched against the pattern for ``cam_type``.
+        cam_type (str): Either "dslr" or "iphone".
+
+    Returns:
+        str: The digits captured by the pattern's ``frameid`` group.
+
+    Raises:
+        NotImplementedError: If ``cam_type`` is not "dslr" or "iphone".
+        ValueError: If ``name`` does not match the pattern for ``cam_type``.
+    """
+    if cam_type == "dslr":
+        regex_expr = REGEXPR_DSLR
+    elif cam_type == "iphone":
+        regex_expr = REGEXPR_IPHONE
+    else:
+        raise NotImplementedError(f"wrong {cam_type=} for get_frame_number")
+    matches = regex_expr.match(name)
+    if matches is None:
+        # Without this check the subscript below fails with an opaque
+        # "'NoneType' object is not subscriptable".
+        raise ValueError(f"cannot parse frame number from {name!r} ({cam_type=})")
+    return matches["frameid"]
+
+
+def load_sfm(sfm_dir, cam_type="dslr"):
+    """
+    Parse the text reconstruction files found in ``sfm_dir``.
+
+    Reads 'cameras.txt', 'images.txt' and 'points3D.txt' and returns, in order:
+      - a mapping image name -> image id,
+      - a mapping image id -> dict with keys 'intrinsics', 'path', 'frame_id',
+        'cam_to_world' and 'sparse_pts2d',
+      - a mapping 3D point id -> (x, y, z),
+      - a mapping image id -> list of (3D point id, 2D point index).
+
+    Args:
+        sfm_dir (str): Directory holding the three text files.
+        cam_type (str): Forwarded to get_frame_number ("dslr" or "iphone").
+    """
+    # cameras: first three lines are skipped as header
+    with open(osp.join(sfm_dir, "cameras.txt"), "r") as fh:
+        camera_lines = fh.read().splitlines()[3:]
+
+    intrinsics = {}
+    for line in tqdm(camera_lines, position=1, leave=False):
+        fields = line.split(" ")
+        intrinsics[int(fields[0])] = [fields[1], *(float(v) for v in fields[2:])]
+
+    # images: two lines per image (pose line, then 2D points line)
+    with open(os.path.join(sfm_dir, "images.txt"), "r") as fh:
+        image_lines = [
+            line for line in fh.read().splitlines() if not line.startswith("#")
+        ]
+
+    img_idx = {}
+    img_infos = {}
+    line_pairs = zip(image_lines[0::2], image_lines[1::2])
+    for pose_line, pts_line in tqdm(
+        line_pairs, total=len(image_lines) // 2, position=1, leave=False
+    ):
+        pose_fields = pose_line.split(" ")
+        pts_fields = pts_line.split(" ")
+
+        idx = pose_fields[0]
+        img_name = pose_fields[-1]
+        assert img_name not in img_idx, "duplicate db image: " + img_name
+        img_idx[img_name] = idx  # register image name
+
+        # the points line is a flat list of (x, y, point3D id) triplets;
+        # entries with id "-1" are dropped
+        sparse_pts = {}
+        for pid, x, y in zip(pts_fields[2::3], pts_fields[0::3], pts_fields[1::3]):
+            if pid != "-1":
+                key = int(pid)
+                sparse_pts[key] = (float(x), float(y))
+
+        img_infos[idx] = {
+            "intrinsics": intrinsics[int(pose_fields[-2])],
+            "path": img_name,
+            "frame_id": get_frame_number(img_name, cam_type),
+            "cam_to_world": pose_from_qwxyz_txyz(pose_fields[1:-2]),
+            "sparse_pts2d": sparse_pts,
+        }
+
+    # 3D points and their observations
+    with open(os.path.join(sfm_dir, "points3D.txt"), "r") as fh:
+        point_lines = [
+            line for line in fh.read().splitlines() if not line.startswith("#")
+        ]
+
+    points3D = {}
+    observations = {idx: [] for idx in img_infos}
+    for line in tqdm(point_lines, position=1, leave=False):
+        fields = line.split()
+        point_3d_idx = int(fields[0])
+        points3D[point_3d_idx] = tuple(float(v) for v in fields[1:4])
+        # everything from the 9th field on is a list of
+        # (image id, 2D point index) pairs; empty when there is no track
+        track = fields[8:]
+        for idx, point_2d_idx in zip(track[0::2], track[1::2]):
+            observations[idx].append((point_3d_idx, int(point_2d_idx)))
+
+    return img_idx, img_infos, points3D, observations
+
+
+def subsample_img_infos(img_infos, num_images, allowed_name_subset=None):
+    """
+    Keep at most ``num_images`` entries of ``img_infos``.
+
+    Entries are first restricted to those whose 'path' is in
+    ``allowed_name_subset`` (when given). If more than ``num_images`` remain,
+    they are sorted by 'frame_id' and picked at evenly spaced positions.
+
+    Args:
+        img_infos (dict): Mapping id -> info dict with 'path' and 'frame_id'.
+        num_images (int): Maximum number of entries to keep.
+        allowed_name_subset: Optional container of permitted 'path' values.
+
+    Returns:
+        dict: The selected id -> info entries.
+    """
+    candidates = list(img_infos.items())
+    if allowed_name_subset is not None:
+        candidates = [
+            pair for pair in candidates if pair[1]["path"] in allowed_name_subset
+        ]
+
+    if len(candidates) <= num_images:
+        return dict(candidates)
+
+    candidates.sort(key=lambda pair: pair[1]["frame_id"])
+    positions = np.linspace(0, len(candidates) - 1, num_images)
+    picked = np.round(positions).astype(int).tolist()
+    return {candidates[p][0]: candidates[p][1] for p in picked}
+
+
+def undistort_images(intrinsics, rgb, mask):
+    """
+    Undistort an RGB image and its mask with OpenCV remapping.
+
+    Args:
+        intrinsics (list): Camera description indexed as
+            [camera_type, width, height, fx, fy, cx, cy, *distortion].
+        rgb: Image array passed to cv2.remap (reflected at the borders).
+        mask: Mask array passed to cv2.remap (borders filled with 255).
+
+    Returns:
+        tuple: (width, height, new_K, undistorted_image, undistorted_mask),
+        where new_K is the camera matrix used for the undistorted output.
+    """
+    camera_type = intrinsics[0]
+
+    width = int(intrinsics[1])
+    height = int(intrinsics[2])
+    fx = intrinsics[3]
+    fy = intrinsics[4]
+    cx = intrinsics[5]
+    cy = intrinsics[6]
+    distortion = np.array(intrinsics[7:])
+
+    # Assemble the 3x3 camera matrix from the scalar parameters
+    K = np.zeros([3, 3])
+    K[0, 0] = fx
+    K[0, 2] = cx
+    K[1, 1] = fy
+    K[1, 2] = cy
+    K[2, 2] = 1
+
+    # presumably converts between COLMAP and OpenCV principal-point
+    # conventions — confirm in dust3r.utils.geometry
+    K = geometry.colmap_to_opencv_intrinsics(K)
+    if camera_type == "OPENCV_FISHEYE":
+        # the fisheye model takes exactly four distortion coefficients
+        assert len(distortion) == 4
+
+        new_K = cv2.fisheye.estimateNewCameraMatrixForUndistortRectify(
+            K,
+            distortion,
+            (width, height),
+            np.eye(3),
+            balance=0.0,
+        )
+        # Make the cx and cy to be the center of the image
+        new_K[0, 2] = width / 2.0
+        new_K[1, 2] = height / 2.0
+
+        map1, map2 = cv2.fisheye.initUndistortRectifyMap(
+            K, distortion, np.eye(3), new_K, (width, height), cv2.CV_32FC1
+        )
+    else:
+        # every other camera model goes through the standard OpenCV path
+        new_K, _ = cv2.getOptimalNewCameraMatrix(
+            K, distortion, (width, height), 1, (width, height), True
+        )
+        map1, map2 = cv2.initUndistortRectifyMap(
+            K, distortion, np.eye(3), new_K, (width, height), cv2.CV_32FC1
+        )
+
+    undistorted_image = cv2.remap(
+        rgb,
+        map1,
+        map2,
+        interpolation=cv2.INTER_LINEAR,
+        borderMode=cv2.BORDER_REFLECT_101,
+    )
+    # pixels that fall outside the source mask are filled with 255
+    undistorted_mask = cv2.remap(
+        mask,
+        map1,
+        map2,
+        interpolation=cv2.INTER_LINEAR,
+        borderMode=cv2.BORDER_CONSTANT,
+        borderValue=255,
+    )
+    # NOTE(review): the converted K is not used after this line and new_K is
+    # returned exactly as OpenCV produced it — confirm whether new_K was meant
+    # to be converted here instead.
+    K = geometry.opencv_to_colmap_intrinsics(K)
+    return width, height, new_K, undistorted_image, undistorted_mask
+
+
+def process_scenes(root, pairsdir, output_dir, target_resolution):
+    """
+    Undistort, rescale and depth-render the selected images of every scene.
+
+    For each scene listed in ``<pairsdir>/scene_list.json`` that does not yet
+    have a ``scene_metadata.npz`` in its output folder, the selected DSLR and
+    iPhone images are undistorted, rescaled and saved as .jpg, a depth map is
+    rendered from the scene mesh and saved as a uint16 .png, and the per-scene
+    metadata is written. Finally the metadata of all scenes is merged into
+    ``<output_dir>/all_metadata.npz``.
+
+    Args:
+        root (str): Dataset root; scenes are read from ``<root>/data/<scene>``.
+        pairsdir (str): Directory with 'scene_list.json' and, per scene,
+            'selected_pairs.npz' (arrays 'selection' and 'pairs').
+        output_dir (str): Destination directory; created if missing.
+        target_resolution (int): Requested output width; the requested height
+            is ``target_resolution * 3 / 4``.
+    """
+    os.makedirs(output_dir, exist_ok=True)
+
+    # default values from
+    # https://github.com/scannetpp/scannetpp/blob/main/common/configs/render.yml
+    znear = 0.05
+    zfar = 20.0
+
+    listfile = osp.join(pairsdir, "scene_list.json")
+    with open(listfile, "r") as f:
+        scenes = json.load(f)
+
+    # for each of these, we will select some dslr images and some iphone images
+    # we will undistort them and render their depth
+    # A single renderer is reused for all scenes; its viewport is resized per image.
+    # NOTE(review): the renderer is never explicitly released in this function.
+    renderer = pyrender.OffscreenRenderer(0, 0)
+    for scene in tqdm(scenes, position=0, leave=True):
+        data_dir = os.path.join(root, "data", scene)
+        dir_dslr = os.path.join(data_dir, "dslr")
+        dir_iphone = os.path.join(data_dir, "iphone")
+        dir_scans = os.path.join(data_dir, "scans")
+
+        assert (
+            os.path.isdir(data_dir)
+            and os.path.isdir(dir_dslr)
+            and os.path.isdir(dir_iphone)
+            and os.path.isdir(dir_scans)
+        )
+
+        # The metadata file is written last for a scene, so its presence marks
+        # the scene as already processed.
+        output_dir_scene = os.path.join(output_dir, scene)
+        scene_metadata_path = osp.join(output_dir_scene, "scene_metadata.npz")
+        if osp.isfile(scene_metadata_path):
+            continue
+
+        pairs_dir_scene = os.path.join(pairsdir, scene)
+        pairs_dir_scene_selected_pairs = os.path.join(
+            pairs_dir_scene, "selected_pairs.npz"
+        )
+        assert osp.isfile(pairs_dir_scene_selected_pairs)
+        selected_npz = np.load(pairs_dir_scene_selected_pairs)
+        selection, pairs = selected_npz["selection"], selected_npz["pairs"]
+
+        # set up the output paths
+        output_dir_scene_rgb = os.path.join(output_dir_scene, "images")
+        output_dir_scene_depth = os.path.join(output_dir_scene, "depth")
+        os.makedirs(output_dir_scene_rgb, exist_ok=True)
+        os.makedirs(output_dir_scene_depth, exist_ok=True)
+
+        ply_path = os.path.join(dir_scans, "mesh_aligned_0.05.ply")
+
+        sfm_dir_dslr = os.path.join(dir_dslr, "colmap")
+        rgb_dir_dslr = os.path.join(dir_dslr, "resized_images")
+        mask_dir_dslr = os.path.join(dir_dslr, "resized_anon_masks")
+
+        sfm_dir_iphone = os.path.join(dir_iphone, "colmap")
+        rgb_dir_iphone = os.path.join(dir_iphone, "rgb")
+        mask_dir_iphone = os.path.join(dir_iphone, "rgb_masks")
+
+        # load the mesh
+        with open(ply_path, "rb") as f:
+            mesh_kwargs = trimesh.exchange.ply.load_ply(f)
+        mesh_scene = trimesh.Trimesh(**mesh_kwargs)
+
+        # read colmap reconstruction, we will only use the intrinsics and pose here
+        img_idx_dslr, img_infos_dslr, points3D_dslr, observations_dslr = load_sfm(
+            sfm_dir_dslr, cam_type="dslr"
+        )
+        dslr_paths = {
+            "in_colmap": sfm_dir_dslr,
+            "in_rgb": rgb_dir_dslr,
+            "in_mask": mask_dir_dslr,
+        }
+
+        img_idx_iphone, img_infos_iphone, points3D_iphone, observations_iphone = (
+            load_sfm(sfm_dir_iphone, cam_type="iphone")
+        )
+        iphone_paths = {
+            "in_colmap": sfm_dir_iphone,
+            "in_rgb": rgb_dir_iphone,
+            "in_mask": mask_dir_iphone,
+        }
+
+        # Build the render scene once; a camera node is added and removed per image.
+        mesh = pyrender.Mesh.from_trimesh(mesh_scene, smooth=False)
+        pyrender_scene = pyrender.Scene()
+        pyrender_scene.add(mesh)
+
+        # Selection entries carry no extension; split them by camera using the
+        # filename prefix and append the matching extension.
+        selection_dslr = [
+            imgname + ".JPG" for imgname in selection if imgname.startswith("DSC")
+        ]
+        selection_iphone = [
+            imgname + ".jpg" for imgname in selection if imgname.startswith("frame_")
+        ]
+
+        # resize the image to a more manageable size and render depth
+        for selection_cam, img_idx, img_infos, paths_data in [
+            (selection_dslr, img_idx_dslr, img_infos_dslr, dslr_paths),
+            (selection_iphone, img_idx_iphone, img_infos_iphone, iphone_paths),
+        ]:
+            rgb_dir = paths_data["in_rgb"]
+            mask_dir = paths_data["in_mask"]
+            for imgname in tqdm(selection_cam, position=1, leave=False):
+                imgidx = img_idx[imgname]
+                img_infos_idx = img_infos[imgidx]
+                rgb = np.array(Image.open(os.path.join(rgb_dir, img_infos_idx["path"])))
+                # the mask shares the image's base name but has a .png extension
+                mask = np.array(
+                    Image.open(
+                        os.path.join(mask_dir, img_infos_idx["path"][:-3] + "png")
+                    )
+                )
+
+                _, _, K, rgb, mask = undistort_images(
+                    img_infos_idx["intrinsics"], rgb, mask
+                )
+
+                # rescale_image_depthmap assumes opencv intrinsics
+                intrinsics = geometry.colmap_to_opencv_intrinsics(K)
+                image, mask, intrinsics = rescale_image_depthmap(
+                    rgb,
+                    mask,
+                    intrinsics,
+                    (target_resolution, target_resolution * 3.0 / 4),
+                )
+
+                W, H = image.size
+                intrinsics = geometry.opencv_to_colmap_intrinsics(intrinsics)
+
+                # update img_infos_idx in place so the metadata below stores
+                # the rescaled intrinsics
+                img_infos_idx["intrinsics"] = intrinsics
+                rgb_outpath = os.path.join(
+                    output_dir_scene_rgb, img_infos_idx["path"][:-3] + "jpg"
+                )
+                image.save(rgb_outpath)
+
+                depth_outpath = os.path.join(
+                    output_dir_scene_depth, img_infos_idx["path"][:-3] + "png"
+                )
+                # render depth image
+                renderer.viewport_width, renderer.viewport_height = W, H
+                fx, fy, cx, cy = (
+                    intrinsics[0, 0],
+                    intrinsics[1, 1],
+                    intrinsics[0, 2],
+                    intrinsics[1, 2],
+                )
+                camera = pyrender.camera.IntrinsicsCamera(
+                    fx, fy, cx, cy, znear=znear, zfar=zfar
+                )
+                camera_node = pyrender_scene.add(
+                    camera, pose=img_infos_idx["cam_to_world"] @ OPENGL_TO_OPENCV
+                )
+
+                depth = renderer.render(
+                    pyrender_scene, flags=pyrender.RenderFlags.DEPTH_ONLY
+                )
+                pyrender_scene.remove_node(camera_node)  # dont forget to remove camera
+
+                # stored as 16-bit integers after scaling by 1000
+                # (presumably metres -> millimetres — confirm)
+                depth = (depth * 1000).astype("uint16")
+                # invalidate depth from mask before saving
+                depth_mask = mask < 255
+                depth[depth_mask] = 0
+                Image.fromarray(depth).save(depth_outpath)
+
+        # Collect pose and (rescaled) intrinsics in the order of `selection`
+        trajectories = []
+        intrinsics = []
+        for imgname in selection:
+            if imgname.startswith("DSC"):
+                imgidx = img_idx_dslr[imgname + ".JPG"]
+                img_infos_idx = img_infos_dslr[imgidx]
+            elif imgname.startswith("frame_"):
+                imgidx = img_idx_iphone[imgname + ".jpg"]
+                img_infos_idx = img_infos_iphone[imgidx]
+            else:
+                raise ValueError("invalid image name")
+
+            intrinsics.append(img_infos_idx["intrinsics"])
+            trajectories.append(img_infos_idx["cam_to_world"])
+
+        intrinsics = np.stack(intrinsics, axis=0)
+        trajectories = np.stack(trajectories, axis=0)
+        # save metadata for this scene
+        np.savez(
+            scene_metadata_path,
+            trajectories=trajectories,
+            intrinsics=intrinsics,
+            images=selection,
+            pairs=pairs,
+        )
+
+        # drop the per-scene references before moving on to the next scene
+        del img_infos
+        del pyrender_scene
+
+    # concat all scene_metadata.npz into a single file
+    scene_data = {}
+    for scene_subdir in scenes:
+        scene_metadata_path = osp.join(output_dir, scene_subdir, "scene_metadata.npz")
+        with np.load(scene_metadata_path) as data:
+            trajectories = data["trajectories"]
+            intrinsics = data["intrinsics"]
+            images = data["images"]
+            pairs = data["pairs"]
+        scene_data[scene_subdir] = {
+            "trajectories": trajectories,
+            "intrinsics": intrinsics,
+            "images": images,
+            "pairs": pairs,
+        }
+
+    # Accumulators for the merged file; note that `scenes` is rebound here and
+    # from now on holds the scene names in merge order.
+    offset = 0
+    counts = []
+    scenes = []
+    sceneids = []
+    images = []
+    intrinsics = []
+    trajectories = []
+    pairs = []
+    for scene_idx, (scene_subdir, data) in enumerate(scene_data.items()):
+        num_imgs = data["images"].shape[0]
+        img_pairs = data["pairs"]
+
+        scenes.append(scene_subdir)
+        sceneids.extend([scene_idx] * num_imgs)
+
+        images.append(data["images"])
+
+        intrinsics.append(data["intrinsics"])
+        trajectories.append(data["trajectories"])
+
+        # offset pairs: the first two columns are shifted from per-scene image
+        # indices to indices into the concatenated image list
+        img_pairs[:, 0:2] += offset
+        pairs.append(img_pairs)
+        # counts holds the index of each scene's first image in the merged arrays
+        counts.append(offset)
+
+        offset += num_imgs
+
+    images = np.concatenate(images, axis=0)
+    intrinsics = np.concatenate(intrinsics, axis=0)
+    trajectories = np.concatenate(trajectories, axis=0)
+    pairs = np.concatenate(pairs, axis=0)
+    np.savez(
+        osp.join(output_dir, "all_metadata.npz"),
+        counts=counts,
+        scenes=scenes,
+        sceneids=sceneids,
+        images=images,
+        intrinsics=intrinsics,
+        trajectories=trajectories,
+        pairs=pairs,
+    )
+    print("all done")
+
+
+# Script entry point: parse the CLI arguments, optionally export the requested
+# PyOpenGL platform, then run the preprocessing.
+if __name__ == "__main__":
+    parser = get_parser()
+    args = parser.parse_args()
+    # NOTE(review): pyrender is imported at the top of this file, i.e. before
+    # this variable is set — confirm the platform choice is still honoured,
+    # or export PYOPENGL_PLATFORM in the shell instead.
+    if args.pyopengl_platform.strip():
+        os.environ["PYOPENGL_PLATFORM"] = args.pyopengl_platform
+    process_scenes(
+        args.scannetpp_dir,
+        args.precomputed_pairs,
+        args.output_dir,
+        args.target_resolution,
+    )
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_smartportraits.py b/extern/CUT3R/datasets_preprocess/preprocess_smartportraits.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bd7178fb05e83084b25d6e59ad692eb8e6975ec
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_smartportraits.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python3
+"""
+Preprocess Script for SmartPortraits Dataset
+
+This script processes each sequence in a specified input directory. Each sequence must contain:
+  - An "association.txt" file listing (timestamp_rgb, rgb_filename, timestamp_depth, depth_filename)
+  - Pairs of .png files (one for RGB and one for depth)
+
+The script copies each RGB .png file to an output "rgb" folder and converts each 16-bit depth
+image to a float32 .npy file in an output "depth" folder. It runs in parallel using
+ProcessPoolExecutor for faster performance on multi-core systems.
+
+Usage:
+    python preprocess_smartportraits.py \
+        --input_dir /path/to/processed_smartportraits1 \
+        --output_dir /path/to/processed_smartportraits
+"""
+
+import os
+import shutil
+import argparse
+import numpy as np
+import cv2
+from tqdm import tqdm
+from concurrent.futures import ProcessPoolExecutor, as_completed
+
+
+def process_pair(args):
+    """
+    Convert one (RGB, depth) pair of a sequence into its two output files.
+
+    The RGB file is copied unchanged to "<out_rgb_dir>/<index>.png". The depth
+    file is read with OpenCV, cast to float32, divided by 5000 and saved to
+    "<out_depth_dir>/<index>.npy", where <index> is the pair index zero-padded
+    to six digits. The pair is skipped when both output files already exist.
+
+    Args:
+        args (tuple): One packed argument (convenient for executor.submit) with:
+            - seq_dir (str): Path to the sequence directory.
+            - seq (str): Sequence name; only used in error messages.
+            - pair_index (int): Index of the pair, used to name the outputs.
+            - pair (tuple): (rgb_filename, depth_filename) relative to seq_dir.
+            - out_rgb_dir (str): Output directory for RGB images.
+            - out_depth_dir (str): Output directory for depth .npy files.
+
+    Returns:
+        None or str: None when the pair was written or skipped; otherwise a
+        message describing the failure (exceptions raised while processing
+        are caught and reported this way).
+    """
+    seq_dir, seq, pair_index, pair, out_rgb_dir, out_depth_dir = args
+    rgb_dst = os.path.join(out_rgb_dir, f"{pair_index:06d}.png")
+    depth_dst = os.path.join(out_depth_dir, f"{pair_index:06d}.npy")
+
+    # A previous run already produced both files for this pair.
+    if all(map(os.path.exists, (rgb_dst, depth_dst))):
+        return None
+
+    try:
+        rgb_path = os.path.join(seq_dir, pair[0])
+        depth_path = os.path.join(seq_dir, pair[1])
+
+        # Report missing inputs up front, RGB first.
+        if not os.path.isfile(rgb_path):
+            return f"RGB image not found: {rgb_path}"
+        if not os.path.isfile(depth_path):
+            return f"Depth image not found: {depth_path}"
+
+        # IMREAD_ANYDEPTH asks OpenCV not to reduce the depth image to 8 bits.
+        raw_depth = cv2.imread(depth_path, cv2.IMREAD_ANYDEPTH)
+        if raw_depth is None:
+            return f"Failed to read depth image: {depth_path}"
+
+        # Cast to float32 first, then scale by 1/5000 (depth_val / 5000).
+        depth_scaled = raw_depth.astype(np.float32) / 5000.0
+
+        # Write the outputs: RGB copy first, then the depth array.
+        shutil.copyfile(rgb_path, rgb_dst)
+        np.save(depth_dst, depth_scaled)
+    except Exception as e:
+        return f"Error processing pair {pair_index} in sequence '{seq}': {e}"
+
+    return None
+
+
+def process_sequence(seq, input_dir, output_dir):
+    """
+    Convert every (RGB, depth) pair listed in one sequence's association file.
+
+    Pairs are read from "<input_dir>/<seq>/association.txt" and handed to
+    process_pair in a process pool; results are written under
+    "<output_dir>/<seq>/rgb" and "<output_dir>/<seq>/depth". A sequence without
+    an association file is skipped with a message. Per-pair error messages are
+    printed through tqdm.write and do not stop the remaining pairs.
+
+    Args:
+        seq (str): Name of the sequence (subdirectory).
+        input_dir (str): Base input directory containing all sequences.
+        output_dir (str): Base output directory where processed data will be stored.
+    """
+    seq_dir = os.path.join(input_dir, seq)
+    assoc_file = os.path.join(seq_dir, "association.txt")
+    if not os.path.isfile(assoc_file):
+        tqdm.write(f"No association.txt found for sequence {seq}. Skipping.")
+        return
+
+    # Create "<output_dir>/<seq>/rgb" and "<output_dir>/<seq>/depth".
+    out_rgb_dir = os.path.join(output_dir, seq, "rgb")
+    out_depth_dir = os.path.join(output_dir, seq, "depth")
+    for out_dir in (out_rgb_dir, out_depth_dir):
+        os.makedirs(out_dir, exist_ok=True)
+
+    # Each usable line has at least four whitespace-separated fields:
+    #   <timestamp_rgb> <rgb_filename> <timestamp_depth> <depth_filename>
+    # Shorter lines (including blank ones) are ignored.
+    with open(assoc_file, "r") as f:
+        fields_per_line = (line.strip().split() for line in f)
+        pairs = [(fields[1], fields[3]) for fields in fields_per_line if len(fields) >= 4]
+
+    # One task per pair; its index among the parsed pairs names the outputs.
+    tasks = [
+        (seq_dir, seq, index, pair, out_rgb_dir, out_depth_dir)
+        for index, pair in enumerate(pairs)
+    ]
+
+    # Fan the pairs out over one worker process per CPU (at least one).
+    num_workers = max(1, os.cpu_count() or 1)
+    with ProcessPoolExecutor(max_workers=num_workers) as executor:
+        pending = [executor.submit(process_pair, task) for task in tasks]
+        progress = tqdm(
+            as_completed(pending), total=len(pending), desc=f"Processing sequence {seq}"
+        )
+        for future in progress:
+            error = future.result()
+            if error:
+                tqdm.write(error)
+
+
+def main():
+    """Parse the command line and preprocess every sequence under --input_dir.
+
+    Sequences are the immediate subdirectories of the input directory, handled
+    one after another in sorted order.
+
+    Raises:
+        ValueError: If the input directory does not exist or has no subdirectories.
+    """
+    parser = argparse.ArgumentParser(description="Preprocess SmartPortraits dataset.")
+    parser.add_argument(
+        "--input_dir",
+        required=True,
+        help="Path to the directory containing all sequences with association.txt files.",
+    )
+    parser.add_argument(
+        "--output_dir",
+        required=True,
+        help="Path to the directory where processed results will be saved.",
+    )
+    args = parser.parse_args()
+    if not os.path.isdir(args.input_dir):
+        raise ValueError(f"Input directory not found: {args.input_dir}")
+
+    # Every immediate subdirectory counts as a sequence.
+    entries = sorted(os.listdir(args.input_dir))
+    seqs = [d for d in entries if os.path.isdir(os.path.join(args.input_dir, d))]
+    if not seqs:
+        raise ValueError(f"No valid subdirectories found in {args.input_dir}")
+
+    for seq in tqdm(seqs, desc="Sequences"):
+        process_sequence(seq, args.input_dir, args.output_dir)
+
+
+if __name__ == "__main__":  # run main() only when executed as a script, not on import
+    main()
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_spring.py b/extern/CUT3R/datasets_preprocess/preprocess_spring.py
new file mode 100644
index 0000000000000000000000000000000000000000..0dd3e9c0ee5efb31ebb04acab95bb8fc4a94ee9e
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_spring.py
@@ -0,0 +1,318 @@
+#!/usr/bin/env python3
+"""
+Preprocessing Script for Spring Dataset
+
+This script:
+  - Processes each sequence (immediate subdirectory) of the given Spring dataset 'root_dir'.
+  - Reads RGB, disparity, optical flow files, and camera intrinsics/extrinsics.
+  - Converts disparity to depth, rescales images/flows, and writes processed results
+    (RGB, Depth, Cam intrinsics/poses, Forward Flow, Backward Flow) to 'out_dir'.
+
+Usage:
+    python preprocess_spring.py \
+        --root_dir /path/to/spring/train \
+        --out_dir /path/to/processed_spring \
+        --baseline 0.065 \
+        --output_size 960 540
+
+"""
+
+import os
+import argparse
+import numpy as np
+import cv2
+from PIL import Image
+import shutil
+from tqdm import tqdm
+from concurrent.futures import ProcessPoolExecutor, as_completed
+
+# Custom modules (adapt these imports to your actual module locations)
+import flow_IO
+import src.dust3r.datasets.utils.cropping as cropping
+
+
+def rescale_flow(flow, size):
+    """
+    Resample an optical flow field to a new resolution.
+
+    The flow map is resized with bilinear interpolation, and its two components
+    are multiplied by the width and height ratios so that displacements are
+    expressed in pixels of the new resolution.
+
+    Args:
+        flow (np.ndarray): Flow array of shape [H, W, 2].
+        size (tuple): Desired (width, height) for the resized flow.
+
+    Returns:
+        np.ndarray: Resized flow as float32, with rescaled vectors.
+    """
+    src_h, src_w = flow.shape[:2]
+    dst_w, dst_h = size
+
+    # cv2.resize takes the target size as (width, height).
+    resized = cv2.resize(flow.astype("float32"), (dst_w, dst_h), interpolation=cv2.INTER_LINEAR)
+    # Displacements are in pixels, so they scale with the resolution change.
+    resized[..., 0] *= dst_w / src_w  # channel 0 follows the width ratio
+    resized[..., 1] *= dst_h / src_h  # channel 1 follows the height ratio
+    return resized
+
+
+def get_depth(disparity, fx_baseline):
+    """
+    Turn a disparity map into depth via depth = (fx * baseline) / disparity.
+
+    Args:
+        disparity (np.ndarray): Disparity map.
+        fx_baseline (float): Product of the focal length (fx) and baseline.
+
+    Returns:
+        np.ndarray: float32 depth map; pixels with zero disparity stay at 0.
+    """
+    nonzero = disparity != 0  # divide only where the disparity is non-zero
+    depth_map = np.zeros_like(disparity, dtype=np.float32)
+    quotient = fx_baseline / disparity[nonzero]
+    depth_map[nonzero] = quotient
+    return depth_map
+
+
+def process_sequence(seq, root_dir, out_dir, baseline, output_size):
+    """
+    Process a single sequence from the Spring dataset:
+      - Reads RGB frames, disparity maps, forward/backward optical flow, intrinsics, extrinsics.
+      - Converts disparity to depth and inverts each extrinsics matrix to get the saved pose.
+      - Rescales images, depth, and flow to the specified 'output_size'.
+      - Saves the processed data to the output directory (skipped when already complete).
+
+    Args:
+        seq (str): Name of the sequence (subdirectory).
+        root_dir (str): Root directory containing the Spring dataset sequences.
+        out_dir (str): Output directory to store processed files.
+        baseline (float): Stereo baseline for disparity-to-depth conversion (SPRING_BASELINE).
+        output_size (tuple): (width, height) for output images and flows.
+
+    Returns:
+        None or str:
+            - Returns None if processing is successful.
+            - Returns an error message (str) if an error occurs.
+    """
+    seq_dir = os.path.join(root_dir, seq)
+    img_dir = os.path.join(seq_dir, "frame_left")
+    disp1_dir = os.path.join(seq_dir, "disp1_left")
+    fflow_dir = os.path.join(seq_dir, "flow_FW_left")
+    bflow_dir = os.path.join(seq_dir, "flow_BW_left")
+    intrinsics_path = os.path.join(seq_dir, "cam_data", "intrinsics.txt")
+    extrinsics_path = os.path.join(seq_dir, "cam_data", "extrinsics.txt")
+
+    try:
+        # Check required files/folders
+        for path in (
+            img_dir,
+            disp1_dir,
+            fflow_dir,
+            bflow_dir,
+            intrinsics_path,
+            extrinsics_path,
+        ):
+            if not os.path.exists(path):
+                return f"Missing required path: {path}"
+
+        # Prepare output directories
+        out_img_dir = os.path.join(out_dir, seq, "rgb")
+        out_depth_dir = os.path.join(out_dir, seq, "depth")
+        out_cam_dir = os.path.join(out_dir, seq, "cam")
+        out_fflow_dir = os.path.join(out_dir, seq, "flow_forward")
+        out_bflow_dir = os.path.join(out_dir, seq, "flow_backward")
+        for d in [
+            out_img_dir,
+            out_depth_dir,
+            out_cam_dir,
+            out_fflow_dir,
+            out_bflow_dir,
+        ]:
+            os.makedirs(d, exist_ok=True)
+
+        # Read camera data (ndmin=2 keeps one row per frame even for a single-frame sequence)
+        all_intrinsics = np.loadtxt(intrinsics_path, ndmin=2)
+        all_extrinsics = np.loadtxt(extrinsics_path, ndmin=2)
+
+        # Collect filenames (flow files are looked up per frame further down, so
+        # the flow folders do not need to be listed here)
+        rgbs = sorted([f for f in os.listdir(img_dir) if f.endswith(".png")])
+        disps = sorted([f for f in os.listdir(disp1_dir) if f.endswith(".dsp5")])
+
+        # Basic consistency check
+        if not (len(all_intrinsics) == len(all_extrinsics) == len(rgbs) == len(disps)):
+            return (
+                f"Inconsistent lengths in {seq}: "
+                f"Intrinsics {len(all_intrinsics)}, "
+                f"Extrinsics {len(all_extrinsics)}, "
+                f"RGBs {len(rgbs)}, "
+                f"Disparities {len(disps)}"
+            )
+        # Note: a sequence has one forward and one backward flow file fewer than frames
+
+        # Check if already processed
+        if len(os.listdir(out_img_dir)) == len(rgbs):
+            return None  # Already done, skip
+
+        # Process each frame
+        for i in tqdm(
+            range(len(all_intrinsics)), desc=f"Processing {seq}", leave=False
+        ):
+            frame_num = i + 1  # frames appear as 1-based in filenames
+            img_path = os.path.join(img_dir, f"frame_left_{frame_num:04d}.png")
+            disp1_path = os.path.join(disp1_dir, f"disp1_left_{frame_num:04d}.dsp5")
+            fflow_path = None
+            bflow_path = None
+
+            if i < len(all_intrinsics) - 1:
+                fflow_path = os.path.join(
+                    fflow_dir, f"flow_FW_left_{frame_num:04d}.flo5"
+                )
+            if i > 0:
+                bflow_path = os.path.join(
+                    bflow_dir, f"flow_BW_left_{frame_num:04d}.flo5"
+                )
+
+            # Load image (the context manager closes the file once it is decoded)
+            with Image.open(img_path) as img_file:
+                image = img_file.convert("RGB")
+
+            # Build the intrinsics matrix
+            K = np.eye(3, dtype=np.float32)
+            K[0, 0] = all_intrinsics[i][0]  # fx
+            K[1, 1] = all_intrinsics[i][1]  # fy
+            K[0, 2] = all_intrinsics[i][2]  # cx
+            K[1, 2] = all_intrinsics[i][3]  # cy
+
+            # Build the pose
+            cam_ext = all_extrinsics[i].reshape(4, 4)
+            pose = np.linalg.inv(cam_ext).astype(np.float32)
+            if np.any(np.isinf(pose)) or np.any(np.isnan(pose)):
+                return f"Invalid pose for frame {i} in {seq}"
+
+            # Load disparity
+            disp1 = flow_IO.readDispFile(disp1_path)
+            # Subsample by 2
+            disp1 = disp1[::2, ::2]
+
+            # Convert disparity to depth
+            fx_baseline = all_intrinsics[i][0] * baseline  # fx * baseline
+            depth = get_depth(disp1, fx_baseline)
+            depth[np.isinf(depth)] = 0.0
+            depth[np.isnan(depth)] = 0.0
+
+            # Load optical flows if available
+            fflow = None
+            bflow = None
+            if fflow_path and os.path.exists(fflow_path):
+                fflow = flow_IO.readFlowFile(fflow_path)
+                fflow = fflow[::2, ::2]
+            if bflow_path and os.path.exists(bflow_path):
+                bflow = flow_IO.readFlowFile(bflow_path)
+                bflow = bflow[::2, ::2]
+
+            # Rescale image, depth, and intrinsics
+            image, depth, K_scaled = cropping.rescale_image_depthmap(
+                image, depth, K, output_size
+            )
+            W_new, H_new = image.size  # after rescale_image_depthmap
+
+            # Rescale forward/backward flow
+            if fflow is not None:
+                fflow = rescale_flow(fflow, (W_new, H_new))
+            if bflow is not None:
+                bflow = rescale_flow(bflow, (W_new, H_new))
+
+            # Save output
+            out_index_str = f"{i:04d}"
+            out_img_path = os.path.join(out_img_dir, out_index_str + ".png")
+            image.save(out_img_path)
+
+            out_depth_path = os.path.join(out_depth_dir, out_index_str + ".npy")
+            np.save(out_depth_path, depth)
+
+            out_cam_path = os.path.join(out_cam_dir, out_index_str + ".npz")
+            np.savez(out_cam_path, intrinsics=K_scaled, pose=pose)
+
+            if fflow is not None:
+                out_fflow_path = os.path.join(out_fflow_dir, out_index_str + ".npy")
+                np.save(out_fflow_path, fflow)
+            if bflow is not None:
+                out_bflow_path = os.path.join(out_bflow_dir, out_index_str + ".npy")
+                np.save(out_bflow_path, bflow)
+
+    except Exception as e:
+        return f"Error processing sequence {seq}: {e}"
+
+    return None  # success
+
+
+def main():
+    """
+    Parse the command line and preprocess every Spring sequence in parallel.
+
+    Each immediate subdirectory of --root_dir is one sequence. Error messages
+    returned by process_sequence are printed per sequence.
+
+    Raises:
+        ValueError: If --root_dir is missing or contains no subdirectories.
+    """
+    parser = argparse.ArgumentParser(description="Preprocess Spring dataset.")
+    parser.add_argument(
+        "--root_dir",
+        required=True,
+        help="Path to the root directory containing Spring dataset sequences.",
+    )
+    parser.add_argument(
+        "--out_dir",
+        required=True,
+        help="Path to the output directory where processed files will be saved.",
+    )
+    parser.add_argument(
+        "--baseline",
+        type=float,
+        default=0.065,
+        help="Stereo baseline for disparity-to-depth conversion (default: 0.065).",
+    )
+    parser.add_argument(
+        "--output_size",
+        type=int,
+        nargs=2,
+        default=[960, 540],
+        help="Output image size (width height) for rescaling.",
+    )
+    args = parser.parse_args()
+
+    # Gather sequences
+    if not os.path.isdir(args.root_dir):
+        raise ValueError(f"Root directory not found: {args.root_dir}")
+    os.makedirs(args.out_dir, exist_ok=True)
+
+    seqs = sorted(
+        d for d in os.listdir(args.root_dir) if os.path.isdir(os.path.join(args.root_dir, d))
+    )
+    if not seqs:
+        raise ValueError(f"No valid sequence folders found in {args.root_dir}")
+
+    # Use about half of the CPUs, but always at least one worker: os.cpu_count()
+    # may return None, and ProcessPoolExecutor rejects max_workers <= 0.
+    num_workers = max(1, (os.cpu_count() or 1) // 2)
+    with ProcessPoolExecutor(max_workers=num_workers) as executor:
+        future_to_seq = {
+            executor.submit(
+                process_sequence, seq, args.root_dir, args.out_dir, args.baseline, args.output_size
+            ): seq
+            for seq in seqs
+        }
+        for future in tqdm(
+            as_completed(future_to_seq), total=len(future_to_seq), desc="Processing all sequences"
+        ):
+            seq = future_to_seq[future]
+            error = future.result()
+            if error:
+                print(f"Sequence '{seq}' failed: {error}")
+
+
+if __name__ == "__main__":  # run main() only when executed as a script, not on import
+    main()
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_synscapes.py b/extern/CUT3R/datasets_preprocess/preprocess_synscapes.py
new file mode 100644
index 0000000000000000000000000000000000000000..b85c4ab0a712b078e437256c4e0f639c122966c5
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_synscapes.py
@@ -0,0 +1,228 @@
+#!/usr/bin/env python3
+"""
+Preprocess Synscapes Data
+
+This script processes Synscapes data by:
+  1. Copying the RGB images.
+  2. Reading the EXR depth data and saving it as .npy.
+  3. Generating a sky mask using the class labels.
+  4. Extracting camera intrinsics from the meta file.
+
+The directory structure is expected to be:
+    synscapes_dir/
+        img/
+            rgb/
+            depth/
+            class/
+        meta/
+    Each file shares the same base name, e.g. 000000.png/exr in corresponding folders.
+
+Usage:
+    python preprocess_synscapes.py \
+        --synscapes_dir /path/to/Synscapes/Synscapes \
+        --output_dir /path/to/processed_synscapes
+"""
+
+import os
+import json
+import shutil
+import argparse
+import numpy as np
+import cv2
+import OpenEXR
+from tqdm import tqdm
+
+# Request OpenEXR support from OpenCV. NOTE(review): cv2 is already imported above; confirm the flag still takes effect.
+os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
+
+
+def process_basename(
+    basename,
+    rgb_dir,
+    depth_dir,
+    class_dir,
+    meta_dir,
+    out_rgb_dir,
+    out_depth_dir,
+    out_mask_dir,
+    out_cam_dir,
+    sky_id=23,
+):
+    """
+    Process a single sample of the Synscapes dataset:
+      1. Reads the "Z" channel of the depth .exr file.
+      2. Reads the class label .png and derives a sky mask from it.
+      3. Reads camera intrinsics from the meta .json file.
+      4. Copies the RGB .png and saves depth, mask and intrinsics to the
+         specified output folders.
+
+    The depth file is read through OpenEXR.File (parts/channels/pixels API).
+
+    Args:
+        basename (str): The base filename (without extension).
+        rgb_dir (str): Directory containing RGB .png files.
+        depth_dir (str): Directory containing depth .exr files.
+        class_dir (str): Directory containing class .png files.
+        meta_dir (str): Directory containing camera metadata .json files.
+        out_rgb_dir (str): Output directory for RGB files.
+        out_depth_dir (str): Output directory for depth .npy files.
+        out_mask_dir (str): Output directory for sky masks.
+        out_cam_dir (str): Output directory for camera intrinsics (.npz).
+        sky_id (int): Class ID for sky pixels in the class label images.
+
+    Returns:
+        None or str:
+            If an error occurs, returns an error message (str). Otherwise, returns None.
+            No output file is written for a sample whose inputs fail to load.
+    """
+    try:
+        # Input file paths
+        rgb_file = os.path.join(rgb_dir, f"{basename}.png")
+        depth_file = os.path.join(depth_dir, f"{basename}.exr")
+        class_file = os.path.join(class_dir, f"{basename}.png")
+        meta_file = os.path.join(meta_dir, f"{basename}.json")
+
+        # Output file paths
+        out_img_path = os.path.join(out_rgb_dir, f"{basename}.png")
+        out_depth_path = os.path.join(out_depth_dir, f"{basename}.npy")
+        out_mask_path = os.path.join(out_mask_dir, f"{basename}.png")
+        out_cam_path = os.path.join(out_cam_dir, f"{basename}.npz")
+
+        # --- Read Depth Data ---
+        # OpenEXR.File is the reader of the current OpenEXR Python bindings that
+        # exposes parts[i].channels[name].pixels as a numpy array. The legacy
+        # OpenEXR.InputFile class has no "parts" attribute (it returns raw bytes
+        # through channel()), so it cannot be used with this access pattern.
+        # Depth is expected in the "Z" channel of the first part.
+        # NOTE(review): requires bindings that provide OpenEXR.File; confirm the
+        # pinned OpenEXR version.
+        with OpenEXR.File(depth_file) as exr_file:
+            depth = np.array(exr_file.parts[0].channels["Z"].pixels)
+
+        # --- Read Class Image (for Sky Mask) ---
+        class_img = cv2.imread(class_file, cv2.IMREAD_UNCHANGED)
+        # cv2.imread signals failure by returning None instead of raising.
+        if class_img is None:
+            return f"Error processing {basename}: failed to read class image {class_file}"
+        # Sky pixels become 255, everything else 0.
+        sky_mask = (class_img == sky_id).astype(np.uint8) * 255
+
+        # --- Read Meta Data (for Camera Intrinsics) ---
+        with open(meta_file, "r") as f:
+            cam_info = json.load(f)["camera"]
+            intrinsic = cam_info["intrinsic"]
+            fx, fy, cx, cy = (
+                intrinsic["fx"],
+                intrinsic["fy"],
+                intrinsic["u0"],
+                intrinsic["v0"],
+            )
+
+        # 3x3 pinhole matrix: focal lengths on the diagonal, principal point
+        # (u0, v0) in the last column.
+        K = np.eye(3, dtype=np.float32)
+        K[0, 0] = fx
+        K[1, 1] = fy
+        K[0, 2] = cx
+        K[1, 2] = cy
+
+        # All inputs were loaded successfully; the outputs are only written from
+        # here on, so a sample that fails to load leaves no files behind.
+        # --- Copy RGB ---
+        shutil.copy(rgb_file, out_img_path)
+
+        # --- Save Depth, Mask, and Intrinsics ---
+        np.save(out_depth_path, depth)
+        if not cv2.imwrite(out_mask_path, sky_mask):  # imwrite returns False on failure
+            return f"Error processing {basename}: failed to write sky mask {out_mask_path}"
+        np.savez(out_cam_path, intrinsics=K)
+
+    except Exception as e:
+        return f"Error processing {basename}: {e}"
+
+    return None
+
+
+def main():
+    """
+    Parse the command line and preprocess all Synscapes samples in parallel.
+
+    Samples are discovered from the .exr files in "<synscapes_dir>/img/depth".
+    Outputs go to the rgb/, depth/, sky_mask/ and cam/ subfolders of --output_dir;
+    error messages returned by process_basename are printed.
+    """
+    parser = argparse.ArgumentParser(description="Preprocess Synscapes data.")
+    parser.add_argument(
+        "--synscapes_dir",
+        required=True,
+        help="Path to the main Synscapes directory (contains 'img' and 'meta' folders).",
+    )
+    parser.add_argument(
+        "--output_dir",
+        required=True,
+        help="Path to the output directory for processed data.",
+    )
+    parser.add_argument(
+        "--sky_id",
+        type=int,
+        default=23,
+        help="Class ID for sky pixels in class .png. Default is 23.",
+    )
+    args = parser.parse_args()
+
+    synscapes_dir = os.path.abspath(args.synscapes_dir)
+    output_dir = os.path.abspath(args.output_dir)
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Define input subdirectories
+    rgb_dir = os.path.join(synscapes_dir, "img", "rgb")
+    depth_dir = os.path.join(synscapes_dir, "img", "depth")
+    class_dir = os.path.join(synscapes_dir, "img", "class")
+    meta_dir = os.path.join(synscapes_dir, "meta")
+
+    # Define output subdirectories
+    out_rgb_dir = os.path.join(output_dir, "rgb")
+    out_depth_dir = os.path.join(output_dir, "depth")
+    out_mask_dir = os.path.join(output_dir, "sky_mask")
+    out_cam_dir = os.path.join(output_dir, "cam")
+    for d in [out_rgb_dir, out_depth_dir, out_mask_dir, out_cam_dir]:
+        os.makedirs(d, exist_ok=True)
+
+    # Collect all EXR depth filenames (excluding extension)
+    basenames = sorted(
+        os.path.splitext(fname)[0] for fname in os.listdir(depth_dir) if fname.endswith(".exr")
+    )
+
+    # Parallel processing
+    from concurrent.futures import ProcessPoolExecutor, as_completed
+
+    # os.cpu_count() may return None; fall back to 1 and keep at least one worker.
+    num_workers = max(1, (os.cpu_count() or 1) // 2)
+
+    with ProcessPoolExecutor(max_workers=num_workers) as executor:
+        futures = [
+            executor.submit(
+                process_basename,
+                bname,
+                rgb_dir,
+                depth_dir,
+                class_dir,
+                meta_dir,
+                out_rgb_dir,
+                out_depth_dir,
+                out_mask_dir,
+                out_cam_dir,
+                args.sky_id,
+            )
+            for bname in basenames
+        ]
+
+        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing Synscapes"):
+            # process_basename reports failures as a message instead of raising.
+            error = future.result()
+            if error:
+                print(error)
+
+
+if __name__ == "__main__":  # run main() only when executed as a script, not on import
+    main()
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_tartanair.py b/extern/CUT3R/datasets_preprocess/preprocess_tartanair.py
new file mode 100644
index 0000000000000000000000000000000000000000..c726ef9b4f371e4310d9710b292cd6a5c33f0bd9
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_tartanair.py
@@ -0,0 +1,115 @@
+import argparse
+import random
+import gzip
+import json
+import os
+import os.path as osp
+import torch
+import PIL.Image
+from PIL import Image
+import numpy as np
+import cv2
+from tqdm import tqdm
+import matplotlib.pyplot as plt
+import shutil
+import src.dust3r.datasets.utils.cropping as cropping  # noqa
+from scipy.spatial.transform import Rotation as R
+
+
+def get_parser():
+    """Build the command-line parser of the TartanAir preprocessing script.
+
+    Both options are optional strings: --tartanair_dir and --output_dir.
+    """
+    cli = argparse.ArgumentParser()
+    defaults = (
+        ("--tartanair_dir", "data/tartanair"),
+        ("--output_dir", "data/mast3r_data/processed_tartanair"),
+    )
+    for flag, value in defaults:
+        cli.add_argument(flag, default=value)
+    return cli
+
+
+def main(rootdir, outdir):
+    """
+    Copy every TartanAir sequence into a flat per-frame layout under 'outdir'.
+
+    For each <env>/<Easy|Hard>/<subscene> folder of 'rootdir', frame i yields
+    "{i:06d}_rgb.png", "{i:06d}_depth.npy" and "{i:06d}_cam.npz" (camera_pose,
+    camera_intrinsics) plus, for all but the last frame, "{i:06d}_flow.npy" and
+    "{i:06d}_mask.npy" copied from the flow files of frames i and i + 1.
+
+    Args:
+        rootdir (str): TartanAir root directory containing the environments.
+        outdir (str): Destination root; the env/difficulty/subscene tree is mirrored.
+    """
+    os.makedirs(outdir, exist_ok=True)
+    # The same fixed intrinsics are stored for every frame.
+    intrinsics = np.array(
+        [[320.0, 0.0, 320.0], [0.0, 320.0, 240.0], [0.0, 0.0, 1.0]]
+    ).astype(np.float32)
+    envs = [
+        name for name in sorted(os.listdir(rootdir)) if os.path.isdir(osp.join(rootdir, name))
+    ]
+    for env in tqdm(envs):
+        for difficulty in ("Easy", "Hard"):
+            level_dir = osp.join(rootdir, env, difficulty)
+            subscenes = [
+                name for name in os.listdir(level_dir) if os.path.isdir(osp.join(level_dir, name))
+            ]
+            for subscene in tqdm(subscenes):
+                frame_dir = osp.join(level_dir, subscene)
+                scene_out = osp.join(outdir, env, difficulty, subscene)
+                rgb_dir = osp.join(frame_dir, "image_left")
+                depth_dir = osp.join(frame_dir, "depth_left")
+                flow_dir = osp.join(frame_dir, "flow")
+                poses = np.loadtxt(osp.join(frame_dir, "pose_left.txt"))
+                frame_num = len(poses)
+                os.makedirs(scene_out, exist_ok=True)
+                # One pose line per RGB/depth frame; the flow folder holds a flow
+                # file and a mask file for each pair of consecutive frames.
+                assert (
+                    len(os.listdir(rgb_dir))
+                    == len(os.listdir(depth_dir))
+                    == len(os.listdir(flow_dir)) // 2 + 1
+                    == frame_num
+                )
+                for i in tqdm(range(frame_num)):
+                    # Pose line is "x y z qx qy qz qw" and is used as camera-to-world.
+                    x, y, z, qx, qy, qz, qw = poses[i]
+                    c2w = np.eye(4)
+                    c2w[:3, :3] = R.from_quat([qx, qy, qz, qw]).as_matrix()
+                    c2w[:3, 3] = [x, y, z]
+                    # Reorder the camera axes: rows (0, 1, 2) of world-to-camera
+                    # become (1, 2, 0), then convert back to camera-to-world.
+                    w2c = np.linalg.inv(c2w)[[1, 2, 0, 3]]
+                    c2w = np.linalg.inv(w2c)
+                    shutil.copy(
+                        osp.join(rgb_dir, f"{i:06d}_left.png"),
+                        osp.join(scene_out, f"{i:06d}_rgb.png"),
+                    )
+                    shutil.copy(
+                        osp.join(depth_dir, f"{i:06d}_left_depth.npy"),
+                        osp.join(scene_out, f"{i:06d}_depth.npy"),
+                    )
+                    if i < frame_num - 1:  # the last frame has no flow/mask
+                        shutil.copy(
+                            osp.join(flow_dir, f"{i:06d}_{i+1:06d}_flow.npy"),
+                            osp.join(scene_out, f"{i:06d}_flow.npy"),
+                        )
+                        shutil.copy(
+                            osp.join(flow_dir, f"{i:06d}_{i+1:06d}_mask.npy"),
+                            osp.join(scene_out, f"{i:06d}_mask.npy"),
+                        )
+                    np.savez(
+                        osp.join(scene_out, f"{i:06d}_cam.npz"),
+                        camera_pose=c2w.astype(np.float32),
+                        camera_intrinsics=intrinsics.astype(np.float32),
+                    )
+
+
+if __name__ == "__main__":  # script entry point
+    parser = get_parser()
+    args = parser.parse_args()
+    main(args.tartanair_dir, args.output_dir)  # (source root, output root)
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_uasol.py b/extern/CUT3R/datasets_preprocess/preprocess_uasol.py
new file mode 100644
index 0000000000000000000000000000000000000000..16126d3cca420bf4e9a8bd535d349151868d83ea
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_uasol.py
@@ -0,0 +1,222 @@
+#!/usr/bin/env python3
+"""
+Preprocess Script for UASOL Dataset
+
+This script processes sequences in the UASOL dataset by:
+  - Parsing camera parameters from a 'log.txt' file.
+  - Reading a 'complete.json' manifest that describes frames (RGB + depth).
+  - Converting depth from millimeters to meters.
+  - Rescaling images and depth maps to a fixed resolution (default 640x480).
+  - Saving the camera intrinsics and pose in .npz files.
+
+Usage:
+    python preprocess_uasol.py \
+        --input_dir /path/to/data_uasol \
+        --output_dir /path/to/processed_uasol
+"""
+
+import os
+import json
+import numpy as np
+import cv2
+from PIL import Image
+from tqdm import tqdm
+from concurrent.futures import ProcessPoolExecutor, as_completed
+import argparse
+
+import src.dust3r.datasets.utils.cropping as cropping
+
+
+def parse_log_file(log_file):
+    """
+    Parse the left-camera section of a log.txt file into a dictionary.
+
+    Lines are ignored until one starting with "LEFT CAMERA PARAMETERS" is
+    seen. Every later line that contains a colon is split on the first colon
+    into a key and a value. Keys are lower-cased with spaces replaced by
+    underscores; values lose surrounding whitespace and dots. A value that
+    contains "," or "[" is parsed as a list of floats, any other value as a
+    single float. A value that is not numeric is kept as the stripped string
+    instead of aborting the whole parse.
+
+    Args:
+        log_file (str): Path to the log.txt file containing camera parameters.
+
+    Returns:
+        dict: Maps each normalised key to a float, a list of floats, or the
+            raw string when the value could not be converted.
+    """
+    camera_dict = {}
+    start_parse = False
+    with open(log_file, "r") as f:
+        for line in f:
+            line = line.strip()
+            if line.startswith("LEFT CAMERA PARAMETERS"):
+                start_parse = True
+                continue
+            if not (start_parse and ":" in line):
+                continue
+
+            key, value = line.split(":", 1)
+            key = key.strip().replace(" ", "_").lower()
+            value = value.strip().strip(".")
+            try:
+                if "," in value or "[" in value:
+                    # Bracketed / comma-separated values become a list of floats
+                    parsed = [float(v.strip()) for v in value.strip("[]").split(",")]
+                else:
+                    parsed = float(value)
+            except ValueError:
+                # Free-text entries (scalar or comma-separated) stay as strings
+                parsed = value
+            camera_dict[key] = parsed
+    return camera_dict
+
+
+def process_data(task_args):
+    """
+    Process a single frame of the dataset:
+      - Reads the RGB image and depth map.
+      - Converts depth from mm to meters.
+      - Rescales the image and depth to a fixed output resolution.
+      - Saves results (RGB, depth, camera intrinsics, and pose).
+
+    Args:
+        task_args (tuple): A tuple containing:
+            - data (dict): Frame info from 'complete.json'; the keys
+              "color_frame_left", "depth_frame" and "m" are read.
+            - seq_dir (str): Path to the sequence directory.
+            - out_rgb_dir (str): Output directory for RGB images.
+            - out_depth_dir (str): Output directory for depth maps.
+            - out_cam_dir (str): Output directory for camera intrinsics/pose.
+            - K (np.ndarray): 3x3 camera intrinsics matrix.
+            - H (int): Original image height.
+            - W (int): Original image width.
+
+    Returns:
+        str or None:
+            Returns an error message (str) if something goes wrong.
+            Otherwise, returns None on success.
+    """
+    data, seq_dir, out_rgb_dir, out_depth_dir, out_cam_dir, K, H, W = task_args
+    # Bound before the try block so the error message in the handler can
+    # always be formatted, even when the manifest entry has no frame name.
+    img_p = "<unknown frame>"
+    try:
+        img_p = data["color_frame_left"]
+        depth_p = data["depth_frame"]
+        matrix = data["m"]
+
+        # Input file paths
+        img_path = os.path.join(seq_dir, "Images", img_p + ".png")
+        depth_path = os.path.join(seq_dir, "Images", depth_p + ".png")
+
+        if not (os.path.isfile(img_path) and os.path.isfile(depth_path)):
+            return f"Missing files for {img_p}"
+
+        # Read RGB
+        img = Image.open(img_path).convert("RGB")
+
+        # Read depth (16-bit or 32-bit); cv2.imread returns None when the
+        # file cannot be decoded, so report that explicitly.
+        depth = cv2.imread(depth_path, cv2.IMREAD_ANYDEPTH)
+        if depth is None:
+            return f"Could not read depth map for {img_p}"
+        depth = depth.astype(np.float32)
+        if depth.shape[0] != H or depth.shape[1] != W:
+            return f"Depth shape mismatch for {img_p}"
+        depth = depth / 1000.0  # mm to meters
+
+        # Build the pose matrix
+        pose = np.array(matrix, dtype=np.float32)
+        # Convert translation (last column) from mm to meters
+        pose[:3, 3] /= 1000.0
+
+        # Rescale image and depth to desired output size (e.g., 640x480)
+        image, depthmap, camera_intrinsics = cropping.rescale_image_depthmap(
+            img, depth, K, output_resolution=(640, 480)
+        )
+
+        # Save outputs; all three files are named after the left colour frame
+        out_img_path = os.path.join(out_rgb_dir, img_p + ".png")
+        out_depth_path = os.path.join(out_depth_dir, img_p + ".npy")
+        out_cam_path = os.path.join(out_cam_dir, img_p + ".npz")
+
+        image.save(out_img_path)
+        np.save(out_depth_path, depthmap)
+        np.savez(out_cam_path, intrinsics=camera_intrinsics, pose=pose)
+
+    except Exception as e:
+        # Worker boundary: report the failure as a string so one bad frame
+        # does not abort the whole pool.
+        return f"Error processing {img_p}: {e}"
+    return None
+
+
+def main():
+    """
+    Command-line entry point for the UASOL preprocessing.
+
+    Every sub-directory of --input_dir that contains an 'Images' folder is
+    treated as a sequence. For each sequence the camera intrinsics are built
+    from 'log.txt', the frame list is read from 'complete.json', and the
+    frames are handed to process_data() in a process pool. Results go to
+    <output_dir>/<sequence>/{rgb,depth,cam}. Sequences lacking either input
+    file are skipped with a warning; per-frame errors are printed.
+    """
+    parser = argparse.ArgumentParser(description="Preprocess UASOL dataset.")
+    parser.add_argument(
+        "--input_dir", required=True, help="Path to the root UASOL directory."
+    )
+    parser.add_argument(
+        "--output_dir",
+        required=True,
+        help="Path to the directory where processed data will be stored.",
+    )
+    args = parser.parse_args()
+
+    root = os.path.abspath(args.input_dir)
+    out_dir = os.path.abspath(args.output_dir)
+    os.makedirs(out_dir, exist_ok=True)
+
+    # Find all sequences that have a 'Images' folder (sorted for a
+    # reproducible processing order)
+    seqs = sorted(
+        d for d in os.listdir(root) if os.path.isdir(os.path.join(root, d, "Images"))
+    )
+
+    for seq in seqs:
+        seq_dir = os.path.join(root, seq)
+        log_file = os.path.join(seq_dir, "log.txt")
+        manifest_file = os.path.join(seq_dir, "complete.json")
+
+        # Validate the inputs first so that a skipped sequence neither
+        # crashes the run nor leaves empty output directories behind.
+        if not os.path.isfile(manifest_file):
+            print(
+                f"Warning: No manifest file found at {manifest_file}. Skipping {seq}."
+            )
+            continue
+        if not os.path.isfile(log_file):
+            print(f"Warning: No log file found at {log_file}. Skipping {seq}.")
+            continue
+
+        # Create output subdirectories
+        out_rgb_dir = os.path.join(out_dir, seq, "rgb")
+        out_depth_dir = os.path.join(out_dir, seq, "depth")
+        out_cam_dir = os.path.join(out_dir, seq, "cam")
+        os.makedirs(out_rgb_dir, exist_ok=True)
+        os.makedirs(out_depth_dir, exist_ok=True)
+        os.makedirs(out_cam_dir, exist_ok=True)
+
+        # Parse camera parameters from log.txt
+        camera_dict = parse_log_file(log_file)
+
+        # Extract relevant camera info ("alog" is the spelling of the keys
+        # this script expects from the log file)
+        cx = camera_dict["optical_center_along_x_axis,_defined_in_pixels"]
+        cy = camera_dict["optical_center_along_y_axis,_defined_in_pixels"]
+        fx = camera_dict["focal_length_in_pixels_alog_x_axis"]
+        fy = camera_dict["focal_length_in_pixels_alog_y_axis"]
+        W, H = map(int, camera_dict["resolution"])
+
+        # Construct intrinsic matrix
+        K = np.eye(3, dtype=np.float32)
+        K[0, 0] = fx
+        K[1, 1] = fy
+        K[0, 2] = cx
+        K[1, 2] = cy
+
+        # Read the JSON manifest
+        with open(manifest_file, "r") as f:
+            metadata = json.load(f)["Data"]
+
+        # Build tasks for parallel processing
+        tasks = [
+            (data, seq_dir, out_rgb_dir, out_depth_dir, out_cam_dir, K, H, W)
+            for data in metadata
+        ]
+
+        # Process frames in parallel
+        with ProcessPoolExecutor(max_workers=os.cpu_count() or 4) as executor:
+            futures = {
+                executor.submit(process_data, t): t[0]["color_frame_left"]
+                for t in tasks
+            }
+            for future in tqdm(
+                as_completed(futures), total=len(futures), desc=f"Processing {seq}"
+            ):
+                error = future.result()
+                if error:
+                    print(error)
+
+if __name__ == "__main__":
+    main()
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_unreal4k.py b/extern/CUT3R/datasets_preprocess/preprocess_unreal4k.py
new file mode 100644
index 0000000000000000000000000000000000000000..0af54cd2311db44261de2fb7d615fa98a4f5e302
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_unreal4k.py
@@ -0,0 +1,110 @@
+import argparse
+import random
+import gzip
+import json
+import os
+import os.path as osp
+import torch
+import PIL.Image
+from PIL import Image
+import numpy as np
+import cv2
+from tqdm import tqdm
+import matplotlib.pyplot as plt
+import shutil
+import src.dust3r.datasets.utils.cropping as cropping  # noqa
+from scipy.spatial.transform import Rotation as R
+
+
+def get_parser():
+    """Build the argument parser for the Unreal4K preprocessing script.
+
+    Defines --unreal4k_dir and --output_dir; both default to the empty string.
+    """
+    import argparse
+
+    cli = argparse.ArgumentParser()
+    for flag in ("--unreal4k_dir", "--output_dir"):
+        cli.add_argument(flag, default="")
+    return cli
+
+
+def parse_extrinsics(file_path):
+    """
+    Read the intrinsics and camera pose stored in an extrinsics text file.
+
+    The first line must hold 9 whitespace-separated numbers (reshaped to the
+    3x3 intrinsics) and the second line 12 numbers (reshaped to a 3x4 pose).
+    The 3x4 pose is written into the top rows of a 4x4 identity matrix and
+    that matrix is inverted before it is returned.
+
+    Args:
+    file_path (str): The path to the file containing the extrinsics data.
+
+    Returns:
+    tuple: (intrinsics, pose) where intrinsics is 3x3 and pose is the 4x4
+    inverse of the matrix stored in the file.
+    """
+    with open(file_path, "r") as handle:
+        rows = handle.readlines()
+
+    # Line 1: row-major 3x3 intrinsics
+    intrinsics_values = [float(token) for token in rows[0].strip().split()]
+    intrinsics_matrix = np.array(intrinsics_values).reshape(3, 3)
+
+    # Line 2: row-major 3x4 pose, padded to a homogeneous 4x4 matrix
+    pose_values = [float(token) for token in rows[1].strip().split()]
+    stored_pose = np.eye(4)
+    stored_pose[:3] = np.array(pose_values).reshape(3, 4)
+
+    return intrinsics_matrix, np.linalg.inv(stored_pose)
+
+
+def main(rootdir, outdir):
+    """
+    Convert every environment under ``rootdir`` into rescaled RGB / depth / camera files.
+
+    For each environment directory and each of the two sub-scenes "0" and "1",
+    frames are numbered 0..N-1 where N is the number of entries in the
+    ``Image<subscene>`` folder. Depth is derived from the stored disparity as
+    ``baseline * fx / disparity``; the baseline is the x-translation of the
+    relative transform between the two cameras' parsed poses. Image and depth
+    are rescaled to 512x384 and written to ``outdir/<env>/<subscene>/``.
+
+    Args:
+        rootdir (str): Dataset root containing one directory per environment.
+        outdir (str): Destination root for the processed files.
+    """
+    os.makedirs(outdir, exist_ok=True)
+    envs = [
+        name
+        for name in sorted(os.listdir(rootdir))
+        if os.path.isdir(osp.join(rootdir, name))
+    ]
+    for env in tqdm(envs):
+        frame_dir = osp.join(rootdir, env)
+        for subscene in tqdm(["0", "1"]):
+            rgb_dir = osp.join(frame_dir, f"Image{subscene}")
+            disp_dir = osp.join(frame_dir, f"Disp{subscene}")
+            scene_out = osp.join(outdir, env, subscene)
+
+            frame_num = len(os.listdir(rgb_dir))
+            os.makedirs(scene_out, exist_ok=True)
+            for i in tqdm(range(frame_num)):
+                stem = f"{i:05d}"
+
+                # Both cameras' poses are needed for the stereo baseline,
+                # whichever sub-scene is being exported.
+                K0, c2w0 = parse_extrinsics(
+                    osp.join(frame_dir, "Extrinsics0", f"{stem}.txt")
+                )
+                K1, c2w1 = parse_extrinsics(
+                    osp.join(frame_dir, "Extrinsics1", f"{stem}.txt")
+                )
+                K, c2w = (K0, c2w0) if subscene == "0" else (K1, c2w1)
+
+                img = Image.open(osp.join(rgb_dir, f"{stem}.png")).convert("RGB")
+                disp = np.load(osp.join(disp_dir, f"{stem}.npy")).astype(np.float32)
+
+                # Disparity -> depth
+                baseline = (np.linalg.inv(c2w0) @ c2w1)[0, 3]
+                depth = baseline * K[0, 0] / disp
+
+                image, depthmap, camera_intrinsics = cropping.rescale_image_depthmap(
+                    img, depth, K, output_resolution=(512, 384)
+                )
+
+                image.save(osp.join(scene_out, f"{stem}_rgb.png"))
+                np.save(osp.join(scene_out, f"{stem}_depth.npy"), depthmap)
+                np.savez(
+                    osp.join(scene_out, f"{stem}.npz"),
+                    intrinsics=camera_intrinsics,
+                    cam2world=c2w,
+                )
+
+
+if __name__ == "__main__":
+    # Script entry point: read the input/output directories from the command
+    # line and run the conversion.
+    args = get_parser().parse_args()
+    main(args.unreal4k_dir, args.output_dir)
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_urbansyn.py b/extern/CUT3R/datasets_preprocess/preprocess_urbansyn.py
new file mode 100644
index 0000000000000000000000000000000000000000..073b9df32fec4d0fcf9a63ff53d39d4ae90860d6
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_urbansyn.py
@@ -0,0 +1,234 @@
+#!/usr/bin/env python3
+"""
+Preprocess Script for UrbanSyn Dataset
+
+This script:
+  1. Reads RGB, depth (EXR), and semantic segmentation (class) files from an UrbanSyn dataset directory.
+  2. Retrieves camera intrinsics from a JSON metadata file.
+  3. Rescales images, depth maps, and masks to a fixed resolution (e.g., 640×480).
+  4. Saves processed data (RGB, .npy depth, .png sky mask, and .npz intrinsics) in an organized structure.
+
+Usage:
+    python preprocess_urbansyn.py \
+        --input_dir /path/to/data_urbansyn \
+        --output_dir /path/to/processed_urbansyn
+"""
+
+import os
+import json
+import argparse
+import shutil
+from concurrent.futures import ProcessPoolExecutor, as_completed
+import cv2
+import numpy as np
+from tqdm import tqdm
+from PIL import Image
+
+# Make sure OpenCV EXR support is enabled
+os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
+
+# Custom "cropping" module (ensure cropping.py is available/installed)
+import cropping
+
+
+def process_basename(
+    basename,
+    rgb_dir,
+    depth_dir,
+    class_dir,
+    cam_info,
+    out_rgb_dir,
+    out_depth_dir,
+    out_mask_dir,
+    out_cam_dir,
+):
+    """
+    Process a single file triplet (RGB, depth, class) for a given basename.
+
+    The depth and class file names are derived from ``basename`` by replacing
+    "rgb" with "depth" and "ss" respectively. Nothing is done when all four
+    output files already exist. The input dataset is never modified: when the
+    depth file cannot be read and a '<name>.exr.1' sibling exists, that
+    sibling is copied to a temporary '<name>_tmp.exr', read, and the copy is
+    removed again.
+
+    Args:
+        basename (str): Base name without file extension (e.g., 'image_0001').
+        rgb_dir (str): Directory containing RGB .png files.
+        depth_dir (str): Directory containing .exr depth files.
+        class_dir (str): Directory containing class .png files (semantic segmentation).
+        cam_info (dict): Camera metadata; the keys "focalLength_mm",
+            "sensorWidth_mm" and "sensorHeight_mm" are read.
+        out_rgb_dir (str): Output directory for rescaled RGB images.
+        out_depth_dir (str): Output directory for rescaled depth files.
+        out_mask_dir (str): Output directory for sky masks.
+        out_cam_dir (str): Output directory for camera intrinsics.
+
+    Returns:
+        str or None:
+            - Returns None if successful (or already processed).
+            - Returns an error message if something fails.
+    """
+
+    # Construct output file paths
+    out_img_path = os.path.join(out_rgb_dir, f"{basename}.png")
+    out_depth_path = os.path.join(out_depth_dir, f"{basename}.npy")
+    out_mask_path = os.path.join(out_mask_dir, f"{basename}.png")
+    out_cam_path = os.path.join(out_cam_dir, f"{basename}.npz")
+
+    # Skip if already processed
+    if all(
+        os.path.exists(p)
+        for p in (out_img_path, out_depth_path, out_mask_path, out_cam_path)
+    ):
+        return None
+
+    try:
+        # Build file paths
+        img_file = os.path.join(rgb_dir, f"{basename}.png")
+        depth_file = os.path.join(depth_dir, f'{basename.replace("rgb", "depth")}.exr')
+        class_file = os.path.join(class_dir, basename.replace("rgb", "ss") + ".png")
+
+        # 1. Read RGB image
+        img = cv2.imread(img_file, cv2.IMREAD_UNCHANGED)
+        if img is None:
+            return f"Error: Could not read image file {img_file}"
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert BGR -> RGB
+        H, W = img.shape[:2]
+
+        # 2. Read depth from EXR
+        depth = cv2.imread(depth_file, cv2.IMREAD_UNCHANGED)
+        if depth is None:
+            # Fallback for a '.exr.1' sibling. It is read through a copy named
+            # '*_tmp.exr' (presumably so the file carries an .exr extension).
+            # Copying instead of renaming keeps the dataset intact, so a
+            # rerun still finds the '.exr.1' file.
+            alt_exr_1 = depth_file + ".1"
+            temp_exr = depth_file.replace(".exr", "_tmp.exr")
+            if os.path.exists(alt_exr_1):
+                shutil.copyfile(alt_exr_1, temp_exr)
+                try:
+                    depth = cv2.imread(temp_exr, cv2.IMREAD_UNCHANGED)
+                finally:
+                    os.remove(temp_exr)
+            elif os.path.exists(temp_exr):
+                # Left behind by earlier versions of this script, which
+                # renamed '.exr.1' to '_tmp.exr' in place.
+                depth = cv2.imread(temp_exr, cv2.IMREAD_UNCHANGED)
+            else:
+                return f"Error reading depth file {depth_file}"
+            if depth is None:
+                return f"Error reading depth file (fallback) {temp_exr}"
+        # Scale factor carried over from the original script.
+        # NOTE(review): confirm the resulting depth unit.
+        depth *= 1e5
+
+        # 3. Read class image, build sky mask
+        cl = cv2.imread(class_file, cv2.IMREAD_UNCHANGED)
+        if cl is None:
+            return f"Error: Could not read class file {class_file}"
+        sky_mask = (cl[..., 0] == 10).astype(np.uint8)  # class ID 10 => sky
+
+        # 4. Build camera intrinsics: focal length in pixels from the
+        #    focal-length / sensor-size ratio, principal point at the centre
+        f_mm = cam_info["focalLength_mm"]
+        w_mm = cam_info["sensorWidth_mm"]
+        h_mm = cam_info["sensorHeight_mm"]
+        K = np.eye(3, dtype=np.float32)
+        K[0, 0] = f_mm / w_mm * W
+        K[1, 1] = f_mm / h_mm * H
+        K[0, 2] = W / 2
+        K[1, 2] = H / 2
+
+        # 5. Combine depth + sky_mask in a single array for rescaling
+        depth_with_mask = np.stack([depth, sky_mask], axis=-1)
+
+        # 6. Rescale to desired size
+        image_pil = Image.fromarray(img)
+        image_rescaled, depth_with_mask_rescaled, K_rescaled = (
+            cropping.rescale_image_depthmap(
+                image_pil, depth_with_mask, K, output_resolution=(640, 480)
+            )
+        )
+
+        # Write outputs (channel 0 is depth, channel 1 is the sky mask)
+        image_rescaled.save(out_img_path)
+        np.save(out_depth_path, depth_with_mask_rescaled[..., 0])
+        cv2.imwrite(
+            out_mask_path, (depth_with_mask_rescaled[..., 1] * 255).astype(np.uint8)
+        )
+        np.savez(out_cam_path, intrinsics=K_rescaled)
+
+    except Exception as e:
+        # Worker boundary: report the failure instead of killing the pool
+        return f"Error processing {basename}: {e}"
+
+    return None
+
+
+def main():
+    """
+    Command-line entry point for the UrbanSyn preprocessing.
+
+    Reads --input_dir / --output_dir, collects the base names of all .png
+    files in <input_dir>/rgb, loads the camera block from
+    <input_dir>/camera_metadata.json and runs process_basename() for every
+    base name in a process pool. Error strings returned by the workers are
+    printed. Returns early (after printing a message) when there are no RGB
+    files or the metadata file is missing.
+    """
+    parser = argparse.ArgumentParser(
+        description="Preprocess UrbanSyn dataset by loading RGB/Depth/Seg "
+        "and rescaling them with camera intrinsics."
+    )
+    parser.add_argument(
+        "--input_dir", required=True, help="Path to the UrbanSyn dataset directory."
+    )
+    parser.add_argument(
+        "--output_dir",
+        required=True,
+        help="Path to the directory where processed data will be stored.",
+    )
+    args = parser.parse_args()
+
+    input_dir = os.path.abspath(args.input_dir)
+    output_dir = os.path.abspath(args.output_dir)
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Input layout
+    rgb_dir, depth_dir, class_dir = (
+        os.path.join(input_dir, sub) for sub in ("rgb", "depth", "ss")
+    )
+    meta_file = os.path.join(input_dir, "camera_metadata.json")
+
+    # Output layout
+    out_dirs = [
+        os.path.join(output_dir, sub) for sub in ("rgb", "depth", "sky_mask", "cam")
+    ]
+    for out_sub in out_dirs:
+        os.makedirs(out_sub, exist_ok=True)
+    out_rgb_dir, out_depth_dir, out_mask_dir, out_cam_dir = out_dirs
+
+    # One work item per RGB .png file
+    basenames = sorted(
+        os.path.splitext(fname)[0]
+        for fname in os.listdir(rgb_dir)
+        if fname.endswith(".png")
+    )
+    if not basenames:
+        print(f"No RGB .png files found in {rgb_dir}. Exiting.")
+        return
+
+    if not os.path.isfile(meta_file):
+        print(f"Error: metadata file not found at {meta_file}. Exiting.")
+        return
+
+    with open(meta_file, "r") as meta_handle:
+        cam_info = json.load(meta_handle)["parameters"][0]["Camera"]
+
+    # Fan the frames out over all available cores
+    num_workers = max(1, os.cpu_count() or 1)
+    with ProcessPoolExecutor(max_workers=num_workers) as executor:
+        pending = [
+            executor.submit(
+                process_basename,
+                basename,
+                rgb_dir,
+                depth_dir,
+                class_dir,
+                cam_info,
+                out_rgb_dir,
+                out_depth_dir,
+                out_mask_dir,
+                out_cam_dir,
+            )
+            for basename in basenames
+        ]
+
+        for finished in tqdm(
+            as_completed(pending), total=len(pending), desc="Processing UrbanSyn"
+        ):
+            error = finished.result()
+            if error:
+                print(error)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_waymo.py b/extern/CUT3R/datasets_preprocess/preprocess_waymo.py
new file mode 100644
index 0000000000000000000000000000000000000000..6130bc932c0dd6fe33110f40d02fb2ff3b9e3552
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_waymo.py
@@ -0,0 +1,280 @@
+#!/usr/bin/env python3
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# Preprocessing code for the WayMo Open dataset
+# dataset at https://github.com/waymo-research/waymo-open-dataset
+# 1) Accept the license
+# 2) download all training/*.tfrecord files from Perception Dataset, version 1.4.2
+# 3) put all .tfrecord files in '/path/to/waymo_dir'
+# 4) install the waymo_open_dataset package with
+#    `python3 -m pip install gcsfs waymo-open-dataset-tf-2-12-0==1.6.4`
+# 5) execute this script as `python preprocess_waymo.py --waymo_dir /path/to/waymo_dir --precomputed_pairs /path/to/pairs.npz`
+# --------------------------------------------------------
+import sys
+import os
+import os.path as osp
+import shutil
+import json
+from tqdm import tqdm
+import PIL.Image
+import numpy as np
+
+os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
+import cv2
+
+import tensorflow.compat.v1 as tf
+
+tf.enable_eager_execution()
+
+import path_to_root  # noqa
+from src.dust3r.utils.geometry import geotrf, inv
+from src.dust3r.utils.image import imread_cv2
+from src.dust3r.utils.parallel import parallel_processes as parallel_map
+from datasets_preprocess.utils import cropping
+from src.dust3r.viz import show_raw_pointcloud
+
+
+def get_parser():
+    """Build the argument parser for the Waymo preprocessing script.
+
+    --waymo_dir and --precomputed_pairs are required; --output_dir defaults
+    to "data/waymo_processed" and --workers (int) defaults to 1.
+    """
+    import argparse
+
+    cli = argparse.ArgumentParser()
+    for required_flag in ("--waymo_dir", "--precomputed_pairs"):
+        cli.add_argument(required_flag, required=True)
+    cli.add_argument("--output_dir", default="data/waymo_processed")
+    cli.add_argument("--workers", type=int, default=1)
+    return cli
+
+
+def main(waymo_root, pairs_path, output_dir, workers=1):
+    """
+    Run the complete Waymo preprocessing pipeline.
+
+    Frames are first extracted from the .tfrecord files into
+    ``<output_dir>/tmp``, then cropped/rescaled into ``output_dir``. After
+    that every image referenced by the precomputed pairs file is checked for
+    existence, and finally the temporary directory is deleted.
+
+    Args:
+        waymo_root (str): Directory containing the .tfrecord files.
+        pairs_path (str): Path to the file loaded with np.load that provides
+            the arrays "scenes", "frames" and "pairs".
+        output_dir (str): Destination directory for the processed data.
+        workers (int): Number of parallel workers used for both stages.
+
+    Raises:
+        AssertionError: If an image referenced by a pair is missing.
+    """
+    extract_frames(waymo_root, output_dir, workers=workers)
+    # Use the function argument here (not the module-level `args`) so that
+    # main() also works when it is imported and called from other code.
+    make_crops(output_dir, workers=workers)
+
+    # make sure all pairs are there
+    with np.load(pairs_path) as data:
+        scenes = data["scenes"]
+        frames = data["frames"]
+        pairs = data["pairs"]  # (array of (scene_id, img1_id, img2_id)
+
+    for scene_id, im1_id, im2_id in pairs:
+        for im_id in (im1_id, im2_id):
+            path = osp.join(output_dir, scenes[scene_id], frames[im_id] + ".jpg")
+            assert osp.isfile(
+                path
+            ), f"Missing a file at {path=}\nDid you download all .tfrecord files?"
+
+    shutil.rmtree(osp.join(output_dir, "tmp"))
+    print("Done! all data generated at", output_dir)
+
+
+def _list_sequences(db_root):
+    """Return the sorted names of all entries in ``db_root`` ending in ".tfrecord".
+
+    Progress information (the directory searched and the number of matches)
+    is printed to stdout.
+    """
+    print(">> Looking for sequences in", db_root)
+    res = [entry for entry in os.listdir(db_root) if entry.endswith(".tfrecord")]
+    res.sort()
+    print(f"    found {len(res)} sequences")
+    return res
+
+
+def extract_frames(db_root, output_dir, workers=8):
+    """Extract every .tfrecord sequence found in ``db_root`` into ``<output_dir>/tmp``.
+
+    One process_one_seq() job is scheduled per sequence via parallel_map.
+    """
+    sequences = _list_sequences(db_root)
+    output_dir = osp.join(output_dir, "tmp")
+    print(">> outputing result to", output_dir)
+    jobs = []
+    for seq in sequences:
+        jobs.append((db_root, output_dir, seq))
+    parallel_map(process_one_seq, jobs, star_args=True, workers=workers)
+
+
+def process_one_seq(db_root, output_dir, seq):
+    """Dump the images and per-view arrays of one .tfrecord sequence to disk.
+
+    Files are written to ``<output_dir>/<seq>/`` as ``<frame>_<cam>.jpg`` and
+    ``<frame>_<cam>.npz``. ``calib.json`` is written last, so a sequence that
+    already has this file is treated as done and skipped. A RuntimeError
+    during extraction is reported on stderr and nothing is saved.
+    """
+    out_dir = osp.join(output_dir, seq)
+    os.makedirs(out_dir, exist_ok=True)
+    calib_path = osp.join(out_dir, "calib.json")
+    if osp.isfile(calib_path):
+        return
+
+    try:
+        with tf.device("/CPU:0"):
+            calib, frames = extract_frames_one_seq(osp.join(db_root, seq))
+    except RuntimeError:
+        print(f"/!\\ Error with sequence {seq} /!\\", file=sys.stderr)
+        return  # nothing is saved
+
+    for frame_idx, (_, views) in enumerate(tqdm(frames, leave=False)):
+        for cam_idx, view in views.items():
+            stem = osp.join(out_dir, f"{frame_idx:05d}_{cam_idx}")
+            # The image is popped so that only the remaining arrays go
+            # into the .npz file.
+            picture = PIL.Image.fromarray(view.pop("img"))
+            picture.save(stem + ".jpg")
+            np.savez(stem + ".npz", **view)
+
+    with open(calib_path, "w") as calib_file:
+        json.dump(calib, calib_file)
+
+
+def extract_frames_one_seq(filename):
+    """
+    Read one .tfrecord file and collect calibration plus per-camera views.
+
+    Returns:
+        tuple: ``(calib, frames)`` where
+            - ``calib`` is a list of ``(camera name, dict)`` pairs taken from
+              the first frame; each dict holds "width", "height",
+              "intrinsics" and "extrinsics" (both as flat lists);
+            - ``frames`` is a list of ``(frame.context.name, views)`` pairs,
+              ``views`` mapping the camera name of each image to a dict with
+              the keys "img", "pose", "pixels", "pts3d" and "timestamp".
+    """
+    # Imported lazily so the package is only required when frames are extracted
+    from waymo_open_dataset import dataset_pb2 as open_dataset
+    from waymo_open_dataset.utils import frame_utils
+
+    print(">> Opening", filename)
+    dataset = tf.data.TFRecordDataset(filename, compression_type="")
+
+    calib = None
+    frames = []
+
+    for data in tqdm(dataset, leave=False):
+        # Each record is one serialized Frame message
+        frame = open_dataset.Frame()
+        frame.ParseFromString(bytearray(data.numpy()))
+
+        content = frame_utils.parse_range_image_and_camera_projection(frame)
+        range_images, camera_projections, _, range_image_top_pose = content
+
+        # `views` is registered now and filled in place further down
+        views = {}
+        frames.append((frame.context.name, views))
+
+        # once in a sequence, read camera calibration info
+        if calib is None:
+            calib = []
+            for cam in frame.context.camera_calibrations:
+                calib.append(
+                    (
+                        cam.name,
+                        dict(
+                            width=cam.width,
+                            height=cam.height,
+                            intrinsics=list(cam.intrinsic),
+                            extrinsics=list(cam.extrinsic.transform),
+                        ),
+                    )
+                )
+
+        # convert LIDAR to pointcloud
+        points, cp_points = frame_utils.convert_range_image_to_point_cloud(
+            frame, range_images, camera_projections, range_image_top_pose
+        )
+
+        # 3d points in vehicle frame.
+        points_all = np.concatenate(points, axis=0)
+        cp_points_all = np.concatenate(cp_points, axis=0)
+
+        # Camera-projection records of all points as an int32 tensor. Below,
+        # column 0 is compared with the camera name and columns 1:3 are used
+        # as pixel coordinates.
+        cp_points_all_tensor = tf.constant(cp_points_all, dtype=tf.int32)
+
+        for i, image in enumerate(frame.images):
+            # select relevant 3D points for this view
+            mask = tf.equal(cp_points_all_tensor[..., 0], image.name)
+            cp_points_msk_tensor = tf.cast(
+                tf.gather_nd(cp_points_all_tensor, tf.where(mask)), dtype=tf.float32
+            )
+
+            pose = np.asarray(image.pose.transform).reshape(4, 4)
+            timestamp = image.pose_timestamp
+
+            rgb = tf.image.decode_jpeg(image.image).numpy()
+
+            # Pixel position of every selected point, with the same mask
+            # applied to the 3D points so both arrays stay aligned
+            pix = cp_points_msk_tensor[..., 1:3].numpy().round().astype(np.int16)
+            pts3d = points_all[mask.numpy()]
+
+            views[image.name] = dict(
+                img=rgb, pose=pose, pixels=pix, pts3d=pts3d, timestamp=timestamp
+            )
+
+        # Debug switch: a non-empty string is truthy, so this branch never
+        # runs; remove the `not` to visualise the point cloud.
+        if not "show full point cloud":
+            show_raw_pointcloud(
+                [v["pts3d"] for v in views.values()], [v["img"] for v in views.values()]
+            )
+
+    return calib, frames
+
+
+def make_crops(output_dir, workers=16, **kw):
+    """Rescale every sequence stored in ``<output_dir>/tmp`` into ``output_dir``.
+
+    One crop_one_seq() job is scheduled per sequence via parallel_map. Extra
+    keyword arguments are accepted but not used.
+    """
+    tmp_dir = osp.join(output_dir, "tmp")
+    jobs = []
+    for seq in _list_sequences(tmp_dir):
+        jobs.append((tmp_dir, output_dir, seq))
+    parallel_map(crop_one_seq, jobs, star_args=True, workers=workers, front_num=0)
+
+
+def crop_one_seq(input_dir, output_dir, seq, resolution=512):
+    """
+    Rescale the images of one extracted sequence and build sparse depth maps.
+
+    Reads ``calib.json`` and the ``.jpg`` / ``.npz`` pairs from
+    ``<input_dir>/<seq>`` and writes, per frame, a rescaled ``.jpg``, an
+    ``.exr`` depth map and an ``.npz`` with "intrinsics", "cam2world" and
+    "distortion" to ``<output_dir>/<seq>``.
+
+    Args:
+        input_dir (str): Directory holding the extracted sequences.
+        output_dir (str): Directory receiving the processed sequences.
+        seq (str): Name of the sequence sub-directory.
+        resolution (int): Value placed in the output resolution for the
+            larger image side (the other entry is set to 1).
+    """
+    seq_dir = osp.join(input_dir, seq)
+    out_dir = osp.join(output_dir, seq)
+    # The presence of this particular output file is used as the marker
+    # that the sequence was already processed.
+    if osp.isfile(osp.join(out_dir, "00100_1.jpg")):
+        return
+    os.makedirs(out_dir, exist_ok=True)
+
+    # load calibration file
+    try:
+        with open(osp.join(seq_dir, "calib.json")) as f:
+            calib = json.load(f)
+    except IOError:
+        print(f"/!\\ Error: Missing calib.json in sequence {seq} /!\\", file=sys.stderr)
+        return
+
+    # Axis permutation applied to points after moving them into the camera
+    # frame (see the X/Y/Z note below).
+    axes_transformation = np.array(
+        [[0, -1, 0, 0], [0, 0, -1, 0], [1, 0, 0, 0], [0, 0, 0, 1]]
+    )
+
+    # Per-camera calibration, keyed by the camera index as a string
+    cam_K = {}
+    cam_distortion = {}
+    cam_res = {}
+    cam_to_car = {}
+    for cam_idx, cam_info in calib:
+        cam_idx = str(cam_idx)
+        cam_res[cam_idx] = (W, H) = (cam_info["width"], cam_info["height"])
+        # The 9 intrinsic values: two focal lengths, principal point, then
+        # five distortion coefficients
+        f1, f2, cx, cy, k1, k2, p1, p2, k3 = cam_info["intrinsics"]
+        cam_K[cam_idx] = np.asarray([(f1, 0, cx), (0, f2, cy), (0, 0, 1)])
+        cam_distortion[cam_idx] = np.asarray([k1, k2, p1, p2, k3])
+        cam_to_car[cam_idx] = np.asarray(cam_info["extrinsics"]).reshape(
+            4, 4
+        )  # cam-to-vehicle
+
+    # Strip only "jpg": each name keeps its trailing dot, so an extension
+    # can be appended directly (frame + "npz", frame + "exr", ...).
+    frames = sorted(f[:-3] for f in os.listdir(seq_dir) if f.endswith(".jpg"))
+
+    # from dust3r.viz import SceneViz
+    # viz = SceneViz()
+
+    for frame in tqdm(frames, leave=False):
+        cam_idx = frame[-2]  # cam index (the character just before the dot)
+        assert cam_idx in "12345", f"bad {cam_idx=} in {frame=}"
+        data = np.load(osp.join(seq_dir, frame + "npz"))
+        car_to_world = data["pose"]
+        W, H = cam_res[cam_idx]
+
+        # load the projected pixel positions and their 3D points
+        # (x, y are recomputed after rescaling before they are used)
+        pos2d = data["pixels"].round().astype(np.uint16)
+        x, y = pos2d.T
+        pts3d = data["pts3d"]  # already in the car frame
+        pts3d = geotrf(axes_transformation @ inv(cam_to_car[cam_idx]), pts3d)
+        # X=LEFT_RIGHT y=ALTITUDE z=DEPTH
+
+        # load image
+        image = imread_cv2(osp.join(seq_dir, frame + "jpg"))
+
+        # downscale image; `resolution` goes to the larger side
+        # NOTE(review): presumably the 1 lets the helper derive the other
+        # side from the aspect ratio -- confirm in rescale_image_depthmap
+        output_resolution = (resolution, 1) if W > H else (1, resolution)
+        image, _, intrinsics2 = cropping.rescale_image_depthmap(
+            image, None, cam_K[cam_idx], output_resolution
+        )
+        image.save(osp.join(out_dir, frame + "jpg"), quality=80)
+
+        # Build a sparse depth map at the new image size: zero everywhere
+        # except at the re-projected pixels. It is saved as an EXR file
+        # (smaller and easier to load).
+        W, H = image.size
+        depthmap = np.zeros((H, W), dtype=np.float32)
+        # Map pixel positions from the original to the rescaled intrinsics
+        pos2d = (
+            geotrf(intrinsics2 @ inv(cam_K[cam_idx]), pos2d).round().astype(np.int16)
+        )
+        x, y = pos2d.T
+        # Coordinates outside the image are clamped onto the border pixels
+        depthmap[y.clip(min=0, max=H - 1), x.clip(min=0, max=W - 1)] = pts3d[:, 2]
+        cv2.imwrite(osp.join(out_dir, frame + "exr"), depthmap)
+
+        # save camera parameters
+        cam2world = car_to_world @ cam_to_car[cam_idx] @ inv(axes_transformation)
+        np.savez(
+            osp.join(out_dir, frame + "npz"),
+            intrinsics=intrinsics2,
+            cam2world=cam2world,
+            distortion=cam_distortion[cam_idx],
+        )
+
+        # viz.add_rgbd(np.asarray(image), depthmap, intrinsics2, cam2world)
+    # viz.show()
+
+
+if __name__ == "__main__":
+    # Script entry point. The parsed options are kept in the module-level
+    # name `args` and forwarded to main().
+    parser = get_parser()
+    args = parser.parse_args()
+    main(args.waymo_dir, args.precomputed_pairs, args.output_dir, workers=args.workers)
diff --git a/extern/CUT3R/datasets_preprocess/preprocess_wildrgbd.py b/extern/CUT3R/datasets_preprocess/preprocess_wildrgbd.py
new file mode 100644
index 0000000000000000000000000000000000000000..ccd0fc3a5738aedaccb22b7a6999fe010f74d4fc
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/preprocess_wildrgbd.py
@@ -0,0 +1,259 @@
+#!/usr/bin/env python3
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# Script to pre-process the WildRGB-D dataset.
+# Usage:
+# python3 datasets_preprocess/preprocess_wildrgbd.py --wildrgbd_dir /path/to/wildrgbd
+# --------------------------------------------------------
+
+import argparse
+import random
+import json
+import os
+import os.path as osp
+
+import PIL.Image
+import numpy as np
+import cv2
+
+from tqdm.auto import tqdm
+import matplotlib.pyplot as plt
+
+import path_to_root  # noqa
+import datasets_preprocess.utils.cropping as cropping  # noqa
+from dust3r.utils.image import imread_cv2
+
+
+def get_parser():
+    """Build the command-line parser for the WildRGB-D preprocessing script."""
+    cli = argparse.ArgumentParser()
+    # (flag, type, extra keyword arguments), registered in the original order
+    simple_options = (
+        ("--output_dir", str, {"default": "data/processed_wildrgbd"}),
+        ("--wildrgbd_dir", str, {"required": True}),
+        ("--train_num_sequences_per_object", int, {"default": 50}),
+        ("--test_num_sequences_per_object", int, {"default": 10}),
+        ("--num_frames", int, {"default": 100}),
+        ("--seed", int, {"default": 42}),
+    )
+    for flag, kind, extra in simple_options:
+        cli.add_argument(flag, type=kind, **extra)
+
+    size_help = "lower dimension will be >= img_size * 3/4, and max dimension will be >= img_size"
+    cli.add_argument("--img_size", type=int, default=512, help=size_help)
+    return cli
+
+
+def get_set_list(category_dir, split):
+    """Return the sequence names of ``category_dir`` that belong to ``split``.
+
+    "train" yields the sequences listed as train in both list files; any other
+    split yields every remaining listed sequence (train or val).
+    """
+    per_file = {"train": [], "val": []}
+    for list_name in ("camera_eval_list.json", "nvs_list.json"):
+        with open(osp.join(category_dir, list_name)) as handle:
+            listed = json.load(handle)
+        for subset, groups in per_file.items():
+            groups.append(set(listed[subset]))
+
+    common_train = set.intersection(*per_file["train"])
+    if split == "train":
+        return common_train
+    return set.union(*per_file["train"], *per_file["val"]).difference(common_train)
+
+
+def prepare_sequences(
+    category,
+    wildrgbd_dir,
+    output_dir,
+    img_size,
+    split,
+    max_num_sequences_per_object,
+    output_num_frames,
+    seed,
+):
+    """Crop, rescale and export a sample of sequences for one category.
+
+    At most ``max_num_sequences_per_object`` sequences of ``split`` are kept
+    (randomly sampled after seeding with ``seed`` when there are more), and
+    ``output_num_frames`` evenly spaced frames are exported per sequence as
+    rgb (jpg), depth (uint16 png), mask (png) and camera metadata (npz) under
+    ``output_dir/category/<sequence>``.
+
+    Returns a dict mapping each processed sequence name to its frame indices.
+    """
+    random.seed(seed)
+    category_dir = osp.join(wildrgbd_dir, category)
+    category_output_dir = osp.join(output_dir, category)
+
+    # Keep only the sequences that exist on disk. ``category_dir`` already
+    # starts with ``wildrgbd_dir``; prepending it a second time would point
+    # the check at a wrong path whenever the dataset path is relative.
+    sequences_all = []
+    for seq_name in sorted(get_set_list(category_dir, split)):
+        scene_dir = osp.join(category_dir, seq_name)
+        if not os.path.isdir(scene_dir):
+            print(f"{scene_dir} does not exist, skipped")
+            continue
+        sequences_all.append(seq_name)
+
+    if len(sequences_all) <= max_num_sequences_per_object:
+        selected_sequences = sequences_all
+    else:
+        selected_sequences = random.sample(sequences_all, max_num_sequences_per_object)
+
+    selected_sequences_numbers_dict = {}
+    for seq_name in tqdm(selected_sequences, leave=False):
+        scene_dir = osp.join(category_dir, seq_name)
+        scene_output_dir = osp.join(category_output_dir, seq_name)
+        with open(osp.join(scene_dir, "metadata"), "r") as f:
+            metadata = json.load(f)
+
+        # Only the focal lengths and principal point of the (transposed)
+        # stored matrix are kept, which rebuilds a skew-free intrinsics matrix.
+        K = np.array(metadata["K"]).reshape(3, 3).T
+        fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]
+        camera_intrinsics = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]])
+
+        # cam_poses.txt: one row per frame, the frame index followed by the
+        # 16 values that are reshaped into a 4x4 camera-to-world matrix.
+        camera_to_world_content = np.genfromtxt(osp.join(scene_dir, "cam_poses.txt"))
+        camera_to_world = camera_to_world_content[:, 1:].reshape(-1, 4, 4)
+
+        frame_idx = camera_to_world_content[:, 0]
+        num_frames = frame_idx.shape[0]
+        assert num_frames >= output_num_frames
+        assert np.all(frame_idx == np.arange(num_frames))
+
+        # evenly spaced frame indices covering the whole sequence
+        spaced = np.linspace(0, num_frames - 1, output_num_frames)
+        selected_frames = np.round(spaced).astype(int).tolist()
+        selected_sequences_numbers_dict[seq_name] = selected_frames
+
+        for frame_id in tqdm(selected_frames):
+            depth_path = os.path.join(scene_dir, "depth", f"{frame_id:0>5d}.png")
+            masks_path = os.path.join(scene_dir, "masks", f"{frame_id:0>5d}.png")
+            rgb_path = os.path.join(scene_dir, "rgb", f"{frame_id:0>5d}.png")
+
+            input_rgb_image = PIL.Image.open(rgb_path).convert("RGB")
+            input_mask = plt.imread(masks_path)
+            input_depthmap = imread_cv2(depth_path, cv2.IMREAD_UNCHANGED).astype(
+                np.float64
+            )
+            # depth and mask are stacked so both go through the same crop/rescale
+            depth_mask = np.stack((input_depthmap, input_mask), axis=-1)
+            H, W = input_depthmap.shape
+
+            min_margin_x = min(cx, W - cx)
+            min_margin_y = min(cy, H - cy)
+
+            # the new window will be a rectangle of size (2*min_margin_x, 2*min_margin_y) centered on (cx,cy)
+            l, t = int(cx - min_margin_x), int(cy - min_margin_y)
+            r, b = int(cx + min_margin_x), int(cy + min_margin_y)
+            crop_bbox = (l, t, r, b)
+            input_rgb_image, depth_mask, input_camera_intrinsics = (
+                cropping.crop_image_depthmap(
+                    input_rgb_image, depth_mask, camera_intrinsics, crop_bbox
+                )
+            )
+
+            # try to set the lower dimension to img_size * 3/4 -> img_size=512 => 384
+            scale_final = ((img_size * 3 // 4) / min(H, W)) + 1e-8
+            output_resolution = np.floor(np.array([W, H]) * scale_final).astype(int)
+            if max(output_resolution) < img_size:
+                # let's put the max dimension to img_size
+                scale_final = (img_size / max(H, W)) + 1e-8
+                output_resolution = np.floor(np.array([W, H]) * scale_final).astype(int)
+
+            input_rgb_image, depth_mask, input_camera_intrinsics = (
+                cropping.rescale_image_depthmap(
+                    input_rgb_image,
+                    depth_mask,
+                    input_camera_intrinsics,
+                    output_resolution,
+                )
+            )
+            input_depthmap = depth_mask[:, :, 0]
+            input_mask = depth_mask[:, :, 1]
+
+            # save the cropped image, depth, mask and camera metadata
+            frame_name = f"{frame_id:0>5d}"
+            for subdir in ("rgb", "depth", "masks", "metadata"):
+                os.makedirs(osp.join(scene_output_dir, subdir), exist_ok=True)
+
+            input_rgb_image.save(osp.join(scene_output_dir, "rgb", frame_name + ".jpg"))
+            cv2.imwrite(
+                osp.join(scene_output_dir, "depth", frame_name + ".png"),
+                input_depthmap.astype(np.uint16),
+            )
+            cv2.imwrite(
+                osp.join(scene_output_dir, "masks", frame_name + ".png"),
+                (input_mask * 255).astype(np.uint8),
+            )
+            np.savez(
+                osp.join(scene_output_dir, "metadata", frame_name + ".npz"),
+                camera_intrinsics=input_camera_intrinsics,
+                camera_pose=camera_to_world[frame_id],
+            )
+
+    return selected_sequences_numbers_dict
+
+
+if __name__ == "__main__":
+    parser = get_parser()
+    args = parser.parse_args()
+    assert args.wildrgbd_dir != args.output_dir
+
+    # a category is any dataset sub-directory that contains a "scenes" folder
+    categories = sorted(
+        dirname
+        for dirname in os.listdir(args.wildrgbd_dir)
+        if os.path.isdir(os.path.join(args.wildrgbd_dir, dirname, "scenes"))
+    )
+
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    splits_num_sequences_per_object = (
+        ("train", args.train_num_sequences_per_object),
+        ("test", args.test_num_sequences_per_object),
+    )
+    for split, num_sequences_per_object in splits_num_sequences_per_object:
+        selected_sequences_path = os.path.join(
+            args.output_dir, f"selected_seqs_{split}.json"
+        )
+        # the global summary is written last, so its presence means the split is done
+        if os.path.isfile(selected_sequences_path):
+            continue
+        all_selected_sequences = {}
+        for category in categories:
+            category_output_dir = osp.join(args.output_dir, category)
+            os.makedirs(category_output_dir, exist_ok=True)
+            category_selected_sequences_path = os.path.join(
+                category_output_dir, f"selected_seqs_{split}.json"
+            )
+            if os.path.isfile(category_selected_sequences_path):
+                # category already handled by a previous run: reuse its summary
+                with open(category_selected_sequences_path, "r") as fid:
+                    category_selected_sequences = json.load(fid)
+            else:
+                print(f"Processing {split} - category = {category}")
+                category_selected_sequences = prepare_sequences(
+                    category=category,
+                    wildrgbd_dir=args.wildrgbd_dir,
+                    output_dir=args.output_dir,
+                    img_size=args.img_size,
+                    split=split,
+                    max_num_sequences_per_object=num_sequences_per_object,
+                    output_num_frames=args.num_frames,
+                    # seed offset derived from the category name itself, not a fixed literal
+                    seed=args.seed + int(category.encode("utf-8").hex(), 16),
+                )
+                with open(category_selected_sequences_path, "w") as file:
+                    json.dump(category_selected_sequences, file)
+
+            all_selected_sequences[category] = category_selected_sequences
+        with open(selected_sequences_path, "w") as file:
+            json.dump(all_selected_sequences, file)
diff --git a/extern/CUT3R/datasets_preprocess/read_write_model.py b/extern/CUT3R/datasets_preprocess/read_write_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..4818fdbcaec1c7c3dc8bca0a0a92ce61abdd526f
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/read_write_model.py
@@ -0,0 +1,622 @@
+# Copyright (c) 2023, ETH Zurich and UNC Chapel Hill.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#
+#     * Neither the name of ETH Zurich and UNC Chapel Hill nor the names of
+#       its contributors may be used to endorse or promote products derived
+#       from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+
+import os
+import collections
+import numpy as np
+import struct
+import argparse
+
+
+# Record types describing a COLMAP model (camera models, cameras, images, points).
+CameraModel = collections.namedtuple("CameraModel", "model_id model_name num_params")
+Camera = collections.namedtuple("Camera", "id model width height params")
+# The typename is "Image" although the record is bound to the name BaseImage.
+BaseImage = collections.namedtuple(
+    "Image", "id qvec tvec camera_id name xys point3D_ids"
+)
+Point3D = collections.namedtuple(
+    "Point3D", "id xyz rgb error image_ids point2D_idxs"
+)
+
+
+class Image(BaseImage):  # image record extended with a rotation helper
+    def qvec2rotmat(self):  # matrix built from this image's ``qvec`` field
+        return qvec2rotmat(self.qvec)
+
+
+# Camera models known to this script, each described by its numeric id, its
+# name and the number of parameters it takes.
+CAMERA_MODELS = {
+    CameraModel(0, "SIMPLE_PINHOLE", 3),
+    CameraModel(1, "PINHOLE", 4),
+    CameraModel(2, "SIMPLE_RADIAL", 4),
+    CameraModel(3, "RADIAL", 5),
+    CameraModel(4, "OPENCV", 8),
+    CameraModel(5, "OPENCV_FISHEYE", 8),
+    CameraModel(6, "FULL_OPENCV", 12),
+    CameraModel(7, "FOV", 5),
+    CameraModel(8, "SIMPLE_RADIAL_FISHEYE", 4),
+    CameraModel(9, "RADIAL_FISHEYE", 5),
+    CameraModel(10, "THIN_PRISM_FISHEYE", 12),
+}
+
+# Lookup tables over the same records, keyed by model id and by model name.
+CAMERA_MODEL_IDS = {model.model_id: model for model in CAMERA_MODELS}
+CAMERA_MODEL_NAMES = {model.model_name: model for model in CAMERA_MODELS}
+
+
+def read_next_bytes(fid, num_bytes, format_char_sequence, endian_character="<"):
+    """Read ``num_bytes`` from a binary file and unpack them with ``struct``.
+
+    :param fid: Binary file object to read from.
+    :param num_bytes: Number of bytes to consume; must match the format size.
+    :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}.
+    :param endian_character: Any of {@, =, <, >, !}
+    :return: Tuple of read and unpacked values.
+    """
+    return struct.unpack(endian_character + format_char_sequence, fid.read(num_bytes))
+
+
+def write_next_bytes(fid, data, format_char_sequence, endian_character="<"):
+    """Pack ``data`` with ``struct`` and write the result to a binary file.
+
+    :param fid: Binary file object to write to.
+    :param data: Value to pack; several values must be wrapped in a list or tuple.
+    :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}.
+        Needs one format character per element of ``data``.
+    :param endian_character: Any of {@, =, <, >, !}
+    """
+    # A lone value is handled like a one-element sequence.
+    values = data if isinstance(data, (list, tuple)) else (data,)
+    layout = endian_character + format_char_sequence
+    packed = struct.pack(layout, *values)
+    fid.write(packed)
+
+
+def read_cameras_text(path):
+    """
+    see: src/colmap/scene/reconstruction.cc
+        void Reconstruction::WriteCamerasText(const std::string& path)
+        void Reconstruction::ReadCamerasText(const std::string& path)
+
+    Parse a text cameras file into a dict mapping camera id to ``Camera``.
+    Blank lines and lines starting with "#" are ignored.
+    """
+    cameras = {}
+    with open(path, "r") as fid:
+        # readline() returns "" only at end of file, which stops the loop
+        for raw_line in iter(fid.readline, ""):
+            entry = raw_line.strip()
+            if not entry or entry[0] == "#":
+                continue
+
+            # layout: CAMERA_ID MODEL WIDTH HEIGHT PARAMS[]
+            fields = entry.split()
+            cam_id = int(fields[0])
+            cameras[cam_id] = Camera(
+                id=cam_id,
+                model=fields[1],
+                width=int(fields[2]),
+                height=int(fields[3]),
+                params=np.array(tuple(map(float, fields[4:]))),
+            )
+    return cameras
+
+
+def read_cameras_binary(path_to_model_file):
+    """
+    see: src/colmap/scene/reconstruction.cc
+        void Reconstruction::WriteCamerasBinary(const std::string& path)
+        void Reconstruction::ReadCamerasBinary(const std::string& path)
+
+    Read a binary cameras file into a dict mapping camera id to ``Camera``.
+    """
+    cameras = {}
+    with open(path_to_model_file, "rb") as fid:
+        (num_cameras,) = read_next_bytes(fid, 8, "Q")
+        for _ in range(num_cameras):
+            # fixed-size record header: camera id, model id, width, height
+            camera_id, model_id, width, height = read_next_bytes(
+                fid, num_bytes=24, format_char_sequence="iiQQ"
+            )
+            camera_model = CAMERA_MODEL_IDS[model_id]
+            # the model decides how many float64 parameters follow the header
+            num_params = camera_model.num_params
+            params = read_next_bytes(
+                fid,
+                num_bytes=8 * num_params,
+                format_char_sequence="d" * num_params,
+            )
+            cameras[camera_id] = Camera(
+                id=camera_id,
+                model=camera_model.model_name,
+                width=width,
+                height=height,
+                params=np.array(params),
+            )
+        assert len(cameras) == num_cameras
+    return cameras
+
+
+def write_cameras_text(cameras, path):
+    """
+    see: src/colmap/scene/reconstruction.cc
+        void Reconstruction::WriteCamerasText(const std::string& path)
+        void Reconstruction::ReadCamerasText(const std::string& path)
+    Write ``cameras`` (a dict of camera records) as a text cameras file.
+    """
+    header_lines = [
+        "# Camera list with one line of data per camera:\n",
+        "#   CAMERA_ID, MODEL, WIDTH, HEIGHT, PARAMS[]\n",
+        "# Number of cameras: {}\n".format(len(cameras)),
+    ]
+    with open(path, "w") as fid:
+        fid.write("".join(header_lines))
+        for cam in cameras.values():
+            fields = (cam.id, cam.model, cam.width, cam.height, *cam.params)
+            fid.write(" ".join(map(str, fields)) + "\n")
+
+
+def write_cameras_binary(cameras, path_to_model_file):
+    """
+    see: src/colmap/scene/reconstruction.cc
+        void Reconstruction::WriteCamerasBinary(const std::string& path)
+        void Reconstruction::ReadCamerasBinary(const std::string& path)
+    Write ``cameras`` as a binary cameras file and return them unchanged.
+    """
+    with open(path_to_model_file, "wb") as fid:
+        write_next_bytes(fid, len(cameras), "Q")
+        for cam in cameras.values():
+            model_id = CAMERA_MODEL_NAMES[cam.model].model_id
+            write_next_bytes(fid, [cam.id, model_id, cam.width, cam.height], "iiQQ")
+            for value in cam.params:
+                write_next_bytes(fid, float(value), "d")
+    return cameras
+
+
+def read_images_text(path):
+    """
+    see: src/colmap/scene/reconstruction.cc
+        void Reconstruction::ReadImagesText(const std::string& path)
+        void Reconstruction::WriteImagesText(const std::string& path)
+
+    Parse a text images file into a dict mapping image id to ``Image``.
+    Each image takes two lines: its pose/camera/name, then its 2D points.
+    """
+    images = {}
+    with open(path, "r") as fid:
+        # readline() returns "" only at end of file, which stops the loop
+        for raw_line in iter(fid.readline, ""):
+            header = raw_line.strip()
+            if not header or header[0] == "#":
+                continue
+
+            # first line: IMAGE_ID, QW, QX, QY, QZ, TX, TY, TZ, CAMERA_ID, NAME
+            fields = header.split()
+            image_id = int(fields[0])
+            qvec = np.array(tuple(map(float, fields[1:5])))
+            tvec = np.array(tuple(map(float, fields[5:8])))
+            camera_id = int(fields[8])
+            image_name = fields[9]
+            # second line: flat (X, Y, POINT3D_ID) triplets
+            triplets = fid.readline().split()
+            xs = tuple(map(float, triplets[0::3]))
+            ys = tuple(map(float, triplets[1::3]))
+            images[image_id] = Image(
+                id=image_id,
+                qvec=qvec,
+                tvec=tvec,
+                camera_id=camera_id,
+                name=image_name,
+                xys=np.column_stack([xs, ys]),
+                point3D_ids=np.array(tuple(map(int, triplets[2::3]))),
+            )
+    return images
+
+
+def read_images_binary(path_to_model_file):
+    """
+    see: src/colmap/scene/reconstruction.cc
+        void Reconstruction::ReadImagesBinary(const std::string& path)
+        void Reconstruction::WriteImagesBinary(const std::string& path)
+
+    Read a binary images file into a dict mapping image id to ``Image``.
+    """
+    images = {}
+    with open(path_to_model_file, "rb") as fid:
+        (num_reg_images,) = read_next_bytes(fid, 8, "Q")
+        for _ in range(num_reg_images):
+            # fixed 64-byte header: image id, qvec (4), tvec (3), camera id
+            props = read_next_bytes(
+                fid, num_bytes=64, format_char_sequence="idddddddi"
+            )
+            image_id, camera_id = props[0], props[8]
+            qvec = np.array(props[1:5])
+            tvec = np.array(props[5:8])
+
+            # the image name is stored as a NUL-terminated byte string
+            name_bytes = b""
+            while True:
+                (next_char,) = read_next_bytes(fid, 1, "c")
+                if next_char == b"\x00":
+                    break
+                name_bytes += next_char
+            image_name = name_bytes.decode("utf-8")
+
+            # 2D observations are stored as (x, y, point3D id) triplets
+            (num_points2D,) = read_next_bytes(fid, num_bytes=8, format_char_sequence="Q")
+            flat = read_next_bytes(
+                fid,
+                num_bytes=24 * num_points2D,
+                format_char_sequence="ddq" * num_points2D,
+            )
+            xs = tuple(map(float, flat[0::3]))
+            ys = tuple(map(float, flat[1::3]))
+            images[image_id] = Image(
+                id=image_id,
+                qvec=qvec,
+                tvec=tvec,
+                camera_id=camera_id,
+                name=image_name,
+                xys=np.column_stack([xs, ys]),
+                point3D_ids=np.array(tuple(map(int, flat[2::3]))),
+            )
+    return images
+
+
+def write_images_text(images, path):
+    """
+    see: src/colmap/scene/reconstruction.cc
+        void Reconstruction::ReadImagesText(const std::string& path)
+        void Reconstruction::WriteImagesText(const std::string& path)
+
+    Write ``images`` as a text images file, two lines per image: the pose,
+    camera id and name first, then the 2D points as (X, Y, POINT3D_ID).
+    """
+    num_images = len(images)
+    # integer 0 for an empty model, true division otherwise
+    mean_observations = 0
+    if num_images != 0:
+        total_observations = sum(len(img.point3D_ids) for img in images.values())
+        mean_observations = total_observations / num_images
+
+    header_lines = [
+        "# Image list with two lines of data per image:\n",
+        "#   IMAGE_ID, QW, QX, QY, QZ, TX, TY, TZ, CAMERA_ID, NAME\n",
+        "#   POINTS2D[] as (X, Y, POINT3D_ID)\n",
+        "# Number of images: {}, mean observations per image: {}\n".format(
+            num_images, mean_observations
+        ),
+    ]
+
+    with open(path, "w") as fid:
+        fid.write("".join(header_lines))
+        for img in images.values():
+            # first line of the pair: id, quaternion, translation, camera, name
+            pose_fields = [img.id, *img.qvec, *img.tvec, img.camera_id, img.name]
+            fid.write(" ".join(map(str, pose_fields)) + "\n")
+
+            # one "x y point3D_id" group per observation, all on a single line
+            observations = [
+                " ".join(map(str, [*xy, point3D_id]))
+                for xy, point3D_id in zip(img.xys, img.point3D_ids)
+            ]
+            fid.write(" ".join(observations) + "\n")
+
+
+def write_images_binary(images, path_to_model_file):
+    """
+    see: src/colmap/scene/reconstruction.cc
+        void Reconstruction::ReadImagesBinary(const std::string& path)
+        void Reconstruction::WriteImagesBinary(const std::string& path)
+
+    Write ``images`` as a binary images file. Each name is stored UTF-8
+    encoded and NUL-terminated, so non-ASCII names are supported.
+    """
+    with open(path_to_model_file, "wb") as fid:
+        write_next_bytes(fid, len(images), "Q")
+        for _, img in images.items():
+            pose = [img.id, *img.qvec.tolist(), *img.tvec.tolist(), img.camera_id]
+            write_next_bytes(fid, pose, "idddddddi")
+            # encode the whole name: packing per character with "c" fails on multi-byte UTF-8
+            fid.write(img.name.encode("utf-8") + b"\x00")
+            write_next_bytes(fid, len(img.point3D_ids), "Q")
+            for xy, p3d_id in zip(img.xys, img.point3D_ids):
+                write_next_bytes(fid, [*xy, p3d_id], "ddq")
+
+
+def read_points3D_text(path):
+    """
+    see: src/colmap/scene/reconstruction.cc
+        void Reconstruction::ReadPoints3DText(const std::string& path)
+        void Reconstruction::WritePoints3DText(const std::string& path)
+
+    Parse a text points3D file into a dict mapping point id to ``Point3D``.
+    Blank lines and lines starting with "#" are ignored.
+    """
+    points3D = {}
+    with open(path, "r") as fid:
+        # readline() returns "" only at end of file, which stops the loop
+        for raw_line in iter(fid.readline, ""):
+            entry = raw_line.strip()
+            if not entry or entry[0] == "#":
+                continue
+
+            # layout: POINT3D_ID, X, Y, Z, R, G, B, ERROR, then the track as
+            # alternating (IMAGE_ID, POINT2D_IDX) values
+            fields = entry.split()
+            point3D_id = int(fields[0])
+            points3D[point3D_id] = Point3D(
+                id=point3D_id,
+                xyz=np.array(tuple(map(float, fields[1:4]))),
+                rgb=np.array(tuple(map(int, fields[4:7]))),
+                error=float(fields[7]),
+                image_ids=np.array(tuple(map(int, fields[8::2]))),
+                point2D_idxs=np.array(tuple(map(int, fields[9::2]))),
+            )
+    return points3D
+
+
+def read_points3D_binary(path_to_model_file):
+    """
+    see: src/colmap/scene/reconstruction.cc
+        void Reconstruction::ReadPoints3DBinary(const std::string& path)
+        void Reconstruction::WritePoints3DBinary(const std::string& path)
+
+    Read a binary points3D file into a dict mapping point id to ``Point3D``.
+    """
+    points3D = {}
+    with open(path_to_model_file, "rb") as fid:
+        (num_points,) = read_next_bytes(fid, 8, "Q")
+        for _ in range(num_points):
+            # fixed 43-byte header: point id, xyz (3), rgb (3), error
+            props = read_next_bytes(
+                fid, num_bytes=43, format_char_sequence="QdddBBBd"
+            )
+            point3D_id = props[0]
+
+            # the track follows: its length, then (image id, 2D point index) pairs
+            (track_length,) = read_next_bytes(
+                fid, num_bytes=8, format_char_sequence="Q"
+            )
+            track_elems = read_next_bytes(
+                fid,
+                num_bytes=8 * track_length,
+                format_char_sequence="ii" * track_length,
+            )
+            points3D[point3D_id] = Point3D(
+                id=point3D_id,
+                xyz=np.array(props[1:4]),
+                rgb=np.array(props[4:7]),
+                error=np.array(props[7]),
+                image_ids=np.array(tuple(map(int, track_elems[0::2]))),
+                point2D_idxs=np.array(tuple(map(int, track_elems[1::2]))),
+            )
+    return points3D
+
+
+def write_points3D_text(points3D, path):
+    """
+    see: src/colmap/scene/reconstruction.cc
+        void Reconstruction::ReadPoints3DText(const std::string& path)
+        void Reconstruction::WritePoints3DText(const std::string& path)
+
+    Write ``points3D`` as a text points3D file, one line per point.
+    """
+    num_points = len(points3D)
+    total_track = sum(len(pt.image_ids) for pt in points3D.values())
+    mean_track_length = total_track / num_points if num_points else 0
+    header_lines = [
+        "# 3D point list with one line of data per point:\n",
+        "#   POINT3D_ID, X, Y, Z, R, G, B, ERROR, TRACK[] as (IMAGE_ID, POINT2D_IDX)\n",
+        "# Number of points: {}, mean track length: {}\n".format(
+            num_points, mean_track_length
+        ),
+    ]
+
+    with open(path, "w") as fid:
+        fid.write("".join(header_lines))
+        for pt in points3D.values():
+            point_fields = [pt.id, *pt.xyz, *pt.rgb, pt.error]
+            track_fields = [
+                " ".join(map(str, (image_id, point2D)))
+                for image_id, point2D in zip(pt.image_ids, pt.point2D_idxs)
+            ]
+            fid.write(" ".join(map(str, point_fields)) + " ")
+            fid.write(" ".join(track_fields) + "\n")
+
+
+def write_points3D_binary(points3D, path_to_model_file):
+    """
+    see: src/colmap/scene/reconstruction.cc
+        void Reconstruction::ReadPoints3DBinary(const std::string& path)
+        void Reconstruction::WritePoints3DBinary(const std::string& path)
+
+    Write ``points3D`` as a binary points3D file.
+    """
+    with open(path_to_model_file, "wb") as fid:
+        write_next_bytes(fid, len(points3D), "Q")
+        for pt in points3D.values():
+            header = [pt.id, *pt.xyz.tolist(), *pt.rgb.tolist(), pt.error]
+            write_next_bytes(fid, header, "QdddBBBd")
+            # track: its length, then one (image id, 2D point index) pair each
+            write_next_bytes(fid, pt.image_ids.shape[0], "Q")
+            for image_id, point2D_id in zip(pt.image_ids, pt.point2D_idxs):
+                write_next_bytes(fid, [image_id, point2D_id], "ii")
+
+
+def detect_model_format(path, ext):
+    """Return True when ``path`` holds cameras, images and points3D files
+    with extension ``ext`` (the format is then announced on stdout)."""
+    # files are checked in order; the first missing one ends the search
+    for stem in ("cameras", "images", "points3D"):
+        if not os.path.isfile(os.path.join(path, stem + ext)):
+            return False
+
+    print("Detected model format: '" + ext + "'")
+    return True
+
+
+def read_model(path, ext=""):
+    """Load (cameras, images, points3D) from ``path``; ``ext=""`` auto-detects
+    ".bin" then ".txt" and returns None (after a hint) when neither is found."""
+    if ext == "":
+        # try to detect the extension automatically (lazily, ".bin" first)
+        candidates = (e for e in (".bin", ".txt") if detect_model_format(path, e))
+        ext = next(candidates, "")
+        if ext == "":
+            print("Provide model format: '.bin' or '.txt'")
+            return
+
+    if ext == ".txt":
+        readers = (read_cameras_text, read_images_text, read_points3D_text)
+    else:
+        readers = (read_cameras_binary, read_images_binary, read_points3D_binary)
+    cameras, images, points3D = (
+        reader(os.path.join(path, stem + ext))
+        for reader, stem in zip(readers, ("cameras", "images", "points3D"))
+    )
+    return cameras, images, points3D
+
+
+def write_model(cameras, images, points3D, path, ext=".bin"):
+    """Write the model files into ``path`` (text if ext is ".txt", else binary)."""
+    if ext == ".txt":
+        writers = (write_cameras_text, write_images_text, write_points3D_text)
+    else:
+        writers = (write_cameras_binary, write_images_binary, write_points3D_binary)
+    stems = ("cameras", "images", "points3D")
+    for write, data, stem in zip(writers, (cameras, images, points3D), stems):
+        write(data, os.path.join(path, stem + ext))
+    return cameras, images, points3D
+
+
+def qvec2rotmat(qvec):
+    """Convert a quaternion stored as (w, x, y, z) into a 3x3 rotation matrix."""
+    # the quaternion is used as given (no normalisation is applied)
+    w, x, y, z = qvec[0], qvec[1], qvec[2], qvec[3]
+    row_x = [
+        1 - 2 * y**2 - 2 * z**2,
+        2 * x * y - 2 * w * z,
+        2 * z * x + 2 * w * y,
+    ]
+    row_y = [
+        2 * x * y + 2 * w * z,
+        1 - 2 * x**2 - 2 * z**2,
+        2 * y * z - 2 * w * x,
+    ]
+    row_z = [
+        2 * z * x - 2 * w * y,
+        2 * y * z + 2 * w * x,
+        1 - 2 * x**2 - 2 * y**2,
+    ]
+    return np.array([row_x, row_y, row_z])
+
+
+def rotmat2qvec(R):
+    """Convert a 3x3 rotation matrix into a quaternion ordered (w, x, y, z).
+
+    Uses the top eigenvector of a 4x4 matrix built from R; the sign is set so w >= 0.
+    """
+    Rxx, Ryx, Rzx, Rxy, Ryy, Rzy, Rxz, Ryz, Rzz = R.flat
+    # only the lower triangle is filled in, which is what np.linalg.eigh reads by default
+    lower = [
+        [Rxx - Ryy - Rzz, 0, 0, 0],
+        [Ryx + Rxy, Ryy - Rxx - Rzz, 0, 0],
+        [Rzx + Rxz, Rzy + Ryz, Rzz - Rxx - Ryy, 0],
+        [Ryz - Rzy, Rzx - Rxz, Rxy - Ryx, Rxx + Ryy + Rzz],
+    ]
+    eigvals, eigvecs = np.linalg.eigh(np.array(lower) / 3.0)
+    qvec = eigvecs[[3, 0, 1, 2], np.argmax(eigvals)]
+    if qvec[0] < 0:
+        qvec *= -1
+    return qvec
+
+
+def run(input_model, output_model):
+    """Convert the binary model in ``input_model`` to text files in ``output_model``.
+
+    Nothing is written when all three text files already exist there.
+    """
+    text_files = ("cameras.txt", "images.txt", "points3D.txt")
+    if all(os.path.exists(os.path.join(output_model, name)) for name in text_files):
+        print("Model already exists")
+        return
+
+    # the input is always read as ".bin" and the output written as ".txt"
+    cameras, images, points3D = read_model(path=input_model, ext=".bin")
+    write_model(
+        cameras,
+        images,
+        points3D,
+        path=output_model,
+        ext=".txt",
+    )
+
+
+def main():
+    """Command-line entry point: read a COLMAP model, print its size, rewrite it."""
+    parser = argparse.ArgumentParser(
+        description="Read and write COLMAP binary and text models"
+    )
+    formats = [".bin", ".txt"]
+    parser.add_argument("--input_model", default="")
+    parser.add_argument(
+        "--input_format", choices=formats, help="input model format", default=".bin"
+    )
+    parser.add_argument("--output_model", default=".")
+    parser.add_argument(
+        "--output_format", choices=formats, help="output model format", default=".txt"
+    )
+    args = parser.parse_args()
+
+    cameras, images, points3D = read_model(path=args.input_model, ext=args.input_format)
+
+    for label, collection in (
+        ("num_cameras:", cameras),
+        ("num_images:", images),
+        ("num_points3D:", points3D),
+    ):
+        print(label, len(collection))
+    # --output_model defaults to ".", so the model is written unless it is None
+    if args.output_model is None:
+        return
+    write_model(
+        cameras,
+        images,
+        points3D,
+        path=args.output_model,
+        ext=args.output_format,
+    )
+
+
+if __name__ == "__main__":  # run the converter only when executed as a script
+    main()
diff --git a/extern/CUT3R/datasets_preprocess/utils/cropping.py b/extern/CUT3R/datasets_preprocess/utils/cropping.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa3ac5daceaced49764d1902a9e950e404502692
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/utils/cropping.py
@@ -0,0 +1,169 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# cropping utilities
+# --------------------------------------------------------
+import PIL.Image
+import os
+
+os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"  # set before cv2 is imported just below
+import cv2  # noqa
+import numpy as np  # noqa
+
+try:  # prefer the PIL.Image.Resampling names when this Pillow provides them
+    lanczos = PIL.Image.Resampling.LANCZOS
+    bicubic = PIL.Image.Resampling.BICUBIC
+except AttributeError:  # no PIL.Image.Resampling: fall back to the module-level constants
+    lanczos = PIL.Image.LANCZOS
+    bicubic = PIL.Image.BICUBIC
+
+
+def colmap_to_opencv_intrinsics(K):
+    """
+    Return a copy of K converted from the Colmap to the OpenCV pixel convention.
+    The center of the top-left pixel is (0.5, 0.5) in Colmap and (0, 0) in
+    OpenCV, so both principal-point entries, K[0, 2] and K[1, 2], are reduced
+    by 0.5. The input matrix itself is not modified.
+    """
+    shifted = K.copy()
+    # one in-place update covers cx (row 0) and cy (row 1)
+    shifted[:2, 2] -= 0.5
+    return shifted
+
+
+def opencv_to_colmap_intrinsics(K):
+    """
+    Return a copy of K converted from the OpenCV to the Colmap pixel convention.
+    The center of the top-left pixel is (0, 0) in OpenCV and (0.5, 0.5) in
+    Colmap, so both principal-point entries, K[0, 2] and K[1, 2], are raised
+    by 0.5. The input matrix itself is not modified.
+    """
+    shifted = K.copy()
+    # one in-place update covers cx (row 0) and cy (row 1)
+    shifted[:2, 2] += 0.5
+    return shifted
+
+
+class ImageList:
+    """Convenience class to apply the same operation to a whole set of images."""
+
+    def __init__(self, images):
+        if not isinstance(images, (tuple, list, set)):
+            images = [images]  # a lone image becomes a one-element list
+        self.images = [
+            im if isinstance(im, PIL.Image.Image) else PIL.Image.fromarray(im)
+            for im in images
+        ]
+
+    def __len__(self):
+        return len(self.images)
+
+    def to_pil(self):
+        """Return the single image, or a tuple of images when there are several."""
+        return self.images[0] if len(self.images) <= 1 else tuple(self.images)
+
+    @property
+    def size(self):
+        all_sizes = [im.size for im in self.images]
+        assert all(all_sizes[0] == s for s in all_sizes)  # every image must share one size
+        return all_sizes[0]
+
+    def resize(self, *args, **kwargs):
+        return ImageList(self._dispatch("resize", *args, **kwargs))
+
+    def crop(self, *args, **kwargs):
+        return ImageList(self._dispatch("crop", *args, **kwargs))
+
+    def _dispatch(self, func, *args, **kwargs):
+        return [getattr(member, func)(*args, **kwargs) for member in self.images]
+
+
+def rescale_image_depthmap(
+    image, depthmap, camera_intrinsics, output_resolution, force=True
+):
+    """Jointly rescale an image (or several), its depthmap and its intrinsics.
+
+    A single scale factor is used, the larger of the per-axis ratios
+    output_resolution / input size, so that (out_width, out_height) >= output_res.
+    With force=False nothing is resized when that factor is >= 1.
+    """
+    images = ImageList(image)
+    in_res = np.array(images.size)  # (W,H)
+    target_res = np.array(output_resolution)
+    if depthmap is not None:
+        # can also use this with masks instead of depthmaps
+        assert tuple(depthmap.shape[:2]) == images.size[::-1]
+
+    # define output resolution
+    assert target_res.shape == (2,)
+    scale_final = max(target_res / images.size) + 1e-8
+    if scale_final >= 1 and not force:  # image is already smaller than what is asked
+        return (images.to_pil(), depthmap, camera_intrinsics)
+    out_res = np.floor(in_res * scale_final).astype(int)
+
+    # first rescale the image so that it contains the crop
+    pil_filter = lanczos if scale_final < 1 else bicubic
+    images = images.resize(tuple(out_res), resample=pil_filter)
+    if depthmap is not None:
+        depthmap = cv2.resize(
+            depthmap,
+            out_res,
+            fx=scale_final,
+            fy=scale_final,
+            interpolation=cv2.INTER_NEAREST,
+        )
+
+    # no offset here; simple rescaling
+    intrinsics = camera_matrix_of_crop(camera_intrinsics, in_res, out_res, scaling=scale_final)
+
+    return images.to_pil(), depthmap, intrinsics
+
+
+def camera_matrix_of_crop(
+    input_camera_matrix,
+    input_resolution,
+    output_resolution,
+    scaling=1,
+    offset_factor=0.5,
+    offset=None,
+):
+    """Intrinsics after scaling by `scaling` and cropping to output_resolution.
+
+    When `offset` is None the crop origin is offset_factor * margins, where
+    margins = input_resolution * scaling - output_resolution (asserted >= 0).
+    """
+    margins = np.asarray(input_resolution) * scaling - output_resolution
+    assert np.all(margins >= 0.0)
+    crop_origin = offset_factor * margins if offset is None else offset
+    # Scale and shift in the Colmap pixel convention, then convert back.
+    K = opencv_to_colmap_intrinsics(input_camera_matrix)
+    K[:2, :] *= scaling
+    K[:2, 2] -= crop_origin
+    return colmap_to_opencv_intrinsics(K)
+
+
+def crop_image_depthmap(image, depthmap, camera_intrinsics, crop_bbox):
+    """
+    Return a crop of the input view.
+    crop_bbox is (left, top, right, bottom); the image(s) and depthmap are cut
+    to it and the principal point of a copy of the intrinsics is shifted by
+    (left, top).
+    """
+    left, top, right, bottom = crop_bbox
+
+    cropped = ImageList(image).crop((left, top, right, bottom))
+    depthmap = depthmap[top:bottom, left:right]
+
+    K = camera_intrinsics.copy()
+    K[:2, 2] -= (left, top)
+    return cropped.to_pil(), depthmap, K
+
+
+def bbox_from_intrinsics_in_out(
+    input_camera_matrix, output_camera_matrix, output_resolution
+):
+    # Crop box (l, t, r, b): its origin is the rounded principal-point shift between the matrices.
+    out_width, out_height = output_resolution
+    left, top = np.int32(np.round(input_camera_matrix[:2, 2] - output_camera_matrix[:2, 2]))
+    return (left, top, left + out_width, top + out_height)
diff --git a/extern/CUT3R/datasets_preprocess/utils/parallel.py b/extern/CUT3R/datasets_preprocess/utils/parallel.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fa1012c820a1aecf56282f76029fdc2086b7b6b
--- /dev/null
+++ b/extern/CUT3R/datasets_preprocess/utils/parallel.py
@@ -0,0 +1,90 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# utility functions for multiprocessing
+# --------------------------------------------------------
+from tqdm import tqdm
+from multiprocessing.dummy import Pool as ThreadPool
+from multiprocessing import cpu_count
+
+
+def parallel_threads(
+    function,
+    args,
+    workers=0,
+    star_args=False,
+    kw_args=False,
+    front_num=1,
+    Pool=ThreadPool,
+    **tqdm_kw
+):
+    """Map `function` over `args` with a worker pool and a tqdm progress bar.
+
+    Returns, in input order,
+      res = [ function(arg) # default
+              function(*arg) # if star_args is True
+              function(**arg) # if kw_args is True
+              for arg in args]
+
+    workers <= 0 is increased by cpu_count() until it is positive, and
+    workers == 1 runs everything sequentially. The first <front_num> elements
+    are always run sequentially, outside the pool (useful for debugging).
+    """
+    while workers <= 0:
+        workers += cpu_count()
+    if workers == 1:
+        front_num = float("inf")
+
+    # total for the progress bar, when args has a length
+    try:
+        n_args_parallel = len(args) - front_num
+    except TypeError:
+        n_args_parallel = None
+    remaining = iter(args)
+
+    # sequential execution first
+    front = []
+    while len(front) < front_num:
+        try:
+            item = next(remaining)
+        except StopIteration:
+            return front  # end of the iterable
+        if star_args:
+            front.append(function(*item))
+        elif kw_args:
+            front.append(function(**item))
+        else:
+            front.append(function(item))
+
+    # then parallel execution
+    with Pool(workers) as pool:
+        if star_args:
+            results = pool.imap(starcall, [(function, a) for a in remaining])
+        elif kw_args:
+            results = pool.imap(starstarcall, [(function, a) for a in remaining])
+        else:
+            results = pool.imap(function, remaining)
+        # drain the results (and advance the progress bar) before leaving the with block
+        out = list(tqdm(results, total=n_args_parallel, **tqdm_kw))
+    return front + out
+
+
+def parallel_processes(*args, **kwargs):
+    """Same as parallel_threads, but backed by multiprocessing.Pool (processes).
+
+    A Pool keyword supplied by the caller is replaced."""
+    from multiprocessing import Pool as ProcessPool
+    return parallel_threads(*args, **{**kwargs, "Pool": ProcessPool})
+
+
+def starcall(args):
+    """Unpack a (function, positional_args) pair and return function(*positional_args)."""
+    function, positional = args
+    return function(*positional)
+
+
+def starstarcall(args):
+    """Unpack a (function, keyword_args) pair and return function(**keyword_args)."""
+    function, keywords = args
+    return function(**keywords)
diff --git a/extern/CUT3R/demo.py b/extern/CUT3R/demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..df60dbd62ec06dd3e8789661070b92333edf5e28
--- /dev/null
+++ b/extern/CUT3R/demo.py
@@ -0,0 +1,422 @@
+#!/usr/bin/env python3
+"""
+3D Point Cloud Inference and Visualization Script
+
+This script performs inference using the ARCroco3DStereo model and visualizes the
+resulting 3D point clouds with the PointCloudViewer. Use the command-line arguments
+to adjust parameters such as the model checkpoint path, image sequence directory,
+image size, device, etc.
+
+Usage:
+    python demo.py [--model_path MODEL_PATH] [--seq_path SEQ_PATH] [--size IMG_SIZE]
+                            [--device DEVICE] [--vis_threshold VIS_THRESHOLD] [--output_dir OUT_DIR]
+
+Example:
+    python demo.py --model_path src/cut3r_512_dpt_4_64.pth \
+        --seq_path examples/001 --device cuda --size 512
+"""
+
+import os
+import numpy as np
+import torch
+import time
+import glob
+import random
+import cv2
+import argparse
+import tempfile
+import shutil
+from copy import deepcopy
+from add_ckpt_path import add_path_to_dust3r
+import imageio.v2 as iio
+
+# Seed Python's `random` module only; numpy and torch are not seeded here.
+random.seed(42)
+
+
+def parse_args():
+    """Build the demo's command-line parser and return the parsed arguments."""
+    parser = argparse.ArgumentParser(
+        description="Run 3D point cloud inference and visualization using ARCroco3DStereo."
+    )
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        default="src/cut3r_512_dpt_4_64.pth",
+        help="Path to the pretrained model checkpoint.",
+    )
+    parser.add_argument(
+        "--seq_path",
+        type=str,
+        default="",
+        help="Path to a directory of images, or to a video file, to run on.",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda",
+        help="Device to run inference on (e.g., 'cuda' or 'cpu').",
+    )
+    parser.add_argument(
+        "--size",
+        type=int,
+        default=512,
+        help="Shape that input images will be rescaled to; if using 224+linear model, choose 224 otherwise 512",
+    )
+    parser.add_argument(
+        "--vis_threshold",
+        type=float,
+        default=1.5,
+        help="Visualization threshold for the point cloud viewer. Ranging from 1 to INF",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="./demo_tmp",
+        help="Directory where depth, confidence, color and camera outputs are saved.",
+    )
+
+    return parser.parse_args()
+
+
+def prepare_input(
+    img_paths, img_mask, size, raymaps=None, raymap_mask=None, revisit=1, update=True
+):
+    """
+    Build the list of input views for inference from image paths and optional ray maps.
+
+    Args:
+        img_paths (list): List of image file paths.
+        img_mask (list of bool): Per-view flags, True where the view carries an image.
+        size (int): Target image size, forwarded to load_images.
+        raymaps (list, optional): List of ray maps.
+        raymap_mask (list, optional): Per-view flags, True where the view carries a ray map.
+        revisit (int): Number of passes over the views; values > 1 repeat the whole list.
+        update (bool): When False, every pass after the first gets "update" set to False.
+
+    Returns:
+        list: A list of view dictionaries.
+    """
+    # Import image loader (delayed import needed after adding ckpt path).
+    from src.dust3r.utils.image import load_images
+
+    def flag(value):
+        return torch.tensor(value).unsqueeze(0)
+
+    def identity_pose():
+        return torch.from_numpy(np.eye(4, dtype=np.float32)).unsqueeze(0)
+
+    images = load_images(img_paths, size=size)
+    views = []
+
+    if raymaps is None and raymap_mask is None:
+        # Only images are provided.
+        for i, loaded in enumerate(images):
+            img = loaded["img"]
+            nan_raymap = torch.full(
+                (img.shape[0], 6, img.shape[-2], img.shape[-1]), torch.nan
+            )
+            views.append(
+                {
+                    "img": img,
+                    "ray_map": nan_raymap,
+                    "true_shape": torch.from_numpy(loaded["true_shape"]),
+                    "idx": i,
+                    "instance": str(i),
+                    "camera_pose": identity_pose(),
+                    "img_mask": flag(True),
+                    "ray_mask": flag(False),
+                    "update": flag(True),
+                    "reset": flag(False),
+                }
+            )
+    else:
+        # Combine images and raymaps.
+        num_views = len(images) + len(raymaps)
+        assert len(img_mask) == len(raymap_mask) == num_views
+        assert sum(img_mask) == len(images) and sum(raymap_mask) == len(raymaps)
+
+        j = 0  # index of the next image to place
+        k = 0  # index of the next ray map to place
+        for i in range(num_views):
+            has_img, has_ray = img_mask[i], raymap_mask[i]
+            if has_img:
+                img = images[j]["img"]
+                true_shape = torch.from_numpy(images[j]["true_shape"])
+            else:
+                img = torch.full_like(images[0]["img"], torch.nan)
+                true_shape = torch.from_numpy(np.int32([raymaps[k].shape[1:-1][::-1]]))
+            if has_ray:
+                ray_map = raymaps[k]
+            else:
+                ray_map = torch.full_like(raymaps[0], torch.nan)
+            views.append(
+                {
+                    "img": img,
+                    "ray_map": ray_map,
+                    "true_shape": true_shape,
+                    "idx": i,
+                    "instance": str(i),
+                    "camera_pose": identity_pose(),
+                    "img_mask": flag(has_img),
+                    "ray_mask": flag(has_ray),
+                    "update": flag(has_img),
+                    "reset": flag(False),
+                }
+            )
+            if has_img:
+                j += 1
+            if has_ray:
+                k += 1
+        assert j == len(images) and k == len(raymaps)
+
+    if revisit > 1:
+        new_views = []
+        for r in range(revisit):
+            for i, view in enumerate(views):
+                seq_idx = r * len(views) + i
+                new_view = deepcopy(view)
+                new_view["idx"] = seq_idx
+                new_view["instance"] = str(seq_idx)
+                if r > 0 and not update:
+                    new_view["update"] = flag(False)
+                new_views.append(new_view)
+        return new_views
+
+    return views
+
+
+def prepare_output(outputs, outdir, revisit=1, use_pose=True):
+    """
+    Save per-frame results under `outdir` and return the data needed for visualization.
+
+    Args:
+        outputs (dict): Inference outputs; its "pred" and "views" lists are trimmed in place.
+        outdir (str): Output root; depth/, conf/, color/ and camera/ are created inside it.
+        revisit (int): Number of revisits per view; only the last pass is kept.
+        use_pose (bool): Whether to transform points using camera pose.
+    Returns:
+        tuple: (points, colors, confidence, camera parameters dictionary)
+    """
+    from src.dust3r.utils.camera import pose_encoding_to_camera
+    from src.dust3r.post_process import estimate_focal_knowing_depth
+    from src.dust3r.utils.geometry import geotrf
+
+    # Only keep the outputs corresponding to one full pass.
+    valid_length = len(outputs["pred"]) // revisit
+    outputs["pred"] = outputs["pred"][-valid_length:]
+    outputs["views"] = outputs["views"][-valid_length:]
+
+    pts3ds_self_ls = [output["pts3d_in_self_view"].cpu() for output in outputs["pred"]]
+    pts3ds_other = [output["pts3d_in_other_view"].cpu() for output in outputs["pred"]]
+    conf_self = [output["conf_self"].cpu() for output in outputs["pred"]]
+    conf_other = [output["conf"].cpu() for output in outputs["pred"]]
+    pts3ds_self = torch.cat(pts3ds_self_ls, 0)
+
+    # Recover camera poses.
+    pr_poses = [
+        pose_encoding_to_camera(pred["camera_pose"].clone()).cpu()
+        for pred in outputs["pred"]
+    ]
+    R_c2w = torch.cat([pr_pose[:, :3, :3] for pr_pose in pr_poses], 0)
+    t_c2w = torch.cat([pr_pose[:, :3, 3] for pr_pose in pr_poses], 0)
+
+    if use_pose:
+        transformed_pts3ds_other = []
+        for pose, pself in zip(pr_poses, pts3ds_self):
+            transformed_pts3ds_other.append(geotrf(pose, pself.unsqueeze(0)))
+        pts3ds_other = transformed_pts3ds_other
+        conf_other = conf_self  # returned points now derive from the self-view points
+
+    # Estimate focal length based on depth.
+    B, H, W, _ = pts3ds_self.shape
+    pp = torch.tensor([W // 2, H // 2], device=pts3ds_self.device).float().repeat(B, 1)
+    focal = estimate_focal_knowing_depth(pts3ds_self, pp, focal_mode="weiszfeld")
+
+    colors = [
+        0.5 * (output["img"].permute(0, 2, 3, 1) + 1.0) for output in outputs["views"]
+    ]
+
+    cam_dict = {
+        "focal": focal.cpu().numpy(),
+        "pp": pp.cpu().numpy(),
+        "R": R_c2w.cpu().numpy(),
+        "t": t_c2w.cpu().numpy(),
+    }
+
+    pts3ds_self_tosave = pts3ds_self  # B, H, W, 3
+    depths_tosave = pts3ds_self_tosave[..., 2]  # last coordinate of the self-view points
+    pts3ds_other_tosave = torch.cat(pts3ds_other)  # B, H, W, 3 (not used below)
+    conf_self_tosave = torch.cat(conf_self)  # B, H, W
+    conf_other_tosave = torch.cat(conf_other)  # B, H, W (not used below)
+    colors_tosave = torch.cat(
+        [
+            0.5 * (output["img"].permute(0, 2, 3, 1).cpu() + 1.0)
+            for output in outputs["views"]
+        ]
+    )  # [B, H, W, 3]
+    cam2world_tosave = torch.cat(pr_poses)  # B, 4, 4
+    intrinsics_tosave = (
+        torch.eye(3).unsqueeze(0).repeat(cam2world_tosave.shape[0], 1, 1)
+    )  # B, 3, 3
+    intrinsics_tosave[:, 0, 0] = focal.detach().cpu()
+    intrinsics_tosave[:, 1, 1] = focal.detach().cpu()
+    intrinsics_tosave[:, 0, 2] = pp[:, 0]
+    intrinsics_tosave[:, 1, 2] = pp[:, 1]
+
+    os.makedirs(os.path.join(outdir, "depth"), exist_ok=True)
+    os.makedirs(os.path.join(outdir, "conf"), exist_ok=True)
+    os.makedirs(os.path.join(outdir, "color"), exist_ok=True)
+    os.makedirs(os.path.join(outdir, "camera"), exist_ok=True)
+    for f_id in range(len(pts3ds_self)):
+        depth = depths_tosave[f_id].cpu().numpy()
+        conf = conf_self_tosave[f_id].cpu().numpy()
+        color = colors_tosave[f_id].cpu().numpy()
+        c2w = cam2world_tosave[f_id].cpu().numpy()
+        intrins = intrinsics_tosave[f_id].cpu().numpy()
+        np.save(os.path.join(outdir, "depth", f"{f_id:06d}.npy"), depth)
+        np.save(os.path.join(outdir, "conf", f"{f_id:06d}.npy"), conf)
+        iio.imwrite(
+            os.path.join(outdir, "color", f"{f_id:06d}.png"),
+            (color * 255).astype(np.uint8),
+        )
+        np.savez(
+            os.path.join(outdir, "camera", f"{f_id:06d}.npz"),
+            pose=c2w,
+            intrinsics=intrins,
+        )
+
+    return pts3ds_other, colors, conf_other, cam_dict
+
+
+def parse_seq_path(p):
+    """Return (image paths, temp dir or None) for a directory of frames or a video file."""
+    if os.path.isdir(p):
+        return sorted(glob.glob(f"{p}/*")), None
+    cap = cv2.VideoCapture(p)
+    if not cap.isOpened():
+        raise ValueError(f"Error opening video file {p}")
+    try:  # release the capture on every exit path, including errors
+        video_fps = cap.get(cv2.CAP_PROP_FPS)
+        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        if video_fps == 0:
+            raise ValueError(f"Error: Video FPS is 0 for {p}")
+        frame_interval = 1
+        frame_indices = list(range(0, total_frames, frame_interval))
+        print(
+            f" - Video FPS: {video_fps}, Frame Interval: {frame_interval}, Total Frames to Read: {len(frame_indices)}"
+        )
+        img_paths = []
+        tmpdirname = tempfile.mkdtemp()  # deleting it is left to the caller
+        for i in frame_indices:
+            cap.set(cv2.CAP_PROP_POS_FRAMES, i)
+            ret, frame = cap.read()
+            if not ret:
+                break
+            frame_path = os.path.join(tmpdirname, f"frame_{i}.jpg")
+            cv2.imwrite(frame_path, frame)
+            img_paths.append(frame_path)
+    finally:
+        cap.release()
+    return img_paths, tmpdirname
+
+
+def run_inference(args):
+    """
+    Execute the full inference and visualization pipeline.
+
+    Args:
+        args: Parsed command-line arguments.
+    """
+    # Set up the computation device.
+    device = args.device
+    if device == "cuda" and not torch.cuda.is_available():
+        print("CUDA not available. Switching to CPU.")
+        device = "cpu"
+
+    # Add the checkpoint path (required for model imports in the dust3r package).
+    add_path_to_dust3r(args.model_path)
+
+    # Import model and inference functions after adding the ckpt path.
+    from src.dust3r.inference import inference, inference_recurrent
+    from src.dust3r.model import ARCroco3DStereo
+    from viser_utils import PointCloudViewer
+
+    # Prepare image file paths (video input is first dumped to a temp dir).
+    img_paths, tmpdirname = parse_seq_path(args.seq_path)
+    try:
+        if not img_paths:
+            print(f"No images found in {args.seq_path}. Please verify the path.")
+            return
+        print(f"Found {len(img_paths)} images in {args.seq_path}.")
+        # Prepare input views.
+        print("Preparing input views...")
+        views = prepare_input(
+            img_paths=img_paths,
+            img_mask=[True] * len(img_paths),
+            size=args.size,
+            revisit=1,
+            update=True,
+        )
+    finally:
+        # Remove the temp frames on every exit path, including errors.
+        if tmpdirname is not None:
+            shutil.rmtree(tmpdirname)
+
+    # Load and prepare the model.
+    print(f"Loading model from {args.model_path}...")
+    model = ARCroco3DStereo.from_pretrained(args.model_path).to(device)
+    model.eval()
+
+    # Run inference.
+    print("Running inference...")
+    start_time = time.time()
+    outputs, state_args = inference(views, model, device)
+    total_time = time.time() - start_time
+    per_frame_time = total_time / len(views)
+    print(
+        f"Inference completed in {total_time:.2f} seconds (average {per_frame_time:.2f} s per frame)."
+    )
+
+    # Process outputs for visualization.
+    print("Preparing output for visualization...")
+    pts3ds_other, colors, conf, cam_dict = prepare_output(
+        outputs, args.output_dir, 1, True
+    )
+
+    # Convert tensors to numpy arrays for visualization.
+    pts3ds_to_vis = [p.cpu().numpy() for p in pts3ds_other]
+    colors_to_vis = [c.cpu().numpy() for c in colors]
+    edge_colors = [None] * len(pts3ds_to_vis)
+
+    # Create and run the point cloud viewer.
+    print("Launching point cloud viewer...")
+    viewer = PointCloudViewer(
+        model,
+        state_args,
+        pts3ds_to_vis,
+        colors_to_vis,
+        conf,
+        cam_dict,
+        device=device,
+        edge_color_list=edge_colors,
+        show_camera=True,
+        vis_threshold=args.vis_threshold,
+        size=args.size,
+    )
+    viewer.run()
+
+
+def main():
+    """Script entry point: parse the CLI and run inference when --seq_path is given."""
+    args = parse_args()
+    if args.seq_path:
+        run_inference(args)
+        return
+    print(
+        "No inputs found! Please use our gradio demo if you would like to iteractively upload inputs."
+    )
+
+
+if __name__ == "__main__":  # run main() only when executed as a script, not on import
+    main()
diff --git a/extern/CUT3R/demo_ga.py b/extern/CUT3R/demo_ga.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ea1c63a932e3bf64d8d714526a70c85b23064ca
--- /dev/null
+++ b/extern/CUT3R/demo_ga.py
@@ -0,0 +1,444 @@
+#!/usr/bin/env python3
+"""
+3D Point Cloud Inference and Visualization Script
+
+This script performs inference using the ARCroco3DStereo model and visualizes the
+resulting 3D point clouds with the PointCloudViewer. Use the command-line arguments
+to adjust parameters such as the model checkpoint path, image sequence directory,
+image size, device, etc.
+
+Usage:
+    python demo_ga.py [--model_path MODEL_PATH] [--seq_path SEQ_PATH] [--size IMG_SIZE]
+                            [--device DEVICE] [--vis_threshold VIS_THRESHOLD] [--output_dir OUT_DIR]
+
+Example:
+    python demo_ga.py --model_path src/cut3r_512_dpt_4_64.pth \
+        --seq_path examples/001 --device cuda --size 512
+"""
+
+import os
+import numpy as np
+import torch
+import time
+import glob
+import random
+import cv2
+import argparse
+import tempfile
+import shutil
+from copy import deepcopy
+from add_ckpt_path import add_path_to_dust3r
+import imageio.v2 as iio
+from PIL import Image
+
+# Seed Python's `random` module only; numpy and torch are not seeded here.
+random.seed(42)
+
+def forward_backward_permutations(n, interval=1):
+    """Index orderings: [0..n-1], then for each i in 1..n-1 a forward and a backward run.
+    Forward is i, i+interval, ... (< n); backward is i, i-interval, ... (>= 0).
+    """
+    indices = list(range(n))
+    runs = [indices]
+    for i in range(1, n):
+        runs += [indices[i::interval], indices[: i + 1][::-interval]]
+    return runs
+
+def listify(elems):
+    return [item for group in elems for item in group]  # flatten one nesting level
+
+
+def collate_with_cat(whatever, lists=False):
+    """Recursively merge a batch structure, concatenating the tensors at its leaves.
+
+    Dicts are collated per key. A non-empty list/tuple is dispatched on its first
+    element: None -> None; bool/float/int/str -> returned as is; tuples are
+    transposed and collated; dicts are collated per key; tensors/arrays are
+    torch.cat'ed (or flattened into a list when `lists` is True); anything else
+    is chained with sum(). Any other input type returns None.
+    """
+    if isinstance(whatever, dict):
+        return {k: collate_with_cat(v, lists=lists) for k, v in whatever.items()}
+    if not isinstance(whatever, (tuple, list)):
+        return None
+    if len(whatever) == 0:
+        return whatever
+
+    first = whatever[0]
+    container = type(whatever)
+    if first is None:
+        return None
+    if isinstance(first, (bool, float, int, str)):
+        return whatever
+    if isinstance(first, tuple):
+        return container(collate_with_cat(x, lists=lists) for x in zip(*whatever))
+    if isinstance(first, dict):
+        return {k: collate_with_cat([e[k] for e in whatever], lists=lists) for k in first}
+    if isinstance(first, torch.Tensor):
+        return listify(whatever) if lists else torch.cat(whatever)
+    if isinstance(first, np.ndarray):
+        return listify(whatever) if lists else torch.cat([torch.from_numpy(x) for x in whatever])
+    # otherwise, we just chain lists
+    return sum(whatever, container())
+
+
+def parse_args():
+    """Build the demo's command-line parser and return the parsed arguments."""
+    parser = argparse.ArgumentParser(
+        description="Run 3D point cloud inference and visualization using ARCroco3DStereo."
+    )
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        default="src/cut3r_512_dpt_4_64.pth",
+        help="Path to the pretrained model checkpoint.",
+    )
+    parser.add_argument(
+        "--seq_path",
+        type=str,
+        default="",
+        help="Path to a directory of images, or to a video file, to run on.",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda",
+        help="Device to run inference on (e.g., 'cuda' or 'cpu').",
+    )
+    parser.add_argument(
+        "--size",
+        type=int,
+        default=512,
+        help="Shape that input images will be rescaled to; if using 224+linear model, choose 224 otherwise 512",
+    )
+    parser.add_argument(
+        "--vis_threshold",
+        type=float,
+        default=1.5,
+        help="Visualization threshold for the point cloud viewer. Ranging from 1 to INF",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="./demo_tmp",
+        help="value for tempfile.tempdir",
+    )
+
+    return parser.parse_args()
+
+
+def prepare_input(
+    img_paths, img_mask, size, raymaps=None, raymap_mask=None, revisit=1, update=True
+):
+    """
+    Build one sequence of input views per frame ordering.
+
+    The images are loaded once; a sequence of views is then built for every
+    ordering returned by forward_backward_permutations(num_images, interval=2).
+
+    Args:
+        img_paths (list): List of image file paths.
+        img_mask, raymaps, raymap_mask, revisit, update: Accepted but not read
+            by this function.
+        size (int): Target image size, forwarded to load_images.
+
+    Returns:
+        list: One list of view dictionaries per ordering. In each view, "idx"
+            and "instance" identify the source image, not the position in
+            the sequence.
+    """
+    # Import image loader (delayed import needed after adding ckpt path).
+    from src.dust3r.utils.image import load_images
+
+    images = load_images(img_paths, size=size)
+    # With interval=2 the forward/backward runs step through the frames two at a time.
+    orderings = forward_backward_permutations(len(images), interval=2)
+
+    def make_view(i):
+        # Image-only view: the ray map is all NaN and ray_mask is False.
+        img = images[i]["img"]
+        return {
+            "img": img,
+            "ray_map": torch.full(
+                (img.shape[0], 6, img.shape[-2], img.shape[-1]), torch.nan
+            ),
+            "true_shape": torch.from_numpy(images[i]["true_shape"]),
+            "idx": i,
+            "instance": str(i),
+            "camera_pose": torch.from_numpy(
+                np.eye(4).astype(np.float32)
+            ).unsqueeze(0),
+            "img_mask": torch.tensor(True).unsqueeze(0),
+            "ray_mask": torch.tensor(False).unsqueeze(0),
+            "update": torch.tensor(True).unsqueeze(0),
+            "reset": torch.tensor(False).unsqueeze(0),
+        }
+
+    # One list of views per ordering; a new dictionary is built for every occurrence of an image.
+    views = []
+    for ordering in orderings:
+        views.append([make_view(i) for i in ordering])
+
+    return views
+
+
+
+
+
+def prepare_output(output, outdir, device):
+    """Globally align the predictions, save per-frame results and return them.
+
+    Runs the PointCloudOptimizer global aligner on `output` ("mst" init, 300
+    iterations, linear schedule, lr 0.01), then writes depth/*.npy, conf/*.npy,
+    color/*.png and camera/*.npz (pose, intrinsics) under `outdir`.
+
+    Returns:
+        tuple: (points, colors, confidences, camera dict with keys "focal", "pp", "R", "t");
+            the first three are lists.
+    """
+    from cloud_opt.dust3r_opt import global_aligner, GlobalAlignerMode
+
+    with torch.enable_grad():
+        scene = global_aligner(
+            output, device=device, mode=GlobalAlignerMode.PointCloudOptimizer, verbose=True
+        )
+        scene.compute_global_alignment(init="mst", niter=300, schedule="linear", lr=0.01)
+    scene.clean_pointcloud()
+    pts3d = scene.get_pts3d()
+    depths = scene.get_depthmaps()
+    poses = scene.get_im_poses()
+    focals = scene.get_focals()
+    pps = scene.get_principal_points()
+    confs = scene.get_conf(mode="none")
+
+    def batched(t):
+        # detached CPU tensor with a leading axis of size 1
+        return t.detach().cpu().unsqueeze(0)
+
+    pts3ds_other = [batched(pts) for pts in pts3d]
+    depths = [batched(d) for d in depths]
+    colors = [torch.from_numpy(img).unsqueeze(0) for img in scene.imgs]
+    confs = [batched(conf) for conf in confs]
+    cam_dict = {
+        "focal": focals.detach().cpu().numpy(),
+        "pp": pps.detach().cpu().numpy(),
+        "R": poses.detach().cpu().numpy()[..., :3, :3],
+        "t": poses.detach().cpu().numpy()[..., :3, 3],
+    }
+
+    depths_tosave = torch.cat(depths)  # B, H, W
+    pts3ds_other_tosave = torch.cat(pts3ds_other)  # B, H, W, 3
+    conf_self_tosave = torch.cat(confs)  # B, H, W
+    colors_tosave = torch.cat(colors)  # [B, H, W, 3]
+    cam2world_tosave = poses.detach().cpu()  # B, 4, 4
+    num_frames = cam2world_tosave.shape[0]
+    intrinsics_tosave = torch.eye(3).unsqueeze(0).repeat(num_frames, 1, 1)  # B, 3, 3
+    # fx and fy are both filled from the first focal column
+    intrinsics_tosave[:, 0, 0] = focals[:, 0].detach().cpu()
+    intrinsics_tosave[:, 1, 1] = focals[:, 0].detach().cpu()
+    intrinsics_tosave[:, 0, 2] = pps[:, 0].detach().cpu()
+    intrinsics_tosave[:, 1, 2] = pps[:, 1].detach().cpu()
+
+    for subdir in ("depth", "conf", "color", "camera"):
+        os.makedirs(os.path.join(outdir, subdir), exist_ok=True)
+    for f_id in range(len(depths_tosave)):
+        depth = depths_tosave[f_id].cpu().numpy()
+        conf = conf_self_tosave[f_id].cpu().numpy()
+        color = colors_tosave[f_id].cpu().numpy()
+        c2w = cam2world_tosave[f_id].cpu().numpy()
+        intrins = intrinsics_tosave[f_id].cpu().numpy()
+        np.save(os.path.join(outdir, "depth", f"{f_id:06d}.npy"), depth)
+        np.save(os.path.join(outdir, "conf", f"{f_id:06d}.npy"), conf)
+        color_u8 = (color * 255).astype(np.uint8)
+        iio.imwrite(os.path.join(outdir, "color", f"{f_id:06d}.png"), color_u8)
+        np.savez(
+            os.path.join(outdir, "camera", f"{f_id:06d}.npz"),
+            pose=c2w,
+            intrinsics=intrins,
+        )
+
+    return pts3ds_other, colors, confs, cam_dict
+
+
+def parse_seq_path(p):
+    if os.path.isdir(p):
+        img_paths = sorted(glob.glob(f"{p}/*"))
+        tmpdirname = None
+    else:
+        cap = cv2.VideoCapture(p)
+        if not cap.isOpened():
+            raise ValueError(f"Error opening video file {p}")
+        video_fps = cap.get(cv2.CAP_PROP_FPS)
+        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        if video_fps == 0:
+            cap.release()
+            raise ValueError(f"Error: Video FPS is 0 for {p}")
+        frame_interval = 1
+        frame_indices = list(range(0, total_frames, frame_interval))
+        print(
+            f" - Video FPS: {video_fps}, Frame Interval: {frame_interval}, Total Frames to Read: {len(frame_indices)}"
+        )
+        img_paths = []
+        tmpdirname = tempfile.mkdtemp()
+        for i in frame_indices:
+            cap.set(cv2.CAP_PROP_POS_FRAMES, i)
+            ret, frame = cap.read()
+            if not ret:
+                break
+            frame_path = os.path.join(tmpdirname, f"frame_{i}.jpg")
+            cv2.imwrite(frame_path, frame)
+            img_paths.append(frame_path)
+        cap.release()
+    return img_paths, tmpdirname
+
+
+
+
+def run_inference(args):
+    """
+    Execute the full inference and visualization pipeline.
+
+    Args:
+        args: Parsed command-line arguments.
+    """
+    # Set up the computation device.
+    device = args.device
+    if device == "cuda" and not torch.cuda.is_available():
+        print("CUDA not available. Switching to CPU.")
+        device = "cpu"
+
+    # Add the checkpoint path (required for model imports in the dust3r package).
+    add_path_to_dust3r(args.model_path)
+
+    # Import model and inference functions after adding the ckpt path.
+    from src.dust3r.inference import inference, inference_recurrent
+    from src.dust3r.model import ARCroco3DStereo
+    from viser_utils import PointCloudViewer
+
+    # Prepare image file paths.
+    img_paths, tmpdirname = parse_seq_path(args.seq_path)
+    if not img_paths:
+        print(f"No images found in {args.seq_path}. Please verify the path.")
+        return
+
+    print(f"Found {len(img_paths)} images in {args.seq_path}.")
+    img_mask = [True] * len(img_paths)
+
+    # Prepare input views.
+    print("Preparing input views...")
+    views = prepare_input(
+        img_paths=img_paths,
+        img_mask=img_mask,
+        size=args.size,
+        revisit=1,
+        update=True,
+    )
+    if tmpdirname is not None:
+        shutil.rmtree(tmpdirname)
+
+    # Load and prepare the model.
+    print(f"Loading model from {args.model_path}...")
+    model = ARCroco3DStereo.from_pretrained(args.model_path).to(device)
+    model.eval()
+
+    # Run inference.
+    print("Running inference...")
+    start_time = time.time()
+    output = {
+        "view1": [],
+        "view2": [],
+        "pred1": [],
+        "pred2": [],
+    }
+    edges = []
+    for _views in views:
+        outputs, state_args = inference(_views, model, device)
+        for view_id in range(1, len(outputs["views"])):
+            output["view1"].append(outputs["views"][0])
+            output["view2"].append(outputs["views"][view_id])
+            output["pred1"].append(outputs["pred"][0])
+            output["pred2"].append(outputs["pred"][view_id])
+
+            edges.append((outputs["views"][0]["idx"], outputs["views"][view_id]["idx"]))
+    list_of_tuples = edges
+    sorted_indices = sorted(
+        range(len(list_of_tuples)),
+        key=lambda x: (
+            list_of_tuples[x][0] > list_of_tuples[x][1],  # Grouping condition
+            (
+                list_of_tuples[x][1]
+                if list_of_tuples[x][0] > list_of_tuples[x][1]
+                else list_of_tuples[x][0]
+            ),  # First sort key
+            (
+                list_of_tuples[x][0]
+                if list_of_tuples[x][0] > list_of_tuples[x][1]
+                else list_of_tuples[x][1]
+            ),  # Second sort key
+        ),
+    )
+    new_output = {
+        "view1": [],
+        "view2": [],
+        "pred1": [],
+        "pred2": [],
+    }
+    for i in sorted_indices:
+        new_output["view1"].append(output["view1"][i])
+        new_output["view2"].append(output["view2"][i])
+        new_output["pred1"].append(output["pred1"][i])
+        new_output["pred2"].append(output["pred2"][i])
+    output["view1"] = collate_with_cat(new_output["view1"])
+    output["view2"] = collate_with_cat(new_output["view2"])
+    output["pred1"] = collate_with_cat(new_output["pred1"])
+    output["pred2"] = collate_with_cat(new_output["pred2"])
+
+    total_time = time.time() - start_time
+    per_frame_time = total_time / len(views)
+    print(
+        f"Inference completed in {total_time:.2f} seconds (average {per_frame_time:.2f} s per frame)."
+    )
+
+    # Process outputs for visualization.
+    print("Preparing output for visualization...")
+
+    pts3ds_other, colors, conf, cam_dict = prepare_output(
+        output, args.output_dir, device
+    )
+
+    # Convert tensors to numpy arrays for visualization.
+    pts3ds_to_vis = [p.cpu().numpy() for p in pts3ds_other]
+    colors_to_vis = [c.cpu().numpy() for c in colors]
+    edge_colors = [None] * len(pts3ds_to_vis)
+
+    # Create and run the point cloud viewer.
+    print("Launching point cloud viewer...")
+    viewer = PointCloudViewer(
+        model,
+        state_args,
+        pts3ds_to_vis,
+        colors_to_vis,
+        conf,
+        cam_dict,
+        device=device,
+        edge_color_list=edge_colors,
+        show_camera=True,
+        vis_threshold=args.vis_threshold,
+        size=args.size,
+    )
+    viewer.run()
+
+
+def main():
+    args = parse_args()
+    if not args.seq_path:
+        print(
+            "No inputs found! Please use our gradio demo if you would like to iteractively upload inputs."
+        )
+        return
+    else:
+        run_inference(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/extern/CUT3R/docs/eval.md b/extern/CUT3R/docs/eval.md
new file mode 100644
index 0000000000000000000000000000000000000000..60122805abad76d6563e21fdad415971fcf17769
--- /dev/null
+++ b/extern/CUT3R/docs/eval.md
@@ -0,0 +1,31 @@
+# Evaluation Scripts
+
+## Monodepth
+
+```bash
+bash eval/monodepth/run.sh
+```
+Results will be saved in `eval_results/monodepth/${data}_${model_name}/metric.json`.
+
+## Video Depth
+
+```bash
+bash eval/video_depth/run.sh # You may need to change [--num_processes] to the number of your gpus
+```
+Results will be saved in `eval_results/video_depth/${data}_${model_name}/result_scale.json`.
+
+## Camera Pose Estimation
+
+```bash
+bash eval/relpose/run.sh # You may need to change [--num_processes] to the number of your gpus
+```
+Results will be saved in `eval_results/relpose/${data}_${model_name}/_error_log.txt`.
+
+## Multi-view Reconstruction
+
+```bash
+bash eval/mv_recon/run.sh # You may need to change [--num_processes] to the number of your gpus
+```
+
+Results will be saved in `eval_results/mv_recon/${model_name}_${ckpt_name}/logs_all.txt`.
+
diff --git a/extern/CUT3R/docs/preprocess.md b/extern/CUT3R/docs/preprocess.md
new file mode 100644
index 0000000000000000000000000000000000000000..33108717412ccb2a7b6c4021a912d68982a95774
--- /dev/null
+++ b/extern/CUT3R/docs/preprocess.md
@@ -0,0 +1,816 @@
+# Preprocess Scripts
+
+Please download all datasets from their <strong>original sources</strong>, except for vKITTI, for which we provide a fully processed version—no need to download the original dataset. For MapFree and DL3DV, we also release depth maps computed using COLMAP Multi-View Stereo (MVS). See the sections below for details on the processing of each dataset. <strong>Please ensure compliance with the respective licensing agreements when downloading.</strong> The total data takes about 25TB of disk space.
+
+> If you encounter issues in the scripts, please feel free to create an issue.
+
+- [ARKitScenes](#arkitscenes) 
+- [BlendedMVS](#blendedmvs)
+- [CO3Dv2](#co3d)
+- [MegaDepth](#megadepth)
+- [ScanNet++](#scannet-1) 
+- [ScanNet](#scannet)
+- [Waymo Open Dataset](#waymo)
+- [WildRGB-D](#wildrgbd)
+- [Map-free](#mapfree)
+- [TartanAir](#tartanair)
+- [UnrealStereo4K](#unrealstereo4k) 
+- [Virtual KITTI 2](#virtual-kitti-2)
+- [3D Ken Burns](#3d-ken-burns)
+- [BEDLAM](#bedlam)
+- [COP3D](#cop3d)
+- [DL3DV](#dl3dv)
+- [Dynamic Replica](#dynamic-replica)
+- [EDEN](#eden)
+- [Hypersim](#hypersim)
+- [IRS](#irs)
+- [Matterport3D](#matterport3d)
+- [MVImgNet](#mvimgnet)
+- [MVS-Synth](#mvs-synth)
+- [OmniObject3D](#omniobject3d)
+- [PointOdyssey](#pointodyssey)
+- [RealEstate10K](#realestate10k)
+- [SmartPortraits](#smartportraits)
+- [Spring](#spring)
+- [Synscapes](#synscapes)
+- [UASOL](#uasol)
+- [UrbanSyn](#urbansyn)
+- [HOI4D](#hoi4d)
+
+## [ARKitScenes](https://github.com/apple/ARKitScenes)
+<div>
+<img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/METRIC-red" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- 
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<br>
+</div>
+
+First download the pre-computed pairs provided by [DUSt3R](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/arkitscenes_pairs.zip).
+
+Then run the following commands:
+```
+python preprocess_arkitscenes.py --arkitscenes_dir /path/to/your/raw/data --precomputed_pairs /path/to/your/pairs --output_dir /path/to/your/outdir
+
+python generate_set_arkitscenes.py --root /path/to/your/outdir --splits Training Test --max_interval 5.0 --num_workers 8
+```
+
+## [ARKitScenes_highres](https://github.com/apple/ARKitScenes)
+<div>
+<img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/METRIC-red" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- 
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<br>
+</div>
+
+This dataset is a subset of ARKitScenes with high resolution depthmaps.
+
+```
+python preprocess_arkitscenes_highres.py --arkitscenes_dir /path/to/your/raw/data --output_dir /path/to/your/outdir
+```
+
+## [BlendedMVS](https://github.com/YoYo000/BlendedMVS)
+<div>
+<!-- <img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/METRIC-red
+" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/>
+<!-- 
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<br>
+</div>
+
+Follow DUSt3R to generate the processed BlendedMVS data:
+```
+python preprocess_blendedmvs.py --blendedmvs_dir /path/to/your/raw/data --precomputed_pairs /path/to/your/pairs --output_dir /path/to/your/outdir
+```
+Then put our [overlap set](https://drive.google.com/file/d/1anBQhF9BgOvgaWgAwWnf70tzspQZHBBB/view?usp=sharing) under `/path/to/your/outdir`.
+
+## [CO3D](https://github.com/facebookresearch/co3d)
+<div>
+<!-- <img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/METRIC-red
+" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- 
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<br>
+</div>
+
+Follow DUSt3R to generate the processed CO3D data.
+```
+python3 preprocess_co3d.py --co3d_dir /path/to/your/raw/data --output_dir /path/to/your/outdir
+```
+
+## [MegaDepth](https://www.cs.cornell.edu/projects/megadepth/)
+<div>
+<!-- <img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/METRIC-red
+" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- 
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<br>
+</div>
+
+First download our [precomputed set](https://drive.google.com/file/d/1bU1VqRu1NdW-4J4BQybqQS64mUG1EtF1/view?usp=sharing) and place it under `/path/to/your/outdir`.
+
+Then run
+```
+python preprocess_megadepth.py --megadepth_dir /path/to/your/raw/data --precomputed_sets /path/to/precomputed_sets  --output_dir /path/to/your/outdir
+```
+
+## [Scannet](http://www.scan-net.org/ScanNet/)
+<div>
+<img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/METRIC-red" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- 
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<br>
+</div>
+
+```
+python preprocess_scannet.py --scannet_dir /path/to/your/raw/data  --output_dir /path/to/your/outdir
+
+python generate_set_scannet.py --root /path/to/your/outdir \
+        --splits scans_test scans_train --max_interval 150 --num_workers 8
+```
+
+## [Scannet++](https://kaldir.vc.in.tum.de/scannetpp/)
+<div>
+<img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/METRIC-red" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- 
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<br>
+</div>
+
+First download the pre-computed pairs provided by [DUSt3R](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/scannetpp_pairs.zip).
+
+Then run the following commands:
+```
+python preprocess_scannetpp.py --scannetpp_dir /path/to/your/raw/data --precomputed_pairs /path/to/your/pairs --output_dir /path/to/your/outdir
+
+python generate_set_scannetpp.py --root /path/to/your/outdir \
+        --max_interval 150 --num_workers 8
+```
+
+## [Waymo](https://github.com/waymo-research/waymo-open-dataset)
+<div>
+<!-- <img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/METRIC-red" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<br>
+</div>
+
+Follow DUSt3R to generate the processed Waymo data.
+```
+python3 preprocess_waymo.py --waymo_dir /path/to/your/raw/data --precomputed_pairs /path/to/precomputed_pairs --output_dir /path/to/your/outdir
+```
+Then download our [invalid_files](https://drive.google.com/file/d/1xI2SHHoXw1Bm7Lqrn7v56x30stCNuhlv/view?usp=sharing) and put it under `/path/to/your/outdir`.
+
+## [WildRGBD](https://github.com/wildrgbd/wildrgbd/)
+<div>
+<!-- <img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/>-->
+<img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> 
+<img src="https://img.shields.io/badge/METRIC-red" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- 
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<br>
+</div>
+
+Follow DUSt3R to generate the processed WildRGBD data.
+```
+python3 preprocess_wildrgbd.py --wildrgbd_dir /path/to/your/raw/data --output_dir  /path/to/your/outdir
+```
+
+## [Mapfree](https://research.nianticlabs.com/mapfree-reloc-benchmark/dataset)
+<div>
+<!-- <img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/>-->
+<img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<!--<img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/METRIC-red" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- 
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<br>
+</div>
+
+First preprocess the colmap results provided by Mapfree:
+```
+python3 preprocess_mapfree.py --mapfree_dir /path/to/train/data --colmap_dir /path/to/colmap/data --output_dir  /path/to/first/outdir
+```
+
+Then re-organize the data structure:
+```
+python3 preprocess_mapfree2.py --mapfree_dir /path/to/first/outdir --output_dir  /path/to/final/outdir
+```
+
+Finally, download our released [depths and masks](https://drive.google.com/file/d/1gJGEAV5e08CR6nK2gH9i71a7_WJ4ANwc/view?usp=drive_link) and combine them with your `/path/to/final/outdir`.
+```
+rsync -av --update /path/to/our/release /path/to/final/outdir
+```
+
+## [TartanAir](https://theairlab.org/tartanair-dataset/)
+<div>
+<!-- <img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<!--<img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/METRIC-red" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<br>
+</div>
+
+```
+python3 preprocess_tartanair.py --tartanair_dir /path/to/your/raw/data --output_dir  /path/to/your/outdir
+```
+
+## [UnrealStereo4K](https://github.com/fabiotosi92/SMD-Nets)
+<div>
+<!-- <img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<!--<img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/METRIC-red" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/>
+<!-- 
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<br>
+</div>
+
+```
+python3 preprocess_unreal4k.py --unreal4k_dir /path/to/your/raw/data --output_dir  /path/to/your/outdir
+```
+
+## [Virtual KITTI 2](https://europe.naverlabs.com/research/computer-vision/proxy-virtual-worlds-vkitti-2/)
+<div>
+<!-- <img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<!--<img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/METRIC-red" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/> 
+<!-- <img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<br>
+</div>
+
+As Virtual KITTI 2 is released under the CC BY-NC-SA 3.0 license, we directly release our [preprocessed data](https://drive.google.com/file/d/1KdAH4ztRkzss1HCkGrPjQNnMg5c-f3aD/view?usp=sharing).
+
+## [3D Ken Burns](https://github.com/sniklaus/3d-ken-burns.git)
+<div>
+<!-- <img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/METRIC-red
+" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/>
+<!-- 
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/>
+<br>
+</div>
+
+```
+python preprocess_3dkb.py --root /path/to/data_3d_ken_burns \
+                           --out_dir /path/to/processed_3dkb \
+                           [--num_workers 4] [--seed 42]
+```
+
+## [BEDLAM](https://bedlam.is.tue.mpg.de/)
+<div>
+<!-- <img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/METRIC-red" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<br>
+</div>
+
+```
+python preprocess_bedlam.py --root /path/to/extracted_data \
+                             --outdir /path/to/processed_bedlam \
+                             [--num_workers 4]
+```
+
+## [COP3D](https://github.com/facebookresearch/cop3d)
+<div>
+<!-- <img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/>-->
+<img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> 
+<!-- <img src="https://img.shields.io/badge/METRIC-red
+" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<br>
+</div>
+
+```
+python3 preprocess_cop3d.py --cop3d_dir /path/to/cop3d \
+       --output_dir /path/to/processed_cop3d
+```
+
+## [DL3DV](https://github.com/DL3DV-10K/Dataset)
+<div>
+<!-- <img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/METRIC-red
+" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- 
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<br>
+</div>
+
+
+~~Due to potential licensing issues, you may need to run multi-view stereo on DL3DV yourself (which is extremely time-consuming). Once that is done, you can use our preprocessing script:~~
+
+<pre>
+<del>python3 preprocess_dl3dv.py --dl3dv_dir /path/to/dl3dv \
+       --output_dir /path/to/processed_dl3dv</del>
+</pre>
+
+**Update: We've released the full version of our processed DL3DV dataset!**
+
+To use our processed DL3DV data, please ensure that you **first** cite the [original DL3DV work](https://github.com/DL3DV-10K/Dataset) and adhere to their licensing terms.
+
+You can then download the following components:
+
+- [RGB images and camera parameters](https://huggingface.co/datasets/zhangify/CUT3R_release/tree/main/processed_dl3dv_ours_rgb_cam_1)
+
+- [Depthmaps and masks](https://drive.google.com/file/d/14E15EG5NJgWH5UVYubrPSSFXmReCIe7f/view?usp=drive_link)
+
+After downloading, merge the components using the provided script:
+```
+python3 merge_dl3dv.py # remember to change necessary paths
+```
+
+## [Dynamic Replica](https://github.com/facebookresearch/dynamic_stereo)
+<div>
+<img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/METRIC-red" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<br>
+</div>
+
+```
+python preprocess_dynamic_replica.py --root_dir /path/to/data_dynamic_replica \
+                                           --out_dir /path/to/processed_dynamic_replica
+```
+
+## [EDEN](https://lhoangan.github.io/eden/)
+<div>
+<!-- <img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<!--<img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/METRIC-red" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/>
+<!-- 
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/>
+<br>
+</div>
+
+```
+python preprocess_eden.py --root /path/to/data_raw_videos/data_eden \
+                              --out_dir /path/to/data_raw_videos/processed_eden \
+                              [--num_workers N]
+```
+
+## [Hypersim](https://github.com/apple/ml-hypersim)
+<div>
+<img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/METRIC-red" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/>
+<!-- 
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<br>
+</div>
+
+```
+python preprocess_hypersim.py --hypersim_dir /path/to/hypersim \
+                                  --output_dir /path/to/processed_hypersim
+```
+
+## [IRS](https://github.com/HKBU-HPML/IRS)
+<div>
+<img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/METRIC-red" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/>
+<!-- 
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/>
+<br>
+</div>
+
+```
+python preprocess_irs.py \
+       --root_dir /path/to/data_irs \
+       --out_dir /path/to/processed_irs
+```
+
+## [Matterport3D](https://niessner.github.io/Matterport/)
+<div>
+<img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/METRIC-red" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- 
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<br>
+</div>
+
+```
+python preprocess_mp3d.py --root_dir /path/to/data_mp3d/v1/scans \
+                              --out_dir /path/to/processed_mp3d
+```
+
+## [MVImgNet](https://github.com/GAP-LAB-CUHK-SZ/MVImgNet)
+<div>
+<!-- <img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/METRIC-red
+" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- 
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/>
+<!--<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<br>
+</div>
+
+```
+python preprocess_mvimgnet.py --data_dir /path/to/MVImgNet_data \
+                                --pcd_dir /path/to/MVPNet \
+                                --output_dir /path/to/processed_mvimgnet
+```
+
+## [MVS-Synth](https://phuang17.github.io/DeepMVS/mvs-synth.html)
+<div>
+<!-- <img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/>-->
+<img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/METRIC-red" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<br>
+</div>
+
+```
+python preprocess_mvs_synth.py --root_dir /path/to/data_mvs_synth/GTAV_720/ \
+                                   --out_dir /path/to/processed_mvs_synth \
+                                   --num_workers 32
+```
+
+## [OmniObject3D](https://omniobject3d.github.io/)
+<div>
+<!-- <img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/METRIC-red" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/>
+<!-- 
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<br>
+</div>
+
+```
+python preprocess_omniobject3d.py --input_dir /path/to/input_root --output_dir /path/to/output_root
+```
+
+## [PointOdyssey](https://pointodyssey.com/)
+<div>
+<!-- <img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/METRIC-red" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/> 
+<!-- <img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<br>
+</div>
+
+```
+python preprocess_point_odyssey.py --input_dir /path/to/input_dataset --output_dir /path/to/output_dataset
+```
+
+## [RealEstate10K](https://google.github.io/realestate10k/)
+<div>
+<img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/METRIC-red
+" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- 
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<br>
+</div>
+
+```
+python preprocess_re10k.py --root_dir /path/to/train \
+                             --info_dir /path/to/RealEstate10K/train \
+                             --out_dir /path/to/processed_re10k
+```
+
+## [SmartPortraits](https://mobileroboticsskoltech.github.io/SmartPortraits/)
+<div>
+<img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/METRIC-red" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- 
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/>-->
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/> 
+<br>
+</div>
+
+You need to follow the [official processing pipeline](https://github.com/MobileRoboticsSkoltech/SmartPortraits-toolkit) first. Replace `convert_to_TUM/utils/convert_to_tum.py` with our `datasets_preprocess/custom_convert2TUM.py` (you may need to change the input and output paths).
+
+Then run
+```
+python preprocess_smartportraits.py \
+        --input_dir /path/to/official/pipeline/output \
+        --output_dir /path/to/processed_smartportraits
+```
+
+## [Spring](https://spring-benchmark.org/)
+<div>
+<!-- <img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/>
+<!--<img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/METRIC-red" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<br>
+</div>
+
+```
+python preprocess_spring.py \
+        --root_dir /path/to/spring/train \
+        --out_dir /path/to/processed_spring \
+        --baseline 0.065 \
+        --output_size 960 540
+```
+
+## [Synscapes](https://synscapes.on.liu.se/)
+<div>
+<!-- <img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/>-->
+<img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/METRIC-red" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<br>
+</div>
+
+```
+python preprocess_synscapes.py \
+        --synscapes_dir /path/to/Synscapes/Synscapes \
+        --output_dir /path/to/processed_synscapes
+```
+
+## [UASOL](https://osf.io/64532/)
+<div>
+<!-- <img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/>-->
+<img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/METRIC-red" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- 
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<br>
+</div>
+
+```
+python preprocess_uasol.py \
+        --input_dir /path/to/data_uasol \
+        --output_dir /path/to/processed_uasol
+```
+
+## [UrbanSyn](https://www.urbansyn.org/)
+<div>
+<!-- <img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/>-->
+<img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/METRIC-red" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/>-->
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/> 
+<br>
+</div>
+
+```
+python preprocess_urbansyn.py \
+        --input_dir /path/to/data_urbansyn \
+        --output_dir /path/to/processed_urbansyn
+```
+
+## [HOI4D](https://hoi4d.github.io/)
+<div>
+<img src="https://img.shields.io/badge/INDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/MIXED-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/OUTDOOR-blue" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/ObjectCentric-blue" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/METRIC-red" style="display: inline-block; vertical-align: middle;"/>
+<img src="https://img.shields.io/badge/RealWorld-orange" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/Synthetic-orange" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/Dynamic-green" style="display: inline-block; vertical-align: middle;"/>
+<!-- <img src="https://img.shields.io/badge/Static-green" style="display: inline-block; vertical-align: middle;"/> -->
+<!-- <img src="https://img.shields.io/badge/CameraOnly-crimson" style="display: inline-block; vertical-align: middle;"/> -->
+<img src="https://img.shields.io/badge/SingleView-crimson" style="display: inline-block; vertical-align: middle;"/>
+<br>
+</div>
+
+```
+python preprocess_hoi4d.py \
+    --root_dir /path/to/HOI4D_release \
+    --cam_root /path/to/camera_params \
+    --out_dir /path/to/processed_hoi4d
+```
diff --git a/extern/CUT3R/docs/train.md b/extern/CUT3R/docs/train.md
new file mode 100644
index 0000000000000000000000000000000000000000..a587b8a9486bc66786ee6ac00479462d8c39515a
--- /dev/null
+++ b/extern/CUT3R/docs/train.md
@@ -0,0 +1,47 @@
+# Training
+
+Please note that this is an academic project, and due to resource constraints, we trained our model iteratively while exploring different configurations. As a result, releasing the complete training procedure is challenging. However, if you wish to train the model from scratch, we provide a set of configurations below that we believe are representative. For fine-tuning, we recommend starting with the scripts available [here](#fine-tuning). There are many design choices to consider, particularly under varying computational constraints, and we look forward to seeing the community explore these possibilities further.
+
+## Training Configurations
+
+If you would like to train from scratch, use the following commands as a starting point.
+
+```
+# Remember to replace the dataset paths with your own paths
+# the script has been tested on an 8xA100 (80G) machine
+
+cd src/
+
+# stage 1, train 224+linear model on static datasets
+CUDA_LAUNCH_BLOCKING=1 NCCL_DEBUG=TRACE TORCH_DISTRIBUTED_DEBUG=DETAIL HYDRA_FULL_ERROR=1 accelerate launch --multi_gpu train.py  --config-name stage1
+
+# stage 2, finetune 224+linear model on all datasets
+CUDA_LAUNCH_BLOCKING=1 NCCL_DEBUG=TRACE TORCH_DISTRIBUTED_DEBUG=DETAIL HYDRA_FULL_ERROR=1 accelerate launch --multi_gpu train.py  --config-name stage2
+
+# stage 3, train 512+dpt model on all datasets
+CUDA_LAUNCH_BLOCKING=1 NCCL_DEBUG=TRACE TORCH_DISTRIBUTED_DEBUG=DETAIL HYDRA_FULL_ERROR=1 accelerate launch --multi_gpu train.py  --config-name stage3
+
+# stage 4, train 512+dpt model on long sequences (32 views)
+CUDA_LAUNCH_BLOCKING=1 NCCL_DEBUG=TRACE TORCH_DISTRIBUTED_DEBUG=DETAIL HYDRA_FULL_ERROR=1 accelerate launch --multi_gpu train.py  --config-name stage4
+
+# Finally, finetune 512+dpt model on 4-64 views
+CUDA_LAUNCH_BLOCKING=1 NCCL_DEBUG=TRACE TORCH_DISTRIBUTED_DEBUG=DETAIL HYDRA_FULL_ERROR=1 accelerate launch --multi_gpu train.py  --config-name dpt_512_vary_4_64
+
+```
+
+## Fine-tuning
+
+To fine-tune the released checkpoints, you can use the two provided config files as a starting point. Note that these configs correspond to the final stage of training, where the goal is to train the model to handle <strong>long sequences</strong>. Therefore, in these configs, the encoders are frozen, and single-view datasets are removed. You may adjust the configurations as needed to suit your requirements.
+
+```
+# Remember to replace the dataset paths with your own paths
+# the script has been tested on an 8xA100 (80G) machine
+
+cd src/
+
+# finetune 512 checkpoint
+CUDA_LAUNCH_BLOCKING=1 NCCL_DEBUG=TRACE TORCH_DISTRIBUTED_DEBUG=DETAIL HYDRA_FULL_ERROR=1 accelerate launch --multi_gpu train.py  --config-name dpt_512_vary_4_64
+
+# finetune 224 checkpoint
+CUDA_LAUNCH_BLOCKING=1 NCCL_DEBUG=TRACE TORCH_DISTRIBUTED_DEBUG=DETAIL HYDRA_FULL_ERROR=1 accelerate launch --multi_gpu train.py  --config-name linear_224_fixed_16
+```
\ No newline at end of file
diff --git a/extern/CUT3R/eval/monodepth/eval_metrics.py b/extern/CUT3R/eval/monodepth/eval_metrics.py
new file mode 100644
index 0000000000000000000000000000000000000000..81a59325d4551ed86fc3c3f42e74d06fa06b328a
--- /dev/null
+++ b/extern/CUT3R/eval/monodepth/eval_metrics.py
@@ -0,0 +1,211 @@
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
+from eval.monodepth.tools import depth_evaluation
+import numpy as np
+import json
+from tqdm import tqdm
+import glob
+import cv2
+from eval.monodepth.metadata import dataset_metadata
+import argparse
+from PIL import Image
+
+TAG_FLOAT = 202021.25
+
+
+def depth_read_sintel(filename):
+    """Read depth data from file, return as numpy array."""
+    f = open(filename, "rb")
+    check = np.fromfile(f, dtype=np.float32, count=1)[0]
+    assert (
+        check == TAG_FLOAT
+    ), " depth_read:: Wrong tag in flow file (should be: {0}, is: {1}). Big-endian machine? ".format(
+        TAG_FLOAT, check
+    )
+    width = np.fromfile(f, dtype=np.int32, count=1)[0]
+    height = np.fromfile(f, dtype=np.int32, count=1)[0]
+    size = width * height
+    assert (
+        width > 0 and height > 0 and size > 1 and size < 100000000
+    ), " depth_read:: Wrong input size (width = {0}, height = {1}).".format(
+        width, height
+    )
+    depth = np.fromfile(f, dtype=np.float32, count=-1).reshape((height, width))
+    return depth
+
+
+def depth_read_bonn(filename):
+    # loads depth map D from png file
+    # and returns it as a numpy array
+    depth_png = np.asarray(Image.open(filename))
+    # make sure we have a proper 16bit depth map here.. not 8bit!
+    assert np.max(depth_png) > 255
+    depth = depth_png.astype(np.float64) / 5000.0
+    depth[depth_png == 0] = -1.0
+    return depth
+
+
+def depth_read_kitti(filename):
+    # loads depth map D from png file
+    # and returns it as a numpy array,
+    # for details see readme.txt
+    img_pil = Image.open(filename)
+    depth_png = np.array(img_pil, dtype=int)
+    # make sure we have a proper 16bit depth map here.. not 8bit!
+    assert np.max(depth_png) > 255
+
+    depth = depth_png.astype(float) / 256.0
+    depth[depth_png == 0] = -1.0
+    return depth
+
+
+def get_gt_depth(filename, dataset):
+    if dataset == "sintel":
+        return depth_read_sintel(filename)
+    elif dataset == "bonn":
+        return depth_read_bonn(filename)
+    elif dataset == "kitti":
+        return depth_read_kitti(filename)
+    elif dataset == "nyu":
+        return np.load(filename)
+    else:
+        raise NotImplementedError
+
+
+def get_args_parser():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="",
+        help="value for outdir",
+    )
+    parser.add_argument(
+        "--eval_dataset", type=str, default="nyu", choices=list(dataset_metadata.keys())
+    )
+    return parser
+
+
+def main(args):
+    if args.eval_dataset == "nyu":
+        depth_pathes = glob.glob("data/nyu-v2/val/nyu_depths/*.npy")
+        depth_pathes = sorted(depth_pathes)
+        pred_pathes = glob.glob(
+            f"{args.output_dir}/*.npy"
+        )  # TODO: update the path to your prediction
+        pred_pathes = sorted(pred_pathes)
+    elif args.eval_dataset == "sintel":
+        pred_pathes = glob.glob(
+            f"{args.output_dir}/*/*.npy"
+        )  # TODO: update the path to your prediction
+        pred_pathes = sorted(pred_pathes)
+        full = len(pred_pathes) > 643
+        if full:
+            depth_pathes = glob.glob(f"data/sintel/training/depth/*/*.dpt")
+            depth_pathes = sorted(depth_pathes)
+        else:
+            seq_list = [
+                "alley_2",
+                "ambush_4",
+                "ambush_5",
+                "ambush_6",
+                "cave_2",
+                "cave_4",
+                "market_2",
+                "market_5",
+                "market_6",
+                "shaman_3",
+                "sleeping_1",
+                "sleeping_2",
+                "temple_2",
+                "temple_3",
+            ]
+            depth_pathes_folder = [
+                f"data/sintel/training/depth/{seq}" for seq in seq_list
+            ]
+            depth_pathes = []
+            for depth_pathes_folder_i in depth_pathes_folder:
+                depth_pathes += glob.glob(depth_pathes_folder_i + "/*.dpt")
+            depth_pathes = sorted(depth_pathes)
+    elif args.eval_dataset == "bonn":
+        seq_list = ["balloon2", "crowd2", "crowd3", "person_tracking2", "synchronous"]
+        img_pathes_folder = [
+            f"data/bonn/rgbd_bonn_dataset/rgbd_bonn_{seq}/rgb_110/*.png"
+            for seq in seq_list
+        ]
+        img_pathes = []
+        for img_pathes_folder_i in img_pathes_folder:
+            img_pathes += glob.glob(img_pathes_folder_i)
+        img_pathes = sorted(img_pathes)
+        depth_pathes_folder = [
+            f"data/bonn/rgbd_bonn_dataset/rgbd_bonn_{seq}/depth_110/*.png"
+            for seq in seq_list
+        ]
+        depth_pathes = []
+        for depth_pathes_folder_i in depth_pathes_folder:
+            depth_pathes += glob.glob(depth_pathes_folder_i)
+        depth_pathes = sorted(depth_pathes)
+        pred_pathes = glob.glob(
+            f"{args.output_dir}/*/*.npy"
+        )  # TODO: update the path to your prediction
+        pred_pathes = sorted(pred_pathes)
+    elif args.eval_dataset == "kitti":
+        depth_pathes = glob.glob(
+            "data/kitti/depth_selection/val_selection_cropped/groundtruth_depth_gathered/*/*.png"
+        )
+        depth_pathes = sorted(depth_pathes)
+        pred_pathes = glob.glob(
+            f"{args.output_dir}/*/*depth.npy"
+        )  # TODO: update the path to your prediction
+        pred_pathes = sorted(pred_pathes)
+    else:
+        raise NotImplementedError
+
+    gathered_depth_metrics = []
+    for idx in tqdm(range(len(depth_pathes))):
+        pred_depth = np.load(pred_pathes[idx])
+        gt_depth = get_gt_depth(depth_pathes[idx], args.eval_dataset)
+        pred_depth = cv2.resize(
+            pred_depth,
+            (gt_depth.shape[1], gt_depth.shape[0]),
+            interpolation=cv2.INTER_CUBIC,
+        )
+        if args.eval_dataset == "nyu":
+            depth_results, error_map, depth_predict, depth_gt = depth_evaluation(
+                pred_depth, gt_depth, max_depth=None, lr=1e-3
+            )
+        elif args.eval_dataset == "sintel":
+            depth_results, error_map, depth_predict, depth_gt = depth_evaluation(
+                pred_depth, gt_depth, max_depth=70, use_gpu=True, post_clip_max=70
+            )
+        elif args.eval_dataset == "bonn":
+            depth_results, error_map, depth_predict, depth_gt = depth_evaluation(
+                pred_depth, gt_depth, max_depth=70, use_gpu=True
+            )
+        elif args.eval_dataset == "kitti":
+            depth_results, error_map, depth_predict, depth_gt = depth_evaluation(
+                pred_depth, gt_depth, max_depth=None, use_gpu=True
+            )
+        gathered_depth_metrics.append(depth_results)
+
+    depth_log_path = os.path.join(args.output_dir, "metric.json")
+    average_metrics = {
+        key: np.average(
+            [metrics[key] for metrics in gathered_depth_metrics],
+            weights=[metrics["valid_pixels"] for metrics in gathered_depth_metrics],
+        )
+        for key in gathered_depth_metrics[0].keys()
+        if key != "valid_pixels"
+    }
+    print(f"{args.eval_dataset} - Average depth evaluation metrics:", average_metrics)
+    with open(depth_log_path, "w") as f:
+        f.write(json.dumps(average_metrics))
+
+
+if __name__ == "__main__":
+    args = get_args_parser()
+    args = args.parse_args()
+    main(args)
diff --git a/extern/CUT3R/eval/monodepth/launch.py b/extern/CUT3R/eval/monodepth/launch.py
new file mode 100644
index 0000000000000000000000000000000000000000..5016a63211aac5fdbb3563ccc5c841c3752966fc
--- /dev/null
+++ b/extern/CUT3R/eval/monodepth/launch.py
@@ -0,0 +1,133 @@
+import torch
+import numpy as np
+import cv2
+import glob
+import argparse
+from pathlib import Path
+from tqdm import tqdm
+from copy import deepcopy
+from scipy.optimize import minimize
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
+from collections import defaultdict
+from eval.monodepth.metadata import dataset_metadata
+from add_ckpt_path import add_path_to_dust3r
+
+
+def get_args_parser():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--weights", type=str, help="path to the model weights", default=""
+    )
+
+    parser.add_argument("--device", type=str, default="cuda", help="pytorch device")
+    parser.add_argument("--output_dir", type=str, default="", help="value for outdir")
+    parser.add_argument(
+        "--no_crop", type=bool, default=True, help="whether to crop input data"
+    )
+    parser.add_argument(
+        "--full_seq", type=bool, default=False, help="whether to use all seqs"
+    )
+    parser.add_argument("--seq_list", default=None)
+
+    parser.add_argument(
+        "--eval_dataset", type=str, default="nyu", choices=list(dataset_metadata.keys())
+    )
+    return parser
+
+
+def eval_mono_depth_estimation(args, model, device):
+    metadata = dataset_metadata.get(args.eval_dataset)
+    if metadata is None:
+        raise ValueError(f"Unknown dataset: {args.eval_dataset}")
+
+    img_path = metadata.get("img_path")
+    if "img_path_func" in metadata:
+        img_path = metadata["img_path_func"](args)
+
+    process_func = metadata.get("process_func")
+    if process_func is None:
+        raise ValueError(
+            f"No processing function defined for dataset: {args.eval_dataset}"
+        )
+
+    for filelist, save_dir in process_func(args, img_path):
+        Path(save_dir).mkdir(parents=True, exist_ok=True)
+        eval_mono_depth(args, model, device, filelist, save_dir=save_dir)
+
+
+def eval_mono_depth(args, model, device, filelist, save_dir=None):
+    model.eval()
+    load_img_size = 512
+    for file in tqdm(filelist):
+        # construct the "image pair" for the single image
+        file = [file]
+        images = load_images(
+            file, size=load_img_size, verbose=False, crop=not args.no_crop
+        )
+        views = []
+        num_views = len(images)
+
+        for i in range(num_views):
+            view = {
+                "img": images[i]["img"],
+                "ray_map": torch.full(
+                    (
+                        images[i]["img"].shape[0],
+                        6,
+                        images[i]["img"].shape[-2],
+                        images[i]["img"].shape[-1],
+                    ),
+                    torch.nan,
+                ),
+                "true_shape": torch.from_numpy(images[i]["true_shape"]),
+                "idx": i,
+                "instance": str(i),
+                "camera_pose": torch.from_numpy(np.eye(4).astype(np.float32)).unsqueeze(
+                    0
+                ),
+                "img_mask": torch.tensor(True).unsqueeze(0),
+                "ray_mask": torch.tensor(False).unsqueeze(0),
+                "update": torch.tensor(True).unsqueeze(0),
+                "reset": torch.tensor(False).unsqueeze(0),
+            }
+            views.append(view)
+
+        outputs, state_args = inference(views, model, device)
+        pts3ds_self = [output["pts3d_in_self_view"].cpu() for output in outputs["pred"]]
+        depth_map = pts3ds_self[0][..., -1].mean(dim=0)
+
+        if save_dir is not None:
+            # save the depth map to the save_dir as npy
+            np.save(
+                f"{save_dir}/{file[0].split('/')[-1].replace('.png','depth.npy')}",
+                depth_map.cpu().numpy(),
+            )
+            # also save the png
+            depth_map = (depth_map - depth_map.min()) / (
+                depth_map.max() - depth_map.min()
+            )
+            depth_map = (depth_map * 255).cpu().numpy().astype(np.uint8)
+            cv2.imwrite(
+                f"{save_dir}/{file[0].split('/')[-1].replace('.png','depth.png')}",
+                depth_map,
+            )
+
+
+if __name__ == "__main__":
+    args = get_args_parser()
+    args = args.parse_args()
+    if args.eval_dataset == "sintel":
+        args.full_seq = True
+    else:
+        args.full_seq = False
+    add_path_to_dust3r(args.weights)
+    from dust3r.utils.image import load_images_for_eval as load_images
+    from dust3r.inference import inference
+    from dust3r.model import ARCroco3DStereo
+
+    model = ARCroco3DStereo.from_pretrained(args.weights).to(args.device)
+    eval_mono_depth_estimation(args, model, args.device)
diff --git a/extern/CUT3R/eval/monodepth/metadata.py b/extern/CUT3R/eval/monodepth/metadata.py
new file mode 100644
index 0000000000000000000000000000000000000000..459511277cff7cc319780dc665bb2fdba7acbe9d
--- /dev/null
+++ b/extern/CUT3R/eval/monodepth/metadata.py
@@ -0,0 +1,187 @@
+import os
+import glob
+from tqdm import tqdm
+
+# Settings per --eval_dataset name; launch.py reads "img_path", optional "img_path_func", and "process_func".
+dataset_metadata = {
+    "sun_rgbd": {  # NOTE(review): no "process_func" here although process_sunrgbd exists below - confirm
+        "img_path": "data/sun_rgbd/image/test",
+        "mask_path": None,
+    },
+    "davis": {
+        "img_path": "data/davis/DAVIS/JPEGImages/480p",
+        "mask_path": "data/davis/DAVIS/masked_images/480p",
+        "dir_path_func": lambda img_path, seq: os.path.join(img_path, seq),
+        "gt_traj_func": lambda img_path, anno_path, seq: None,
+        "traj_format": None,
+        "seq_list": None,
+        "full_seq": True,
+        "mask_path_seq_func": lambda mask_path, seq: os.path.join(mask_path, seq),
+        "skip_condition": None,
+        "process_func": None,  # Not used in mono depth estimation
+    },
+    "kitti": {
+        "img_path": "data/kitti/depth_selection/val_selection_cropped/image_gathered",  # Default path
+        "mask_path": None,
+        "dir_path_func": lambda img_path, seq: os.path.join(img_path, seq),
+        "gt_traj_func": lambda img_path, anno_path, seq: None,
+        "traj_format": None,
+        "seq_list": None,
+        "full_seq": True,
+        "mask_path_seq_func": lambda mask_path, seq: None,
+        "skip_condition": None,
+        "process_func": lambda args, img_path: process_kitti(args, img_path),
+    },
+    "bonn": {
+        "img_path": "data/bonn/rgbd_bonn_dataset",
+        "mask_path": None,
+        "dir_path_func": lambda img_path, seq: os.path.join(
+            img_path, f"rgbd_bonn_{seq}", "rgb_110"
+        ),
+        "gt_traj_func": lambda img_path, anno_path, seq: os.path.join(
+            img_path, f"rgbd_bonn_{seq}", "groundtruth_110.txt"
+        ),
+        "traj_format": "tum",
+        "seq_list": ["balloon2", "crowd2", "crowd3", "person_tracking2", "synchronous"],
+        "full_seq": False,
+        "mask_path_seq_func": lambda mask_path, seq: None,
+        "skip_condition": None,
+        "process_func": lambda args, img_path: process_bonn(args, img_path),
+    },
+    "nyu": {
+        "img_path": "data/nyu-v2/val/nyu_images",
+        "mask_path": None,
+        "process_func": lambda args, img_path: process_nyu(args, img_path),
+    },
+    "scannet": {
+        "img_path": "data/scannetv2",
+        "mask_path": None,
+        "dir_path_func": lambda img_path, seq: os.path.join(img_path, seq, "color_90"),
+        "gt_traj_func": lambda img_path, anno_path, seq: os.path.join(
+            img_path, seq, "pose_90.txt"
+        ),
+        "traj_format": "replica",
+        "seq_list": None,
+        "full_seq": True,
+        "mask_path_seq_func": lambda mask_path, seq: None,
+        "skip_condition": None,  # lambda save_dir, seq: os.path.exists(os.path.join(save_dir, seq)),
+        "process_func": lambda args, img_path: process_scannet(args, img_path),
+    },
+    "tum": {
+        "img_path": "data/tum",
+        "mask_path": None,
+        "dir_path_func": lambda img_path, seq: os.path.join(img_path, seq, "rgb_90"),
+        "gt_traj_func": lambda img_path, anno_path, seq: os.path.join(
+            img_path, seq, "groundtruth_90.txt"
+        ),
+        "traj_format": "tum",
+        "seq_list": None,
+        "full_seq": True,
+        "mask_path_seq_func": lambda mask_path, seq: None,
+        "skip_condition": None,
+        "process_func": None,  # launch.py raises ValueError when process_func is None
+    },
+    "sintel": {
+        "img_path": "data/sintel/training/final",
+        "anno_path": "data/sintel/training/camdata_left",
+        "mask_path": None,
+        "dir_path_func": lambda img_path, seq: os.path.join(img_path, seq),
+        "gt_traj_func": lambda img_path, anno_path, seq: os.path.join(anno_path, seq),
+        "traj_format": None,
+        "seq_list": [
+            "alley_2",
+            "ambush_4",
+            "ambush_5",
+            "ambush_6",
+            "cave_2",
+            "cave_4",
+            "market_2",
+            "market_5",
+            "market_6",
+            "shaman_3",
+            "sleeping_1",
+            "sleeping_2",
+            "temple_2",
+            "temple_3",
+        ],
+        "full_seq": False,
+        "mask_path_seq_func": lambda mask_path, seq: None,
+        "skip_condition": None,
+        "process_func": lambda args, img_path: process_sintel(args, img_path),
+    },
+}
+
+
+# Define processing functions for each dataset
+def process_kitti(args, img_path):
+    for dir in tqdm(sorted(glob.glob(f"{img_path}/*"))):
+        filelist = sorted(glob.glob(f"{dir}/*.png"))
+        save_dir = f"{args.output_dir}/{os.path.basename(dir)}"
+        yield filelist, save_dir
+
+
+def process_bonn(args, img_path):
+    if args.full_seq:
+        for dir in tqdm(sorted(glob.glob(f"{img_path}/*/"))):
+            filelist = sorted(glob.glob(f"{dir}/rgb/*.png"))
+            save_dir = f"{args.output_dir}/{os.path.basename(os.path.dirname(dir))}"
+            yield filelist, save_dir
+    else:
+        seq_list = (
+            ["balloon2", "crowd2", "crowd3", "person_tracking2", "synchronous"]
+            if args.seq_list is None
+            else args.seq_list
+        )
+        for seq in tqdm(seq_list):
+            filelist = sorted(glob.glob(f"{img_path}/rgbd_bonn_{seq}/rgb_110/*.png"))
+            save_dir = f"{args.output_dir}/{seq}"
+            yield filelist, save_dir
+
+
+def process_sunrgbd(args, img_path):
+    filelist = sorted(glob.glob(f"{img_path}/*.jpg"))
+    save_dir = f"{args.output_dir}"
+    yield filelist, save_dir
+
+
+def process_nyu(args, img_path):
+    filelist = sorted(glob.glob(f"{img_path}/*.png"))
+    save_dir = f"{args.output_dir}"
+    yield filelist, save_dir
+
+
+def process_scannet(args, img_path):
+    seq_list = sorted(glob.glob(f"{img_path}/*"))
+    for seq in tqdm(seq_list):
+        filelist = sorted(glob.glob(f"{seq}/color_90/*.jpg"))
+        save_dir = f"{args.output_dir}/{os.path.basename(seq)}"
+        yield filelist, save_dir
+
+
+def process_sintel(args, img_path):
+    if args.full_seq:
+        for dir in tqdm(sorted(glob.glob(f"{img_path}/*/"))):
+            filelist = sorted(glob.glob(f"{dir}/*.png"))
+            save_dir = f"{args.output_dir}/{os.path.basename(os.path.dirname(dir))}"
+            yield filelist, save_dir
+    else:
+        seq_list = [
+            "alley_2",
+            "ambush_4",
+            "ambush_5",
+            "ambush_6",
+            "cave_2",
+            "cave_4",
+            "market_2",
+            "market_5",
+            "market_6",
+            "shaman_3",
+            "sleeping_1",
+            "sleeping_2",
+            "temple_2",
+            "temple_3",
+        ]
+        for seq in tqdm(seq_list):
+            filelist = sorted(glob.glob(f"{img_path}/{seq}/*.png"))
+            save_dir = f"{args.output_dir}/{seq}"
+            yield filelist, save_dir
diff --git a/extern/CUT3R/eval/monodepth/run.sh b/extern/CUT3R/eval/monodepth/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..5e442116b94e80e7bb832ae7d433493cebe4137d
--- /dev/null
+++ b/extern/CUT3R/eval/monodepth/run.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+set -e
+# Monodepth evaluation: run inference for every dataset, then compute its metrics.
+workdir='.'
+model_name='ours'
+ckpt_name='cut3r_512_dpt_4_64'
+model_weights="${workdir}/src/${ckpt_name}.pth"
+datasets=('sintel' 'bonn' 'kitti' 'nyu')
+# Pass 1: launch.py writes the predicted depth maps under each dataset's output_dir.
+for data in "${datasets[@]}"; do
+    output_dir="${workdir}/eval_results/monodepth/${data}_${model_name}"
+    echo "$output_dir"
+    python eval/monodepth/launch.py \
+        --weights "$model_weights" \
+        --output_dir "$output_dir" \
+        --eval_dataset "$data"
+done
+# Pass 2: eval_metrics.py reads those predictions back from the same output_dir.
+for data in "${datasets[@]}"; do
+    output_dir="${workdir}/eval_results/monodepth/${data}_${model_name}"
+    python eval/monodepth/eval_metrics.py \
+        --output_dir "$output_dir" \
+        --eval_dataset "$data"
+done
+
diff --git a/extern/CUT3R/eval/monodepth/tools.py b/extern/CUT3R/eval/monodepth/tools.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6786fa6f25def0110ce22dbb7d44a7a08c952c8
--- /dev/null
+++ b/extern/CUT3R/eval/monodepth/tools.py
@@ -0,0 +1,399 @@
+import torch
+import numpy as np
+import cv2
+import glob
+import argparse
+from pathlib import Path
+from tqdm import tqdm
+from copy import deepcopy
+from scipy.optimize import minimize
+import os
+from collections import defaultdict
+
+
+def group_by_directory(pathes, idx=-1):
+    """
+    Groups the file paths based on the second-to-last directory in their paths.
+
+    Parameters:
+    - pathes (list): List of file paths.
+
+    Returns:
+    - dict: A dictionary where keys are the second-to-last directory names and values are lists of file paths.
+    """
+    grouped_pathes = defaultdict(list)
+
+    for path in pathes:
+        # Extract the second-to-last directory
+        dir_name = os.path.dirname(path).split("/")[idx]
+        grouped_pathes[dir_name].append(path)
+
+    return grouped_pathes
+
+
+def depth2disparity(depth, return_mask=False):
+    if isinstance(depth, torch.Tensor):
+        disparity = torch.zeros_like(depth)
+    elif isinstance(depth, np.ndarray):
+        disparity = np.zeros_like(depth)
+    non_negtive_mask = depth > 0
+    disparity[non_negtive_mask] = 1.0 / depth[non_negtive_mask]
+    if return_mask:
+        return disparity, non_negtive_mask
+    else:
+        return disparity
+
+
+def absolute_error_loss(params, predicted_depth, ground_truth_depth):
+    s, t = params
+
+    predicted_aligned = s * predicted_depth + t
+
+    abs_error = np.abs(predicted_aligned - ground_truth_depth)
+    return np.sum(abs_error)
+
+
+def absolute_value_scaling(predicted_depth, ground_truth_depth, s=1, t=0):
+    predicted_depth_np = predicted_depth.cpu().numpy().reshape(-1)
+    ground_truth_depth_np = ground_truth_depth.cpu().numpy().reshape(-1)
+
+    initial_params = [s, t]  # s = 1, t = 0
+
+    result = minimize(
+        absolute_error_loss,
+        initial_params,
+        args=(predicted_depth_np, ground_truth_depth_np),
+    )
+
+    s, t = result.x
+    return s, t
+
+
+def absolute_value_scaling2(
+    predicted_depth,
+    ground_truth_depth,
+    s_init=1.0,
+    t_init=0.0,
+    lr=1e-4,
+    max_iters=1000,
+    tol=1e-6,
+):
+    # Initialize s and t as torch tensors with requires_grad=True
+    s = torch.tensor(
+        [s_init],
+        requires_grad=True,
+        device=predicted_depth.device,
+        dtype=predicted_depth.dtype,
+    )
+    t = torch.tensor(
+        [t_init],
+        requires_grad=True,
+        device=predicted_depth.device,
+        dtype=predicted_depth.dtype,
+    )
+
+    optimizer = torch.optim.Adam([s, t], lr=lr)
+
+    prev_loss = None
+
+    for i in range(max_iters):
+        optimizer.zero_grad()
+
+        # Compute predicted aligned depth
+        predicted_aligned = s * predicted_depth + t
+
+        # Compute absolute error
+        abs_error = torch.abs(predicted_aligned - ground_truth_depth)
+
+        # Compute loss
+        loss = torch.sum(abs_error)
+
+        # Backpropagate
+        loss.backward()
+
+        # Update parameters
+        optimizer.step()
+
+        # Check convergence
+        if prev_loss is not None and torch.abs(prev_loss - loss) < tol:
+            break
+
+        prev_loss = loss.item()
+
+    return s.detach().item(), t.detach().item()
+
+
+def depth_evaluation(
+    predicted_depth_original,
+    ground_truth_depth_original,
+    max_depth=80,
+    custom_mask=None,
+    post_clip_min=None,
+    post_clip_max=None,
+    pre_clip_min=None,
+    pre_clip_max=None,
+    align_with_lstsq=False,
+    align_with_lad=False,
+    align_with_lad2=False,
+    metric_scale=False,
+    lr=1e-4,
+    max_iters=1000,
+    use_gpu=False,
+    align_with_scale=False,
+    disp_input=False,
+):
+    """
+    Evaluate the depth map using various metrics and return a depth error parity map, with an option for least squares alignment.
+
+    Args:
+        predicted_depth (numpy.ndarray or torch.Tensor): The predicted depth map.
+        ground_truth_depth (numpy.ndarray or torch.Tensor): The ground truth depth map.
+        max_depth (float): The maximum depth value to consider. Default is 80 meters.
+        align_with_lstsq (bool): If True, perform least squares alignment of the predicted depth with ground truth.
+
+    Returns:
+        dict: A dictionary containing the evaluation metrics.
+        torch.Tensor: The depth error parity map.
+    """
+    if isinstance(predicted_depth_original, np.ndarray):
+        predicted_depth_original = torch.from_numpy(predicted_depth_original)
+    if isinstance(ground_truth_depth_original, np.ndarray):
+        ground_truth_depth_original = torch.from_numpy(ground_truth_depth_original)
+    if custom_mask is not None and isinstance(custom_mask, np.ndarray):
+        custom_mask = torch.from_numpy(custom_mask)
+
+    # if the dimension is 3, flatten to 2d along the batch dimension
+    if predicted_depth_original.dim() == 3:
+        _, h, w = predicted_depth_original.shape
+        predicted_depth_original = predicted_depth_original.view(-1, w)
+        ground_truth_depth_original = ground_truth_depth_original.view(-1, w)
+        if custom_mask is not None:
+            custom_mask = custom_mask.view(-1, w)
+
+    # put to device
+    if use_gpu:
+        predicted_depth_original = predicted_depth_original.cuda()
+        ground_truth_depth_original = ground_truth_depth_original.cuda()
+
+    # Filter out depths greater than max_depth
+    if max_depth is not None:
+        mask = (ground_truth_depth_original > 0) & (
+            ground_truth_depth_original < max_depth
+        )
+    else:
+        mask = ground_truth_depth_original > 0
+    predicted_depth = predicted_depth_original[mask]
+    ground_truth_depth = ground_truth_depth_original[mask]
+
+    # Clip the depth values
+    if pre_clip_min is not None:
+        predicted_depth = torch.clamp(predicted_depth, min=pre_clip_min)
+    if pre_clip_max is not None:
+        predicted_depth = torch.clamp(predicted_depth, max=pre_clip_max)
+
+    if disp_input:  # align the pred to gt in the disparity space
+        real_gt = ground_truth_depth.clone()
+        ground_truth_depth = 1 / (ground_truth_depth + 1e-8)
+
+    # various alignment methods
+    if metric_scale:
+        predicted_depth = predicted_depth
+    elif align_with_lstsq:
+        # Convert to numpy for lstsq
+        predicted_depth_np = predicted_depth.cpu().numpy().reshape(-1, 1)
+        ground_truth_depth_np = ground_truth_depth.cpu().numpy().reshape(-1, 1)
+
+        # Add a column of ones for the shift term
+        A = np.hstack([predicted_depth_np, np.ones_like(predicted_depth_np)])
+
+        # Solve for scale (s) and shift (t) using least squares
+        result = np.linalg.lstsq(A, ground_truth_depth_np, rcond=None)
+        s, t = result[0][0], result[0][1]
+
+        # convert to torch tensor
+        s = torch.tensor(s, device=predicted_depth_original.device)
+        t = torch.tensor(t, device=predicted_depth_original.device)
+
+        # Apply scale and shift
+        predicted_depth = s * predicted_depth + t
+    elif align_with_lad:
+        s, t = absolute_value_scaling(
+            predicted_depth,
+            ground_truth_depth,
+            s=torch.median(ground_truth_depth) / torch.median(predicted_depth),
+        )
+        predicted_depth = s * predicted_depth + t
+    elif align_with_lad2:
+        s_init = (
+            torch.median(ground_truth_depth) / torch.median(predicted_depth)
+        ).item()
+        s, t = absolute_value_scaling2(
+            predicted_depth,
+            ground_truth_depth,
+            s_init=s_init,
+            lr=lr,
+            max_iters=max_iters,
+        )
+        predicted_depth = s * predicted_depth + t
+    elif align_with_scale:
+        # Compute initial scale factor 's' using the closed-form solution (L2 norm)
+        dot_pred_gt = torch.nanmean(ground_truth_depth)
+        dot_pred_pred = torch.nanmean(predicted_depth)
+        s = dot_pred_gt / dot_pred_pred
+
+        # Iterative reweighted least squares using the Weiszfeld method
+        for _ in range(10):
+            # Compute residuals between scaled predictions and ground truth
+            residuals = s * predicted_depth - ground_truth_depth
+            abs_residuals = (
+                residuals.abs() + 1e-8
+            )  # Add small constant to avoid division by zero
+
+            # Compute weights inversely proportional to the residuals
+            weights = 1.0 / abs_residuals
+
+            # Update 's' using weighted sums
+            weighted_dot_pred_gt = torch.sum(
+                weights * predicted_depth * ground_truth_depth
+            )
+            weighted_dot_pred_pred = torch.sum(weights * predicted_depth**2)
+            s = weighted_dot_pred_gt / weighted_dot_pred_pred
+
+        # Optionally clip 's' to prevent extreme scaling
+        s = s.clamp(min=1e-3)
+
+        # Detach 's' if you want to stop gradients from flowing through it
+        s = s.detach()
+
+        # Apply the scale factor to the predicted depth
+        predicted_depth = s * predicted_depth
+
+    else:
+        # Align the predicted depth with the ground truth using median scaling
+        scale_factor = torch.median(ground_truth_depth) / torch.median(predicted_depth)
+        predicted_depth *= scale_factor
+
+    if disp_input:
+        # convert back to depth
+        ground_truth_depth = real_gt
+        predicted_depth = depth2disparity(predicted_depth)
+
+    # Clip the predicted depth values
+    if post_clip_min is not None:
+        predicted_depth = torch.clamp(predicted_depth, min=post_clip_min)
+    if post_clip_max is not None:
+        predicted_depth = torch.clamp(predicted_depth, max=post_clip_max)
+
+    if custom_mask is not None:
+        assert custom_mask.shape == ground_truth_depth_original.shape
+        mask_within_mask = custom_mask.cpu()[mask]
+        predicted_depth = predicted_depth[mask_within_mask]
+        ground_truth_depth = ground_truth_depth[mask_within_mask]
+
+    # Calculate the metrics
+    abs_rel = torch.mean(
+        torch.abs(predicted_depth - ground_truth_depth) / ground_truth_depth
+    ).item()
+    sq_rel = torch.mean(
+        ((predicted_depth - ground_truth_depth) ** 2) / ground_truth_depth
+    ).item()
+
+    # Correct RMSE calculation
+    rmse = torch.sqrt(torch.mean((predicted_depth - ground_truth_depth) ** 2)).item()
+
+    # Clip the depth values to avoid log(0)
+    predicted_depth = torch.clamp(predicted_depth, min=1e-5)
+    log_rmse = torch.sqrt(
+        torch.mean((torch.log(predicted_depth) - torch.log(ground_truth_depth)) ** 2)
+    ).item()
+
+    # Calculate the accuracy thresholds
+    max_ratio = torch.maximum(
+        predicted_depth / ground_truth_depth, ground_truth_depth / predicted_depth
+    )
+    threshold_0 = torch.mean((max_ratio < 1.0).float()).item()
+    threshold_1 = torch.mean((max_ratio < 1.25).float()).item()
+    threshold_2 = torch.mean((max_ratio < 1.25**2).float()).item()
+    threshold_3 = torch.mean((max_ratio < 1.25**3).float()).item()
+
+    # Compute the depth error parity map
+    if metric_scale:
+        predicted_depth_original = predicted_depth_original
+        if disp_input:
+            predicted_depth_original = depth2disparity(predicted_depth_original)
+        depth_error_parity_map = (
+            torch.abs(predicted_depth_original - ground_truth_depth_original)
+            / ground_truth_depth_original
+        )
+    elif align_with_lstsq or align_with_lad or align_with_lad2:
+        predicted_depth_original = predicted_depth_original * s + t
+        if disp_input:
+            predicted_depth_original = depth2disparity(predicted_depth_original)
+        depth_error_parity_map = (
+            torch.abs(predicted_depth_original - ground_truth_depth_original)
+            / ground_truth_depth_original
+        )
+    elif align_with_scale:
+        predicted_depth_original = predicted_depth_original * s
+        if disp_input:
+            predicted_depth_original = depth2disparity(predicted_depth_original)
+        depth_error_parity_map = (
+            torch.abs(predicted_depth_original - ground_truth_depth_original)
+            / ground_truth_depth_original
+        )
+    else:
+        predicted_depth_original = predicted_depth_original * scale_factor
+        if disp_input:
+            predicted_depth_original = depth2disparity(predicted_depth_original)
+        depth_error_parity_map = (
+            torch.abs(predicted_depth_original - ground_truth_depth_original)
+            / ground_truth_depth_original
+        )
+
+    # Reshape the depth_error_parity_map back to the original image size
+    depth_error_parity_map_full = torch.zeros_like(ground_truth_depth_original)
+    depth_error_parity_map_full = torch.where(
+        mask, depth_error_parity_map, depth_error_parity_map_full
+    )
+
+    predict_depth_map_full = predicted_depth_original
+    gt_depth_map_full = torch.zeros_like(ground_truth_depth_original)
+    gt_depth_map_full = torch.where(
+        mask, ground_truth_depth_original, gt_depth_map_full
+    )
+
+    num_valid_pixels = (
+        torch.sum(mask).item()
+        if custom_mask is None
+        else torch.sum(mask_within_mask).item()
+    )
+    if num_valid_pixels == 0:
+        (
+            abs_rel,
+            sq_rel,
+            rmse,
+            log_rmse,
+            threshold_0,
+            threshold_1,
+            threshold_2,
+            threshold_3,
+        ) = (0, 0, 0, 0, 0, 0, 0, 0)
+
+    results = {
+        "Abs Rel": abs_rel,
+        "Sq Rel": sq_rel,
+        "RMSE": rmse,
+        "Log RMSE": log_rmse,
+        "δ < 1.": threshold_0,
+        "δ < 1.25": threshold_1,
+        "δ < 1.25^2": threshold_2,
+        "δ < 1.25^3": threshold_3,
+        "valid_pixels": num_valid_pixels,
+    }
+
+    return (
+        results,
+        depth_error_parity_map_full,
+        predict_depth_map_full,
+        gt_depth_map_full,
+    )
diff --git a/extern/CUT3R/eval/mv_recon/base.py b/extern/CUT3R/eval/mv_recon/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..422259107e977098d8fba07ec76d3b50e1006d2c
--- /dev/null
+++ b/extern/CUT3R/eval/mv_recon/base.py
@@ -0,0 +1,273 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# base class for implementing datasets
+# --------------------------------------------------------
+import PIL
+import numpy as np
+import torch
+
+from eval.mv_recon.dataset_utils.transforms import ImgNorm
+from dust3r.utils.geometry import depthmap_to_absolute_camera_coordinates
+import eval.mv_recon.dataset_utils.cropping as cropping
+
+
+class BaseStereoViewDataset:
+    """Define all basic options.
+
+    Usage:
+        class MyDataset (BaseStereoViewDataset):
+            def _get_views(self, idx, rng):
+                # overload here
+                views = []
+                views.append(dict(img=, ...))
+                return views
+    """
+
+    def __init__(
+        self,
+        *,  # only keyword arguments
+        split=None,
+        resolution=None,  # square_size or (width, height) or list of [(width,height), ...]
+        transform=ImgNorm,
+        aug_crop=False,
+        seed=None,
+    ):
+        """Store the dataset options; `transform` is a callable or a string naming one."""
+        self.num_views = 2
+        self.split = split
+        self._set_resolutions(resolution)
+        # Resolve string specs BEFORE storing them (eval runs code: trusted config only).
+        if isinstance(transform, str):
+            transform = eval(transform)
+        self.transform = transform
+        self.aug_crop = aug_crop
+        self.seed = seed
+
+    def __len__(self):  # dataset length = number of entries in self.scenes
+        return len(self.scenes)  # NOTE(review): self.scenes is not assigned in this class; presumably set by subclasses
+
+    def get_stats(self):  # short size summary; this is the text __repr__ embeds
+        return f"{len(self)} pairs"
+
+    def __repr__(self):  # one-line summary: class name, stats, split, seed, resolutions, transform
+        resolutions_str = "[" + ";".join(f"{w}x{h}" for w, h in self._resolutions) + "]"  # "WxH" entries joined by ";"
+        return (  # the template below is written over several lines and flattened afterwards
+            f"""{type(self).__name__}({self.get_stats()},
+            {self.split=},
+            {self.seed=},
+            resolutions={resolutions_str},
+            {self.transform=})""".replace(
+                "self.", ""  # the {expr=} form prints "self.split=..."; drop the "self." prefix
+            )
+            .replace("\n", "")  # join the template lines
+            .replace("   ", "")  # remove the template indentation, three spaces at a time
+        )
+
+    def _get_views(self, idx, resolution, rng):  # subclass hook: __getitem__ expects an iterable of view dicts back
+        raise NotImplementedError()
+
+    def __getitem__(self, idx):  # fetch the views of one sample, validate them and attach derived fields
+        if isinstance(idx, tuple):
+            # the idx is specifying the aspect-ratio
+            idx, ar_idx = idx
+        else:
+            assert len(self._resolutions) == 1
+            ar_idx = 0
+
+        # set-up the rng
+        if self.seed:  # reseed for each __getitem__; NOTE(review): seed=0 is falsy and takes the branch below -- confirm intended
+            self._rng = np.random.default_rng(seed=self.seed + idx)
+        elif not hasattr(self, "_rng"):
+            seed = torch.initial_seed()  # this is different for each dataloader process
+            self._rng = np.random.default_rng(seed=seed)
+
+        # over-loaded code
+        resolution = self._resolutions[
+            ar_idx
+        ]  # DO NOT CHANGE THIS (compatible with BatchedRandomSampler)
+        views = self._get_views(idx, resolution, self._rng)
+
+        # check data-types
+        for v, view in enumerate(views):
+            assert (
+                "pts3d" not in view
+            ), f"pts3d should not be there, they will be computed afterwards based on intrinsics+depthmap for view {view_name(view)}"
+            view["idx"] = v  # position of this view inside the sample
+
+            # encode the image
+            width, height = view["img"].size  # read before the transform replaces view["img"]
+            view["true_shape"] = np.int32((height, width))
+            view["img"] = self.transform(view["img"])
+            # required keys: camera_intrinsics always, camera_pose is filled with NaN when absent
+            assert "camera_intrinsics" in view
+            if "camera_pose" not in view:
+                view["camera_pose"] = np.full((4, 4), np.nan, dtype=np.float32)
+            else:
+                assert np.isfinite(
+                    view["camera_pose"]
+                ).all(), f"NaN in camera pose for view {view_name(view)}"
+            assert "pts3d" not in view
+            assert "valid_mask" not in view
+            assert np.isfinite(
+                view["depthmap"]
+            ).all(), f"NaN in depthmap for view {view_name(view)}"
+            pts3d, valid_mask = depthmap_to_absolute_camera_coordinates(**view)  # the whole view dict is passed as keyword arguments
+
+            view["pts3d"] = pts3d
+            view["valid_mask"] = valid_mask & np.isfinite(pts3d).all(axis=-1)  # also drop pixels whose 3D point is not finite
+
+            # check all datatypes
+            for key, val in view.items():
+                res, err_msg = is_good_type(key, val)
+                assert res, f"{err_msg} with {key}={val} for view {view_name(view)}"
+            K = view["camera_intrinsics"]  # NOTE(review): K is not used anywhere below
+            view["img_mask"] = True
+            view["ray_mask"] = False
+            view["ray_map"] = torch.full(
+                (6, view["img"].shape[-2], view["img"].shape[-1]), torch.nan
+            )  # NaN-filled placeholder with 6 channels at the transformed image's last two dims
+            view["update"] = True
+            view["reset"] = False
+
+        # last thing done!
+        for view in views:
+            # transpose to make sure all views are the same size
+            transpose_to_landscape(view)
+            # this allows to check whether the RNG is in the same state each time
+            view["rng"] = int.from_bytes(self._rng.bytes(4), "big")
+        return views
+
+    def _set_resolutions(self, resolutions):
+        """Normalize `resolutions` into self._resolutions, a list of (width, height) int tuples.
+        Params:
+            - resolutions: int (square size), a (width, height) pair, or a list of either
+        """
+        assert resolutions is not None, "undefined resolution"
+        # anything that is not a list is treated as one single resolution
+        if not isinstance(resolutions, list):
+            resolutions = [resolutions]
+
+        self._resolutions = []
+        for resolution in resolutions:
+            if isinstance(resolution, int):
+                width = height = resolution  # a bare int means a square resolution
+            else:
+                width, height = resolution
+            assert isinstance(
+                width, int
+            ), f"Bad type for {width=} {type(width)=}, should be int"
+            assert isinstance(
+                height, int
+            ), f"Bad type for {height=} {type(height)=}, should be int"
+            assert width >= height  # only landscape or square resolutions are accepted
+            self._resolutions.append((width, height))
+
+    def _crop_resize_if_necessary(
+        self, image, depthmap, intrinsics, resolution, rng=None, info=None
+    ):
+        """Bring image, depthmap and intrinsics to `resolution` in three steps:
+        - crop a window centered on the principal point read from `intrinsics`,
+        - rescale to the target resolution, then apply a final crop; returns the updated triple.
+        """
+        if not isinstance(image, PIL.Image.Image):
+            image = PIL.Image.fromarray(image)
+
+        # downscale with lanczos interpolation so that image.size == resolution
+        # cropping centered on the principal point
+        W, H = image.size
+        cx, cy = intrinsics[:2, 2].round().astype(int)  # principal point, rounded to whole pixels
+
+        # calculate min distance to margin
+        min_margin_x = min(cx, W - cx)
+        min_margin_y = min(cy, H - cy)
+        assert min_margin_x > W / 5, f"Bad principal point in view={info}"
+        assert min_margin_y > H / 5, f"Bad principal point in view={info}"
+
+        ## Center crop
+        # Crop on the principal point, make it always centered
+        # the new window will be a rectangle of size (2*min_margin_x, 2*min_margin_y) centered on (cx,cy)
+        l, t = cx - min_margin_x, cy - min_margin_y
+        r, b = cx + min_margin_x, cy + min_margin_y
+        crop_bbox = (l, t, r, b)
+
+        image, depthmap, intrinsics = cropping.crop_image_depthmap(
+            image, depthmap, intrinsics, crop_bbox
+        )
+
+        # # transpose the resolution if necessary
+        W, H = image.size  # new size
+        assert resolution[0] >= resolution[1]
+        if H > 1.1 * W:
+            # image is portrait mode
+            resolution = resolution[::-1]
+        elif 0.9 < H / W < 1.1 and resolution[0] != resolution[1]:
+            # image is square, so we chose (portrait, landscape) randomly
+            if rng.integers(2):  # NOTE(review): rng defaults to None, so this branch needs a caller-supplied rng
+                resolution = resolution[::-1]
+
+        # high-quality Lanczos down-scaling
+        target_resolution = np.array(resolution)
+        # # if self.aug_crop > 1:
+        # #     target_resolution += rng.integers(0, self.aug_crop)
+        # if resolution != (224, 224):
+        #     halfw, halfh = ((2*(W//2))//16)*8, ((2*(H//2))//16)*8
+        #     ## Rescale with max factor, so  one of width or height might be larger than target_resolution
+        #     image, depthmap, intrinsics = cropping.rescale_image_depthmap(image, depthmap, intrinsics, (2*halfw, 2*halfh))
+        # else:
+        image, depthmap, intrinsics = cropping.rescale_image_depthmap(
+            image, depthmap, intrinsics, target_resolution
+        )
+        # actual cropping (if necessary) with bilinear interpolation
+        # if resolution == (224, 224):
+        intrinsics2 = cropping.camera_matrix_of_crop(
+            intrinsics, image.size, resolution, offset_factor=0.5
+        )
+        crop_bbox = cropping.bbox_from_intrinsics_in_out(
+            intrinsics, intrinsics2, resolution
+        )
+        image, depthmap, intrinsics = cropping.crop_image_depthmap(
+            image, depthmap, intrinsics, crop_bbox
+        )
+        return image, depthmap, intrinsics
+
+
+def is_good_type(key, v):
+    """Return (is_good, err_msg): str/int/tuple always pass, anything else must expose an allowed dtype."""
+    allowed_dtypes = (np.float32, torch.float32, bool, np.int32, np.int64, np.uint8)
+    # the isinstance test comes first so .dtype is never read on plain python values
+    if isinstance(v, (str, int, tuple)) or v.dtype in allowed_dtypes:
+        return True, None
+    return False, f"bad {v.dtype=}"
+
+
+def view_name(view, batch_index=None):
+    """Build the "dataset/label/instance" identifier of a view, optionally for one batch element."""
+    pick_one = batch_index not in (None, slice(None))
+    # the three fields are indexed only when a specific batch element was requested
+    db, label, instance = (
+        view[k][batch_index] if pick_one else view[k] for k in ("dataset", "label", "instance")
+    )
+    return f"{db}/{label}/{instance}"
+
+
+def transpose_to_landscape(view):
+    """Rotate a portrait view to landscape, in place; landscape/square views are untouched.
+
+    Swaps the two spatial axes of "img", "valid_mask", "depthmap" and "pts3d" and
+    reorders the rows of "camera_intrinsics". view["true_shape"] is left as it was.
+    """
+    height, width = view["true_shape"]
+    if width >= height:
+        return
+
+    # the image is checked as (3, height, width), so its spatial axes are 1 and 2
+    assert view["img"].shape == (3, height, width)
+    view["img"] = view["img"].swapaxes(1, 2)
+    # per-pixel maps are (height, width[, 3]): swap their two leading axes, one map at a time
+    for key, tail in (("valid_mask", ()), ("depthmap", ()), ("pts3d", (3,))):
+        assert view[key].shape == (height, width) + tail
+        view[key] = view[key].swapaxes(0, 1)
+    # transpose x and y pixels
+    view["camera_intrinsics"] = view["camera_intrinsics"][[1, 0, 2]]
diff --git a/extern/CUT3R/eval/mv_recon/criterion.py b/extern/CUT3R/eval/mv_recon/criterion.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b546dc6e05fe78efd9a45999e8b61fb06d0abe9
--- /dev/null
+++ b/extern/CUT3R/eval/mv_recon/criterion.py
@@ -0,0 +1,537 @@
+import torch
+import torch.nn as nn
+from copy import copy, deepcopy
+from dust3r.utils.misc import invalid_to_zeros, invalid_to_nans
+from dust3r.utils.geometry import inv, geotrf, depthmap_to_pts3d
+from dust3r.utils.camera import pose_encoding_to_camera
+
+
+class BaseCriterion(nn.Module):  # base class for criteria; it only records the reduction mode
+    def __init__(self, reduction="mean"):  # LLoss.forward (same file) handles "none", "sum" and "mean"
+        super().__init__()
+        self.reduction = reduction
+
+
+class Criterion(nn.Module):  # holds a BaseCriterion instance in self.criterion
+    def __init__(self, criterion=None):  # the None default is rejected by the assert below
+        super().__init__()
+        assert isinstance(
+            criterion, BaseCriterion
+        ), f"{criterion} is not a proper criterion!"
+        self.criterion = copy(criterion)  # shallow copy of the criterion that was passed in
+
+    def get_name(self):  # "<ClassName>(<criterion>)"
+        return f"{type(self).__name__}({self.criterion})"
+
+    def with_reduction(self, mode="none"):  # returns a deep copy; self is not modified
+        res = loss = deepcopy(self)
+        while loss is not None:  # walk the chain linked through _loss2
+            assert isinstance(loss, Criterion)
+            loss.criterion.reduction = mode  # make it return the loss for each sample
+            loss = loss._loss2  # we assume loss is a Multiloss
+        return res
+
+
+class MultiLoss(nn.Module):
+    """Easily combinable losses (also keep track of individual loss values):
+        loss = MyLoss1() + 0.1*MyLoss2()
+    Usage:
+        Inherit from this class and override get_name() and compute_loss()
+    """
+
+    def __init__(self):
+        super().__init__()
+        self._alpha = 1  # weight applied to this loss in forward()
+        self._loss2 = None  # next loss in the chain built by __add__, if any
+
+    def compute_loss(self, *args, **kwargs):  # to be overridden; forward() accepts a tensor or a (loss, details) tuple
+        raise NotImplementedError()
+
+    def get_name(self):  # to be overridden; used as the details key and in __repr__
+        raise NotImplementedError()
+
+    def __mul__(self, alpha):  # weight * loss: returns a shallow copy carrying the new weight
+        assert isinstance(alpha, (int, float))
+        res = copy(self)
+        res._alpha = alpha  # NOTE(review): replaces the weight, it is not multiplied into an existing one
+        return res
+
+    __rmul__ = __mul__  # same
+
+    def __add__(self, loss2):  # loss + loss2: append loss2 at the end of the _loss2 chain
+        assert isinstance(loss2, MultiLoss)
+        res = cur = copy(self)
+        # only the head is copied; the loop below walks the existing chain objects
+        while cur._loss2 is not None:
+            cur = cur._loss2
+        cur._loss2 = loss2
+        return res
+
+    def __repr__(self):  # e.g. "0.1*NameA + NameB", built from get_name(), _alpha and the chain
+        name = self.get_name()
+        if self._alpha != 1:
+            name = f"{self._alpha:g}*{name}"
+        if self._loss2:
+            name = f"{name} + {self._loss2}"
+        return name
+
+    def forward(self, *args, **kwargs):  # returns (weighted total loss, details dict)
+        loss = self.compute_loss(*args, **kwargs)
+        if isinstance(loss, tuple):
+            loss, details = loss
+        elif loss.ndim == 0:
+            details = {self.get_name(): float(loss)}  # scalar loss: log its unweighted value
+        else:
+            details = {}
+        loss = loss * self._alpha
+        # add the rest of the chain, called with the same arguments
+        if self._loss2:
+            loss2, details2 = self._loss2(*args, **kwargs)
+            loss = loss + loss2
+            details |= details2  # in-place dict merge (needs Python 3.9+)
+
+        return loss, details
+
+
+class LLoss(BaseCriterion):
+    """L-norm loss: forward() reduces the per-point result of distance(), which subclasses define."""
+
+    def forward(self, a, b):  # a and b must have the same shape, with a last axis of size 1 to 3
+        assert (
+            a.shape == b.shape and a.ndim >= 2 and 1 <= a.shape[-1] <= 3
+        ), f"Bad shape = {a.shape}"
+        dist = self.distance(a, b)
+        # apply the reduction stored by BaseCriterion.__init__
+        if self.reduction == "none":
+            return dist
+        if self.reduction == "sum":
+            return dist.sum()
+        if self.reduction == "mean":
+            return dist.mean() if dist.numel() > 0 else dist.new_zeros(())  # empty input gives a zero scalar instead of mean()
+        raise ValueError(f"bad {self.reduction=} mode")
+
+    def distance(self, a, b):  # to be overridden by subclasses
+        raise NotImplementedError()
+
+
+class L21Loss(LLoss):
+    """Euclidean distance between 3d points"""
+
+    def distance(self, a, b):
+        return torch.norm(a - b, dim=-1)  # L2 norm of the difference over the last axis (no normalization is applied)
+
+
+L21 = L21Loss()  # module-level instance built with the default reduction ("mean")
+
+
+def get_pred_pts3d(gt, pred, use_pose=False):
+    """Return the predicted 3D points for one view, from whichever source `pred` provides.
+    Raises KeyError (formerly UnboundLocalError) if no source matches and use_pose is False."""
+    if "depth" in pred and "pseudo_focal" in pred:
+        try:
+            pp = gt["camera_intrinsics"][..., :2, 2]
+        except KeyError:
+            pp = None
+        pts3d = depthmap_to_pts3d(**pred, pp=pp)
+    elif "pts3d" in pred:
+        # pts3d from my camera
+        pts3d = pred["pts3d"]
+    elif "pts3d_in_other_view" in pred:
+        # pts3d from the other camera, already transformed
+        assert use_pose is True
+        return pred["pts3d_in_other_view"]  # return!
+    elif not use_pose:  # nothing matched and the pose path below will not run either
+        raise KeyError(f"no 3D points found in pred (keys: {sorted(pred)})")
+    if use_pose:
+        camera_pose = pred.get("camera_pose")
+        pts3d = pred.get("pts3d_in_self_view")  # replaces whatever was selected above
+        assert camera_pose is not None
+        assert pts3d is not None
+        pts3d = geotrf(pose_encoding_to_camera(camera_pose), pts3d)
+    return pts3d
+
+
+def Sum(losses, masks, conf=None):
+    """Combine per-view losses: per-pixel losses pass through, scalar losses are added up.
+
+    Returns (losses, masks[, conf]) untouched when losses[0] has ndim > 0, else the total.
+    """
+    first, _first_mask = losses[0], masks[0]
+    if first.ndim > 0:  # we are actually returning the loss for every pixel
+        return (losses, masks) if conf is None else (losses, masks, conf)
+    total = first  # we are returning the global loss
+    for extra in losses[1:]:
+        total = total + extra
+    return total
+
+
+def get_norm_factor(pts, norm_mode="avg_dis", valids=None, fix_first=True):  # average point distance used to normalize `pts`
+    assert pts[0].ndim >= 3 and pts[0].shape[-1] == 3
+    assert pts[1] is None or (pts[1].ndim >= 3 and pts[1].shape[-1] == 3)  # note: pts needs at least two entries
+    norm_mode, dis_mode = norm_mode.split("_")  # e.g. "avg_dis" -> ("avg", "dis")
+
+    nan_pts = []
+    nnzs = []
+
+    if norm_mode == "avg":
+        # gather all points together (joint normalization)
+        # NOTE(review): valids defaults to None but is indexed here, so callers must pass it
+        for i, pt in enumerate(pts):
+            nan_pt, nnz = invalid_to_zeros(pt, valids[i], ndim=3)
+            nan_pts.append(nan_pt)
+            nnzs.append(nnz)
+
+            if fix_first:  # with fix_first only pts[0] contributes to the factor
+                break
+        all_pts = torch.cat(nan_pts, dim=1)
+
+        # compute distance to origin
+        all_dis = all_pts.norm(dim=-1)
+        if dis_mode == "dis":
+            pass  # do nothing
+        elif dis_mode == "log1p":
+            all_dis = torch.log1p(all_dis)
+        else:
+            raise ValueError(f"bad {dis_mode=}")
+        # NOTE(review): the numerator keeps dim 0 while the denominator is one total over all of nnzs -- confirm for batch > 1
+        norm_factor = all_dis.sum(dim=1) / (torch.cat(nnzs).sum() + 1e-8)
+    else:
+        raise ValueError(f"Not implemented {norm_mode=}")
+
+    norm_factor = norm_factor.clip(min=1e-8)  # keep the later division away from zero
+    while norm_factor.ndim < pts[0].ndim:  # add trailing singleton axes until it broadcasts against pts[0]
+        norm_factor.unsqueeze_(-1)
+
+    return norm_factor
+
+
+def normalize_pointcloud_t(
+    pts, norm_mode="avg_dis", valids=None, fix_first=True, gt=False
+):
+    """Divide every point map in `pts` by one shared normalization factor.
+
+    Args:
+        pts: sequence of point maps, forwarded to get_norm_factor.
+        norm_mode: normalization mode string, forwarded to get_norm_factor.
+        valids: validity masks, forwarded to get_norm_factor.
+        fix_first: forwarded to get_norm_factor.
+        gt: accepted for API compatibility; both settings normalize identically.
+
+    Returns:
+        (res, norm_factor): the list of normalized point maps and the factor used.
+    """
+    # `gt` does not alter the computation: ground-truth and predicted point maps
+    # go through exactly the same steps.
+    norm_factor = get_norm_factor(pts, norm_mode, valids, fix_first)
+
+    # every map is divided by the same factor
+    res = [pt / norm_factor for pt in pts]
+
+    # the factor is handed back together with the normalized maps
+    return res, norm_factor
+
+
+@torch.no_grad()
+def get_joint_pointcloud_depth(zs, valid_masks=None, quantile=0.5):
+    """Per-sample quantile (median by default) of the depths pooled over all views.
+
+    zs[i] and valid_masks[i] go through invalid_to_nans; each result is flattened to
+    (len(zs[i]), -1) and the views are concatenated along the last axis. Returns (B,).
+    """
+    pooled = torch.cat(
+        [
+            invalid_to_nans(z, valid_masks[i] if valid_masks is not None else None).reshape(len(z), -1)
+            for i, z in enumerate(zs)
+        ],
+        dim=-1,
+    )
+    if quantile == 0.5:  # compute median depth overall (ignoring nans)
+        return torch.nanmedian(pooled, dim=-1).values
+    return torch.nanquantile(pooled, quantile, dim=-1)
+
+
+@torch.no_grad()
+def get_joint_pointcloud_center_scale(pts, valid_masks=None, z_only=False, center=True):
+    # Returns (center, scale): NaN-ignoring median point and median norm over all views,
+    # indexed at the end so that they come back as 4-D tensors.
+    _pts = []
+    for i in range(len(pts)):
+        valid_mask = valid_masks[i] if valid_masks is not None else None
+        _pt = invalid_to_nans(pts[i], valid_mask).reshape(len(pts[i]), -1, 3)  # flatten each view to (len(pts[i]), N, 3)
+        _pts.append(_pt)
+
+    _pts = torch.cat(_pts, dim=1)  # pool the points of all views along dim 1
+
+    # compute median center
+    _center = torch.nanmedian(_pts, dim=1, keepdim=True).values  # (B,1,3)
+    if z_only:
+        _center[..., :2] = 0  # do not center X and Y
+
+    # compute median norm
+    _norm = ((_pts - _center) if center else _pts).norm(dim=-1)  # with center=False norms are measured from the origin
+    scale = torch.nanmedian(_norm, dim=1).values
+    return _center[:, None, :, :], scale[:, None, None, None]  # (B,1,1,3) and (B,1,1,1)
+
+
+class Regr3D_t(Criterion, MultiLoss):  # 3D point regression over a list of views, expressed in the first view's camera frame
+    def __init__(self, criterion, norm_mode="avg_dis", gt_scale=False, fix_first=True):
+        super().__init__(criterion)
+        self.norm_mode = norm_mode  # falsy value disables normalization in get_all_pts3d_t
+        self.gt_scale = gt_scale  # when True the ground-truth points are not normalized
+        self.fix_first = fix_first  # forwarded to normalize_pointcloud_t
+
+    def get_all_pts3d_t(self, gts, preds, dist_clip=None):  # returns (gt_pts, pr_pts, gt_factor, pr_factor, valids, {})
+        # everything is normalized w.r.t. camera of view1
+        in_camera1 = inv(gts[0]["camera_pose"])
+
+        gt_pts = []
+        valids = []
+        pr_pts = []
+
+        for i, gt in enumerate(gts):
+            # in_camera1: Bs, 4, 4 gt['pts3d']: Bs, H, W, 3
+            gt_pts.append(geotrf(in_camera1, gt["pts3d"]))
+
+            valid = gt["valid_mask"].clone()
+
+            if dist_clip is not None:
+                # points that are too far-away == invalid
+                dis = gt["pts3d"].norm(dim=-1)  # distance is measured on the untransformed gt["pts3d"]
+                valid = valid & (dis <= dist_clip)
+
+            valids.append(valid)
+            pr_pts.append(get_pred_pts3d(gt, preds[i], use_pose=True))
+            # if i != len(gts)-1:
+            #     pr_pts_l.append(get_pred_pts3d(gt, preds[i][0], use_pose=(i!=0)))
+
+            # if i != 0:
+            #     pr_pts_r.append(get_pred_pts3d(gt, preds[i-1][1], use_pose=(i!=0)))
+
+        # pr_pts = (pr_pts_l, pr_pts_r)
+
+        if self.norm_mode:
+            pr_pts, pr_factor = normalize_pointcloud_t(
+                pr_pts, self.norm_mode, valids, fix_first=self.fix_first, gt=False
+            )
+        else:
+            pr_factor = None
+
+        if self.norm_mode and not self.gt_scale:
+            gt_pts, gt_factor = normalize_pointcloud_t(
+                gt_pts, self.norm_mode, valids, fix_first=self.fix_first, gt=True
+            )
+        else:
+            gt_factor = None
+
+        return gt_pts, pr_pts, gt_factor, pr_factor, valids, {}
+
+    def compute_frame_loss(self, gts, preds, **kw):  # returns (Sum(...) result, details dict, factor_loss)
+        gt_pts, pred_pts, gt_factor, pr_factor, masks, monitoring = (
+            self.get_all_pts3d_t(gts, preds, **kw)
+        )
+        # NOTE(review): get_all_pts3d_t returns one entry per view, so this unpack only works for exactly two entries -- confirm
+        pred_pts_l, pred_pts_r = pred_pts
+
+        loss_all = []
+        mask_all = []
+        conf_all = []
+
+        loss_left = 0
+        loss_right = 0
+        pred_conf_l = 0
+        pred_conf_r = 0
+
+        for i in range(len(gt_pts)):
+            # NOTE(review): preds[i][0] / preds[i - 1][1] index preds as pairs, unlike get_all_pts3d_t which passes preds[i] on directly
+            # Left (Reference)
+            if i != len(gt_pts) - 1:
+                frame_loss = self.criterion(
+                    pred_pts_l[i][masks[i]], gt_pts[i][masks[i]]
+                )
+
+                loss_all.append(frame_loss)
+                mask_all.append(masks[i])
+                conf_all.append(preds[i][0]["conf"])
+
+                # To compare target/reference loss
+                if i != 0:
+                    loss_left += frame_loss.cpu().detach().numpy().mean()
+                    pred_conf_l += preds[i][0]["conf"].cpu().detach().numpy().mean()
+
+            # Right (Target)
+            if i != 0:
+                frame_loss = self.criterion(
+                    pred_pts_r[i - 1][masks[i]], gt_pts[i][masks[i]]
+                )
+
+                loss_all.append(frame_loss)
+                mask_all.append(masks[i])
+                conf_all.append(preds[i - 1][1]["conf"])
+
+                # To compare target/reference loss
+                if i != len(gt_pts) - 1:
+                    loss_right += frame_loss.cpu().detach().numpy().mean()
+                    pred_conf_r += preds[i - 1][1]["conf"].cpu().detach().numpy().mean()
+
+        if pr_factor is not None and gt_factor is not None:
+            filter_factor = pr_factor[pr_factor > gt_factor]  # keep only prediction factors larger than the gt factor
+        else:
+            filter_factor = []
+
+        if len(filter_factor) > 0:
+            factor_loss = (filter_factor - gt_factor).abs().mean()
+        else:
+            factor_loss = 0.0
+
+        self_name = type(self).__name__
+        details = {
+            self_name + "_pts3d_1": float(loss_all[0].mean()),
+            self_name + "_pts3d_2": float(loss_all[1].mean()),
+            self_name + "loss_left": float(loss_left),
+            self_name + "loss_right": float(loss_right),
+            self_name + "conf_left": float(pred_conf_l),
+            self_name + "conf_right": float(pred_conf_r),
+        }
+
+        return Sum(loss_all, mask_all, conf_all), (details | monitoring), factor_loss
+
+
+class ConfLoss_t(MultiLoss):
+    """Weighted regression by learned confidence.
+        Assuming the input pixel_loss is a pixel-level regression loss.
+
+    Principle (conf_loss = x * conf - alpha * log(conf), x being the pixel loss):
+        small conf value, e.g. conf = 0.1 ==> conf_loss = x / 10 + alpha*log(10)
+        large conf value, e.g. conf = 10  ==> conf_loss = x * 10 - alpha*log(10)
+
+        alpha: hyperparameter
+    """
+
+    def __init__(self, pixel_loss, alpha=1):
+        super().__init__()
+        assert alpha > 0
+        self.alpha = alpha
+        self.pixel_loss = pixel_loss.with_reduction("none")  # per-pixel losses are needed for the weighting
+
+    def get_name(self):
+        return f"ConfLoss({self.pixel_loss})"
+
+    def get_conf_log(self, x):  # returns the confidence together with its log
+        return x, torch.log(x)
+
+    def compute_frame_loss(self, gts, preds, **kw):  # returns (mean conf loss, details dict, loss_factor)
+        # compute per-pixel loss
+        (losses, masks, confs), details, loss_factor = (
+            self.pixel_loss.compute_frame_loss(gts, preds, **kw)
+        )
+
+        # weight by confidence
+        conf_losses = []
+        conf_sum = 0
+        for i in range(len(losses)):
+            conf, log_conf = self.get_conf_log(confs[i][masks[i]])
+            conf_sum += conf.mean()
+            conf_loss = losses[i] * conf - self.alpha * log_conf
+            conf_loss = conf_loss.mean() if conf_loss.numel() > 0 else conf_loss.new_zeros(())  # tensor zero: torch.stack below rejects a python int
+            conf_losses.append(conf_loss)
+
+        conf_losses = torch.stack(conf_losses) * 2.0
+        conf_loss_mean = conf_losses.mean()
+
+        return (
+            conf_loss_mean,
+            dict(
+                conf_loss_1=float(conf_losses[0]),
+                conf_loss2=float(conf_losses[1]),
+                conf_mean=conf_sum / len(losses),
+                **details,
+            ),
+            loss_factor,
+        )
+
+
+class Regr3D_t_ShiftInv(Regr3D_t):
+    """Same as Regr3D_t, but the per-sample median depth is subtracted from gt and prediction."""
+
+    def get_all_pts3d_t(self, gts, preds):  # NOTE(review): unlike the parent, this override takes no dist_clip
+        # compute unnormalized points
+        gt_pts, pred_pts, gt_factor, pr_factor, masks, monitoring = (
+            super().get_all_pts3d_t(gts, preds)
+        )
+
+        # pred_pts_l, pred_pts_r = pred_pts
+        gt_zs = [gt_pt[..., 2] for gt_pt in gt_pts]  # z coordinate of every ground-truth point map
+
+        pred_zs = [pred_pt[..., 2] for pred_pt in pred_pts]
+        # pred_zs.append(pred_pts_r[-1][..., 2])
+
+        # compute median depth
+        gt_shift_z = get_joint_pointcloud_depth(gt_zs, masks)[:, None, None]
+        pred_shift_z = get_joint_pointcloud_depth(pred_zs, masks)[:, None, None]
+
+        # subtract the median depth
+        for i in range(len(gt_pts)):
+            gt_pts[i][..., 2] -= gt_shift_z  # in-place update of the tensors returned by the parent
+
+        for i in range(len(pred_pts)):
+            # for j in range(len(pred_pts[i])):
+            pred_pts[i][..., 2] -= pred_shift_z
+
+        monitoring = dict(
+            monitoring,
+            gt_shift_z=gt_shift_z.mean().detach(),
+            pred_shift_z=pred_shift_z.mean().detach(),
+        )
+        return gt_pts, pred_pts, gt_factor, pr_factor, masks, monitoring
+
+
+class Regr3D_t_ScaleInv(Regr3D_t):
+    """Same as Regr3D_t, but rescales the point clouds by their median scale (scale invariance).
+    if gt_scale == True: enforce the prediction to take the same scale than GT
+    """
+
+    def get_all_pts3d_t(self, gts, preds):  # NOTE(review): unlike Regr3D_t, this override takes no dist_clip
+        # compute depth-normalized points
+        gt_pts, pred_pts, gt_factor, pr_factor, masks, monitoring = (
+            super().get_all_pts3d_t(gts, preds)
+        )
+
+        # measure scene scale
+
+        # pred_pts_l, pred_pts_r = pred_pts
+
+        pred_pts_all = [
+            x.clone() for x in pred_pts
+        ]  # [pred_pt for pred_pt in pred_pts_l]
+        # pred_pts_all.append(pred_pts_r[-1])
+
+        _, gt_scale = get_joint_pointcloud_center_scale(gt_pts, masks)
+        _, pred_scale = get_joint_pointcloud_center_scale(pred_pts_all, masks)
+
+        # prevent predictions to be in a ridiculous range
+        pred_scale = pred_scale.clip(min=1e-3, max=1e3)
+
+        # rescale the point clouds (in place)
+        if self.gt_scale:
+            for i in range(len(pred_pts)):
+                # for j in range(len(pred_pts[i])):
+                pred_pts[i] *= gt_scale / pred_scale  # bring the prediction to the ground-truth scale
+
+        else:
+            for i in range(len(pred_pts)):
+                # for j in range(len(pred_pts[i])):
+                pred_pts[i] *= pred_scale / gt_scale  # NOTE(review): the two factors in this branch are reciprocal -- confirm intended
+
+            for i in range(len(gt_pts)):
+                gt_pts[i] *= gt_scale / pred_scale
+
+        monitoring = dict(
+            monitoring, gt_scale=gt_scale.mean(), pred_scale=pred_scale.mean().detach()
+        )
+
+        return gt_pts, pred_pts, gt_factor, pr_factor, masks, monitoring
+
+
+class Regr3D_t_ScaleShiftInv(Regr3D_t_ScaleInv, Regr3D_t_ShiftInv):
+    # ScaleInv's super() call resolves to ShiftInv here, so the shift is applied first, then the scale
+    pass
diff --git a/extern/CUT3R/eval/mv_recon/data.py b/extern/CUT3R/eval/mv_recon/data.py
new file mode 100644
index 0000000000000000000000000000000000000000..de9925698cb62503c24deb4e79e35705c5f0e6c4
--- /dev/null
+++ b/extern/CUT3R/eval/mv_recon/data.py
@@ -0,0 +1,522 @@
+import os
+import cv2
+import json
+import numpy as np
+import os.path as osp
+from collections import deque
+import random
+from eval.mv_recon.base import BaseStereoViewDataset
+from dust3r.utils.image import imread_cv2
+import eval.mv_recon.dataset_utils.cropping as cropping
+
+
+def shuffle_deque(dq, seed=None):
+    """Return a new deque holding the items of ``dq`` in random order.
+
+    With ``seed`` set, a private ``random.Random(seed)`` makes the order
+    reproducible without reseeding the global ``random`` module.
+    """
+    shuffled_list = list(dq)
+    (random if seed is None else random.Random(seed)).shuffle(shuffled_list)
+    return deque(shuffled_list)
+
+
+class SevenScenes(BaseStereoViewDataset):
+    def __init__(
+        self,
+        num_seq=1,
+        num_frames=5,
+        min_thresh=10,
+        max_thresh=100,
+        test_id=None,
+        full_video=False,
+        tuple_list=None,
+        seq_id=None,
+        rebuttal=False,
+        shuffle_seed=-1,
+        kf_every=1,
+        *args,
+        ROOT,
+        **kwargs,
+    ):
+        self.ROOT = ROOT
+        super().__init__(*args, **kwargs)
+        self.num_seq = num_seq
+        self.num_frames = num_frames
+        self.max_thresh = max_thresh
+        self.min_thresh = min_thresh
+        self.test_id = test_id
+        self.full_video = full_video
+        self.kf_every = kf_every
+        self.seq_id = seq_id
+        self.rebuttal = rebuttal
+        self.shuffle_seed = shuffle_seed
+
+        # load all scenes
+        self.load_all_tuples(tuple_list)
+        self.load_all_scenes(ROOT)
+
+    def __len__(self):
+        if self.tuple_list is not None:
+            return len(self.tuple_list)
+        return len(self.scene_list) * self.num_seq
+
+    def load_all_tuples(self, tuple_list):
+        if tuple_list is not None:
+            self.tuple_list = tuple_list
+            # with open(tuple_path) as f:
+            #     self.tuple_list = f.read().splitlines()
+
+        else:
+            self.tuple_list = None
+
+    def load_all_scenes(self, base_dir):
+        """Populate ``self.scene_list`` with "<scene>/seq-XX" entries (fixed list when tuples are given)."""
+        if self.tuple_list is not None:
+            # Use pre-defined simplerecon scene_ids
+            self.scene_list = [
+                "stairs/seq-06",
+                "stairs/seq-02",
+                "pumpkin/seq-06",
+                "chess/seq-01",
+                "heads/seq-02",
+                "fire/seq-02",
+                "office/seq-03",
+                "pumpkin/seq-03",
+                "redkitchen/seq-07",
+                "chess/seq-02",
+                "office/seq-01",
+                "redkitchen/seq-01",
+                "fire/seq-01",
+            ]
+            print(f"Found {len(self.scene_list)} sequences in split {self.split}")
+            return
+
+        # Keep only scene directories; stray files (e.g. .DS_Store) have no split file.
+        scenes = [d for d in os.listdir(base_dir) if osp.isdir(osp.join(base_dir, d))]
+        file_split = {"train": "TrainSplit.txt", "test": "TestSplit.txt"}[self.split]
+
+        self.scene_list = []
+        for scene in scenes:
+            if self.test_id is not None and scene != self.test_id:
+                continue
+            # read file split
+            with open(osp.join(base_dir, scene, file_split)) as f:
+                seq_ids = f.read().splitlines()
+                for seq_id in seq_ids:
+                    # seq is string, take the int part and make it 01, 02, 03
+                    num_part = "".join(filter(str.isdigit, seq_id))
+                    if not num_part:  # blank line in the split file, nothing to add
+                        continue
+                    seq_id = f"seq-{num_part.zfill(2)}"
+                    if self.seq_id is not None and seq_id != self.seq_id:
+                        continue
+                    self.scene_list.append(f"{scene}/{seq_id}")
+
+        print(f"Found {len(self.scene_list)} sequences in split {self.split}")
+
+    def _get_views(self, idx, resolution, rng):
+        """Load the views (RGB, depth, pose, intrinsics) of one 7-Scenes sequence as a list of dicts."""
+        if self.tuple_list is not None:
+            line = self.tuple_list[idx].split(" ")
+            scene_id = line[0]
+            img_idxs = line[1:]
+
+        else:
+            scene_id = self.scene_list[idx // self.num_seq]
+
+            # all colour frames of the sequence, keeping every kf_every-th one
+            data_path = osp.join(self.ROOT, scene_id)
+            num_files = len([name for name in os.listdir(data_path) if "color" in name])
+            img_idxs = [f"{i:06d}" for i in range(num_files)]
+            img_idxs = img_idxs[:: self.kf_every]
+
+        # Intrinsics used in SimpleRecon
+        fx, fy, cx, cy = 525, 525, 320, 240
+        intrinsics_ = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32)
+
+        views = []
+        imgs_idxs = deque(img_idxs)
+        if self.shuffle_seed >= 0:
+            imgs_idxs = shuffle_deque(imgs_idxs, self.shuffle_seed)  # seeded: reproducible order
+
+        while len(imgs_idxs) > 0:
+            im_idx = imgs_idxs.popleft()
+            impath = osp.join(self.ROOT, scene_id, f"frame-{im_idx}.color.png")
+            depthpath = osp.join(self.ROOT, scene_id, f"frame-{im_idx}.depth.proj.png")
+            posepath = osp.join(self.ROOT, scene_id, f"frame-{im_idx}.pose.txt")
+
+            rgb_image = imread_cv2(impath)
+            depthmap = imread_cv2(depthpath, cv2.IMREAD_UNCHANGED)
+            rgb_image = cv2.resize(rgb_image, (depthmap.shape[1], depthmap.shape[0]))
+
+            depthmap[depthmap == 65535] = 0
+            depthmap = np.nan_to_num(depthmap.astype(np.float32), 0.0) / 1000.0
+            depthmap[depthmap > 10] = 0
+            depthmap[depthmap < 1e-3] = 0
+
+            camera_pose = np.loadtxt(posepath).astype(np.float32)
+
+            if resolution != (224, 224) or self.rebuttal:
+                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                    rgb_image, depthmap, intrinsics_, resolution, rng=rng, info=impath
+                )
+            else:
+                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                    rgb_image, depthmap, intrinsics_, (512, 384), rng=rng, info=impath
+                )
+                W, H = rgb_image.size
+                cx = W // 2
+                cy = H // 2
+                l, t = cx - 112, cy - 112
+                r, b = cx + 112, cy + 112
+                crop_bbox = (l, t, r, b)
+                rgb_image, depthmap, intrinsics = cropping.crop_image_depthmap(
+                    rgb_image, depthmap, intrinsics, crop_bbox
+                )
+
+            views.append(
+                dict(
+                    img=rgb_image,
+                    depthmap=depthmap,
+                    camera_pose=camera_pose,
+                    camera_intrinsics=intrinsics,
+                    dataset="7scenes",
+                    label=osp.join(scene_id, im_idx),
+                    instance=impath,
+                )
+            )
+        return views
+
+
+class DTU(BaseStereoViewDataset):
+    def __init__(
+        self,
+        num_seq=49,
+        num_frames=5,
+        min_thresh=10,
+        max_thresh=30,
+        test_id=None,
+        full_video=False,
+        sample_pairs=False,
+        kf_every=1,
+        *args,
+        ROOT,
+        **kwargs,
+    ):
+        self.ROOT = ROOT
+        super().__init__(*args, **kwargs)
+
+        self.num_seq = num_seq
+        self.num_frames = num_frames
+        self.max_thresh = max_thresh
+        self.min_thresh = min_thresh
+        self.test_id = test_id
+        self.full_video = full_video
+        self.kf_every = kf_every
+        self.sample_pairs = sample_pairs
+
+        # load all scenes
+        self.load_all_scenes(ROOT)
+
+    def __len__(self):
+        return len(self.scene_list) * self.num_seq
+
+    def load_all_scenes(self, base_dir):
+
+        if self.test_id is None:
+            self.scene_list = os.listdir(osp.join(base_dir))
+            print(f"Found {len(self.scene_list)} scenes in split {self.split}")
+
+        else:
+            if isinstance(self.test_id, list):
+                self.scene_list = self.test_id
+            else:
+                self.scene_list = [self.test_id]
+
+            print(f"Test_id: {self.test_id}")
+
+    def load_cam_mvsnet(self, file, interval_scale=1):
+        """read camera txt file"""
+        cam = np.zeros((2, 4, 4))
+        words = file.read().split()
+        # read extrinsic
+        for i in range(0, 4):
+            for j in range(0, 4):
+                extrinsic_index = 4 * i + j + 1
+                cam[0][i][j] = words[extrinsic_index]
+
+        # read intrinsic
+        for i in range(0, 3):
+            for j in range(0, 3):
+                intrinsic_index = 3 * i + j + 18
+                cam[1][i][j] = words[intrinsic_index]
+
+        if len(words) == 29:
+            cam[1][3][0] = words[27]
+            cam[1][3][1] = float(words[28]) * interval_scale
+            cam[1][3][2] = 192
+            cam[1][3][3] = cam[1][3][0] + cam[1][3][1] * cam[1][3][2]
+        elif len(words) == 30:
+            cam[1][3][0] = words[27]
+            cam[1][3][1] = float(words[28]) * interval_scale
+            cam[1][3][2] = words[29]
+            cam[1][3][3] = cam[1][3][0] + cam[1][3][1] * cam[1][3][2]
+        elif len(words) == 31:
+            cam[1][3][0] = words[27]
+            cam[1][3][1] = float(words[28]) * interval_scale
+            cam[1][3][2] = words[29]
+            cam[1][3][3] = words[30]
+        else:
+            cam[1][3][0] = 0
+            cam[1][3][1] = 0
+            cam[1][3][2] = 0
+            cam[1][3][3] = 0
+
+        extrinsic = cam[0].astype(np.float32)
+        intrinsic = cam[1].astype(np.float32)
+
+        return intrinsic, extrinsic
+
+    def _get_views(self, idx, resolution, rng):
+        """Load the selected views of one DTU scene; depth is multiplied by the eroded binary mask."""
+        scene_id = self.scene_list[idx // self.num_seq]
+        seq_id = idx % self.num_seq
+        print("Scene ID:", scene_id)
+
+        image_path = osp.join(self.ROOT, scene_id, "images")
+        depth_path = osp.join(self.ROOT, scene_id, "depths")
+        mask_path = osp.join(self.ROOT, scene_id, "binary_masks")
+        cam_path = osp.join(self.ROOT, scene_id, "cams")
+        pairs_path = osp.join(self.ROOT, scene_id, "pair.txt")
+        # NOTE(review): __init__ stores the boolean `sample_pairs` flag under this name, so the call below hits a bool - confirm
+        if not self.full_video:
+            img_idxs = self.sample_pairs(pairs_path, seq_id)
+        else:
+            img_idxs = sorted(os.listdir(image_path))
+            img_idxs = img_idxs[:: self.kf_every]
+
+        views = []
+        imgs_idxs = deque(img_idxs)
+
+        while len(imgs_idxs) > 0:
+            im_idx = imgs_idxs.pop()
+            impath = osp.join(image_path, im_idx)
+            depthpath = osp.join(depth_path, im_idx.replace(".jpg", ".npy"))
+            campath = osp.join(cam_path, im_idx.replace(".jpg", "_cam.txt"))
+            maskpath = osp.join(mask_path, im_idx.replace(".jpg", ".png"))
+
+            rgb_image = imread_cv2(impath)
+            depthmap = np.load(depthpath)
+            depthmap = np.nan_to_num(depthmap.astype(np.float32), 0.0)
+
+            mask = imread_cv2(maskpath, cv2.IMREAD_UNCHANGED) / 255.0
+            mask = mask.astype(np.float32)
+
+            mask[mask > 0.5] = 1.0
+            mask[mask < 0.5] = 0.0
+
+            mask = cv2.resize(
+                mask,
+                (depthmap.shape[1], depthmap.shape[0]),
+                interpolation=cv2.INTER_NEAREST,
+            )
+            kernel = np.ones((10, 10), np.uint8)  # Define the erosion kernel
+            mask = cv2.erode(mask, kernel, iterations=1)
+            depthmap = depthmap * mask
+            with open(campath, "r") as cam_file:  # close the camera file once parsed
+                cur_intrinsics, camera_pose = self.load_cam_mvsnet(cam_file)
+            intrinsics = cur_intrinsics[:3, :3]
+            camera_pose = np.linalg.inv(camera_pose)
+
+            if resolution != (224, 224):
+                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                    rgb_image, depthmap, intrinsics, resolution, rng=rng, info=impath
+                )
+            else:
+                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                    rgb_image, depthmap, intrinsics, (512, 384), rng=rng, info=impath
+                )
+                W, H = rgb_image.size
+                cx = W // 2
+                cy = H // 2
+                l, t = cx - 112, cy - 112
+                r, b = cx + 112, cy + 112
+                crop_bbox = (l, t, r, b)
+                rgb_image, depthmap, intrinsics = cropping.crop_image_depthmap(
+                    rgb_image, depthmap, intrinsics, crop_bbox
+                )
+
+            views.append(
+                dict(
+                    img=rgb_image,
+                    depthmap=depthmap,
+                    camera_pose=camera_pose,
+                    camera_intrinsics=intrinsics,
+                    dataset="dtu",
+                    label=osp.join(scene_id, im_idx),
+                    instance=impath,
+                )
+            )
+
+        return views
+
+
+class NRGBD(BaseStereoViewDataset):
+    def __init__(
+        self,
+        num_seq=1,
+        num_frames=5,
+        min_thresh=10,
+        max_thresh=100,
+        test_id=None,
+        full_video=False,
+        tuple_list=None,
+        seq_id=None,
+        rebuttal=False,
+        shuffle_seed=-1,
+        kf_every=1,
+        *args,
+        ROOT,
+        **kwargs,
+    ):
+
+        self.ROOT = ROOT
+        super().__init__(*args, **kwargs)
+        self.num_seq = num_seq
+        self.num_frames = num_frames
+        self.max_thresh = max_thresh
+        self.min_thresh = min_thresh
+        self.test_id = test_id
+        self.full_video = full_video
+        self.kf_every = kf_every
+        self.seq_id = seq_id
+        self.rebuttal = rebuttal
+        self.shuffle_seed = shuffle_seed
+
+        # load all scenes
+        self.load_all_tuples(tuple_list)
+        self.load_all_scenes(ROOT)
+
+    def __len__(self):
+        if self.tuple_list is not None:
+            return len(self.tuple_list)
+        return len(self.scene_list) * self.num_seq
+
+    def load_all_tuples(self, tuple_list):
+        if tuple_list is not None:
+            self.tuple_list = tuple_list
+            # with open(tuple_path) as f:
+            #     self.tuple_list = f.read().splitlines()
+
+        else:
+            self.tuple_list = None
+
+    def load_all_scenes(self, base_dir):
+
+        scenes = [
+            d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))
+        ]
+
+        if self.test_id is not None:
+            self.scene_list = [self.test_id]
+
+        else:
+            self.scene_list = scenes
+
+        print(f"Found {len(self.scene_list)} sequences in split {self.split}")
+
+    def load_poses(self, path):
+        """Parse 4x4 poses stored as consecutive 4-line groups; groups starting with "nan" become identity."""
+        with open(path, "r") as file:
+            lines = [ln for ln in file if ln.strip()]  # ignore blank lines (e.g. at end of file)
+        poses = []
+        valid = []
+        lines_per_matrix = 4
+        for i in range(0, len(lines), lines_per_matrix):
+            if "nan" in lines[i]:
+                valid.append(False)
+                poses.append(np.eye(4, 4, dtype=np.float32).tolist())
+            else:
+                valid.append(True)
+                pose_floats = [
+                    [float(x) for x in line.split()]
+                    for line in lines[i : i + lines_per_matrix]
+                ]
+                poses.append(pose_floats)
+
+        return np.array(poses, dtype=np.float32), valid
+
+    def _get_views(self, idx, resolution, rng):
+        """Load the views (RGB, depth, pose, intrinsics) of one NRGBD scene as a list of dicts."""
+        if self.tuple_list is not None:
+            line = self.tuple_list[idx].split(" ")
+            scene_id = line[0]
+            img_idxs = line[1:]
+
+        else:
+            scene_id = self.scene_list[idx // self.num_seq]
+
+            num_files = len(os.listdir(os.path.join(self.ROOT, scene_id, "images")))
+            img_idxs = [f"{i}" for i in range(num_files)]
+            img_idxs = img_idxs[:: max(1, min(self.kf_every, len(img_idxs) // 2))]  # step must never be 0
+
+        fx, fy, cx, cy = 554.2562584220408, 554.2562584220408, 320, 240
+        intrinsics_ = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32)
+
+        posepath = osp.join(self.ROOT, scene_id, "poses.txt")
+        camera_poses, valids = self.load_poses(posepath)
+
+        imgs_idxs = deque(img_idxs)
+        if self.shuffle_seed >= 0:
+            imgs_idxs = shuffle_deque(imgs_idxs, self.shuffle_seed)  # seeded: reproducible order
+        views = []
+
+        while len(imgs_idxs) > 0:
+            im_idx = imgs_idxs.popleft()
+
+            impath = osp.join(self.ROOT, scene_id, "images", f"img{im_idx}.png")
+            depthpath = osp.join(self.ROOT, scene_id, "depth", f"depth{im_idx}.png")
+
+            rgb_image = imread_cv2(impath)
+            depthmap = imread_cv2(depthpath, cv2.IMREAD_UNCHANGED)
+            depthmap = np.nan_to_num(depthmap.astype(np.float32), 0.0) / 1000.0
+            depthmap[depthmap > 10] = 0
+            depthmap[depthmap < 1e-3] = 0
+
+            rgb_image = cv2.resize(rgb_image, (depthmap.shape[1], depthmap.shape[0]))
+
+            camera_pose = camera_poses[int(im_idx)].copy()  # copy: the flip below must not touch camera_poses
+            # gl to cv
+            camera_pose[:, 1:3] *= -1.0
+            if resolution != (224, 224) or self.rebuttal:
+                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                    rgb_image, depthmap, intrinsics_, resolution, rng=rng, info=impath
+                )
+            else:
+                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                    rgb_image, depthmap, intrinsics_, (512, 384), rng=rng, info=impath
+                )
+                W, H = rgb_image.size
+                cx = W // 2
+                cy = H // 2
+                l, t = cx - 112, cy - 112
+                r, b = cx + 112, cy + 112
+                crop_bbox = (l, t, r, b)
+                rgb_image, depthmap, intrinsics = cropping.crop_image_depthmap(
+                    rgb_image, depthmap, intrinsics, crop_bbox
+                )
+
+            views.append(
+                dict(
+                    img=rgb_image,
+                    depthmap=depthmap,
+                    camera_pose=camera_pose,
+                    camera_intrinsics=intrinsics,
+                    dataset="nrgbd",
+                    label=osp.join(scene_id, im_idx),
+                    instance=impath,
+                )
+            )
+
+        return views
diff --git a/extern/CUT3R/eval/mv_recon/dataset_utils/__init__.py b/extern/CUT3R/eval/mv_recon/dataset_utils/__init__.py
new file mode 100755
index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc
--- /dev/null
+++ b/extern/CUT3R/eval/mv_recon/dataset_utils/__init__.py
@@ -0,0 +1 @@
+
diff --git a/extern/CUT3R/eval/mv_recon/dataset_utils/corr.py b/extern/CUT3R/eval/mv_recon/dataset_utils/corr.py
new file mode 100755
index 0000000000000000000000000000000000000000..d39d8fad844c65f0f839de6b728f2ab72b19f6a2
--- /dev/null
+++ b/extern/CUT3R/eval/mv_recon/dataset_utils/corr.py
@@ -0,0 +1,122 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+
+import numpy as np
+from dust3r.utils.device import to_numpy
+from dust3r.utils.geometry import inv, geotrf
+
+
+def reproject_view(pts3d, view2):
+    shape = view2["pts3d"].shape[:2]
+    return reproject(
+        pts3d, view2["camera_intrinsics"], inv(view2["camera_pose"]), shape
+    )
+
+
+def reproject(pts3d, K, world2cam, shape):
+    H, W, THREE = pts3d.shape
+    assert THREE == 3
+
+    with np.errstate(divide="ignore", invalid="ignore"):
+        pos = geotrf(K @ world2cam[:3], pts3d, norm=1, ncol=2)
+
+    return (H, W), ravel_xy(pos, shape)
+
+
+def ravel_xy(pos, shape):
+    H, W = shape
+    with np.errstate(invalid="ignore"):
+        qx, qy = pos.reshape(-1, 2).round().astype(np.int32).T
+    quantized_pos = qx.clip(min=0, max=W - 1, out=qx) + W * qy.clip(
+        min=0, max=H - 1, out=qy
+    )
+    return quantized_pos
+
+
+def unravel_xy(pos, shape):
+    """Turn flat indices into an (N, 2) integer array of (x, y) coordinates for an (H, W) ``shape``."""
+    return np.stack(np.unravel_index(pos, shape)[::-1], axis=-1)
+
+
+def reciprocal_1d(corres_1_to_2, corres_2_to_1, ret_recip=False):
+    """Return indices (pos1, pos2) of the matches whose 1->2->1 round trip lands back on itself."""
+    round_trip = corres_2_to_1[corres_1_to_2]
+    mutual = round_trip == np.arange(len(corres_1_to_2))
+    pos1 = np.flatnonzero(mutual)
+    matches = (pos1, corres_1_to_2[pos1])
+    return (mutual, *matches) if ret_recip else matches
+
+
+def extract_correspondences_from_pts3d(
+    view1, view2, target_n_corres, rng=np.random, ret_xy=True, nneg=0
+):
+    view1, view2 = to_numpy((view1, view2))
+
+    shape1, corres1_to_2 = reproject_view(view1["pts3d"], view2)
+    shape2, corres2_to_1 = reproject_view(view2["pts3d"], view1)
+
+    is_reciprocal1, pos1, pos2 = reciprocal_1d(
+        corres1_to_2, corres2_to_1, ret_recip=True
+    )
+    is_reciprocal2 = corres1_to_2[corres2_to_1] == np.arange(len(corres2_to_1))
+
+    if target_n_corres is None:
+        if ret_xy:
+            pos1 = unravel_xy(pos1, shape1)
+            pos2 = unravel_xy(pos2, shape2)
+        return pos1, pos2
+
+    available_negatives = min((~is_reciprocal1).sum(), (~is_reciprocal2).sum())
+    target_n_positives = int(target_n_corres * (1 - nneg))
+    n_positives = min(len(pos1), target_n_positives)
+    n_negatives = min(target_n_corres - n_positives, available_negatives)
+
+    if n_negatives + n_positives != target_n_corres:
+
+        n_positives = target_n_corres - n_negatives
+        assert n_positives <= len(pos1)
+
+    assert n_positives <= len(pos1)
+    assert n_positives <= len(pos2)
+    assert n_negatives <= (~is_reciprocal1).sum()
+    assert n_negatives <= (~is_reciprocal2).sum()
+    assert n_positives + n_negatives == target_n_corres
+
+    valid = np.ones(n_positives, dtype=bool)
+    if n_positives < len(pos1):
+
+        perm = rng.permutation(len(pos1))[:n_positives]
+        pos1 = pos1[perm]
+        pos2 = pos2[perm]
+
+    if n_negatives > 0:
+
+        def norm(p):
+            return p / p.sum()
+
+        pos1 = np.r_[
+            pos1,
+            rng.choice(
+                shape1[0] * shape1[1],
+                size=n_negatives,
+                replace=False,
+                p=norm(~is_reciprocal1),
+            ),
+        ]
+        pos2 = np.r_[
+            pos2,
+            rng.choice(
+                shape2[0] * shape2[1],
+                size=n_negatives,
+                replace=False,
+                p=norm(~is_reciprocal2),
+            ),
+        ]
+        valid = np.r_[valid, np.zeros(n_negatives, dtype=bool)]
+
+    if ret_xy:
+        pos1 = unravel_xy(pos1, shape1)
+        pos2 = unravel_xy(pos2, shape2)
+    return pos1, pos2, valid
diff --git a/extern/CUT3R/eval/mv_recon/dataset_utils/cropping.py b/extern/CUT3R/eval/mv_recon/dataset_utils/cropping.py
new file mode 100755
index 0000000000000000000000000000000000000000..db1356c2e689337348884674cfc20a1c60902b57
--- /dev/null
+++ b/extern/CUT3R/eval/mv_recon/dataset_utils/cropping.py
@@ -0,0 +1,142 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+
+import PIL.Image
+import os
+
+os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
+import cv2  # noqa
+import numpy as np  # noqa
+from dust3r.utils.geometry import (
+    colmap_to_opencv_intrinsics,
+    opencv_to_colmap_intrinsics,
+)  # noqa
+
+try:
+    lanczos = PIL.Image.Resampling.LANCZOS
+    bicubic = PIL.Image.Resampling.BICUBIC
+except AttributeError:
+    lanczos = PIL.Image.LANCZOS
+    bicubic = PIL.Image.BICUBIC
+
+
+class ImageList:
+    """Convenience class to aply the same operation to a whole set of images."""
+
+    def __init__(self, images):
+        if not isinstance(images, (tuple, list, set)):
+            images = [images]
+        self.images = []
+        for image in images:
+            if not isinstance(image, PIL.Image.Image):
+                image = PIL.Image.fromarray(image)
+            self.images.append(image)
+
+    def __len__(self):
+        return len(self.images)
+
+    def to_pil(self):
+        return tuple(self.images) if len(self.images) > 1 else self.images[0]
+
+    @property
+    def size(self):
+        sizes = [im.size for im in self.images]
+        assert all(sizes[0] == s for s in sizes)
+        return sizes[0]
+
+    def resize(self, *args, **kwargs):
+        return ImageList(self._dispatch("resize", *args, **kwargs))
+
+    def crop(self, *args, **kwargs):
+        return ImageList(self._dispatch("crop", *args, **kwargs))
+
+    def _dispatch(self, func, *args, **kwargs):
+        return [getattr(im, func)(*args, **kwargs) for im in self.images]
+
+
+def rescale_image_depthmap(
+    image, depthmap, camera_intrinsics, output_resolution, force=True
+):
+    """Jointly rescale a (image, depthmap)
+    so that (out_width, out_height) >= output_res; returns (image, depthmap, intrinsics)
+    """
+    image = ImageList(image)
+    input_resolution = np.array(image.size)  # (W,H)
+    output_resolution = np.array(output_resolution)
+    if depthmap is not None:
+        # depthmap.shape[:2] is compared with the reversed PIL size
+        assert tuple(depthmap.shape[:2]) == image.size[::-1]
+
+    assert output_resolution.shape == (2,)
+    scale_final = max(output_resolution / image.size) + 1e-8
+    if scale_final >= 1 and not force:  # image is already smaller than what is asked
+        return (image.to_pil(), depthmap, camera_intrinsics)
+    output_resolution = np.floor(input_resolution * scale_final).astype(int)
+    out_size = tuple(int(v) for v in output_resolution)  # PIL and cv2 document a 2-tuple size
+    image = image.resize(
+        out_size, resample=lanczos if scale_final < 1 else bicubic
+    )
+    if depthmap is not None:
+        depthmap = cv2.resize(
+            depthmap,
+            out_size,
+            fx=scale_final,
+            fy=scale_final,
+            interpolation=cv2.INTER_NEAREST,
+        )
+
+    camera_intrinsics = camera_matrix_of_crop(
+        camera_intrinsics, input_resolution, output_resolution, scaling=scale_final
+    )
+
+    return image.to_pil(), depthmap, camera_intrinsics
+
+
+def camera_matrix_of_crop(
+    input_camera_matrix,
+    input_resolution,
+    output_resolution,
+    scaling=1,
+    offset_factor=0.5,
+    offset=None,
+):
+
+    margins = np.asarray(input_resolution) * scaling - output_resolution
+    assert np.all(margins >= 0.0)
+    if offset is None:
+        offset = offset_factor * margins
+
+    output_camera_matrix_colmap = opencv_to_colmap_intrinsics(input_camera_matrix)
+    output_camera_matrix_colmap[:2, :] *= scaling
+    output_camera_matrix_colmap[:2, 2] -= offset
+    output_camera_matrix = colmap_to_opencv_intrinsics(output_camera_matrix_colmap)
+
+    return output_camera_matrix
+
+
+def crop_image_depthmap(image, depthmap, camera_intrinsics, crop_bbox):
+    """
+    Return a crop of the input view.
+    """
+    image = ImageList(image)
+    l, t, r, b = crop_bbox
+
+    image = image.crop((l, t, r, b))
+    depthmap = depthmap[t:b, l:r]
+
+    camera_intrinsics = camera_intrinsics.copy()
+    camera_intrinsics[0, 2] -= l
+    camera_intrinsics[1, 2] -= t
+
+    return image.to_pil(), depthmap, camera_intrinsics
+
+
+def bbox_from_intrinsics_in_out(
+    input_camera_matrix, output_camera_matrix, output_resolution
+):
+    out_width, out_height = output_resolution
+    l, t = np.int32(np.round(input_camera_matrix[:2, 2] - output_camera_matrix[:2, 2]))
+    crop_bbox = (l, t, l + out_width, t + out_height)
+    return crop_bbox
diff --git a/extern/CUT3R/eval/mv_recon/dataset_utils/transforms.py b/extern/CUT3R/eval/mv_recon/dataset_utils/transforms.py
new file mode 100755
index 0000000000000000000000000000000000000000..cf858808ac187ce88a9222a7b525b650394de282
--- /dev/null
+++ b/extern/CUT3R/eval/mv_recon/dataset_utils/transforms.py
@@ -0,0 +1,77 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+
+import torchvision.transforms as tvf
+from dust3r.utils.image import ImgNorm
+
+
+ColorJitter = tvf.Compose([tvf.ColorJitter(0.5, 0.5, 0.5, 0.1), ImgNorm])
+
+
+def _check_input(value, center=1, bound=(0, float("inf")), clip_first_on_zero=True):
+    """Normalize a jitter amount into a ``(min, max)`` tuple, or return None when it is a no-op."""
+    if isinstance(value, (int, float)):
+        if value < 0:
+            raise ValueError("If value is a single number, it must be non negative.")
+        value = [center - float(value), center + float(value)]
+        if clip_first_on_zero:
+            value[0] = max(value[0], 0.0)
+    elif isinstance(value, (tuple, list)) and len(value) == 2:
+        value = [float(value[0]), float(value[1])]
+    else:
+        raise TypeError("value should be a single number or a list/tuple with length 2.")
+    # the range must be ordered and lie inside ``bound``
+    if not bound[0] <= value[0] <= value[1] <= bound[1]:
+        raise ValueError(f"values should be between {bound}, but got {value}.")
+    # a degenerate range sitting exactly on ``center`` changes nothing
+    if value[0] == value[1] == center:
+        return None
+    return tuple(value)
+
+
+import torch
+import torchvision.transforms.functional as F
+
+
+def SeqColorJitter():
+    """
+    Return a color jitter transform with same random parameters
+    """
+    brightness = _check_input(0.5)
+    contrast = _check_input(0.5)
+    saturation = _check_input(0.5)
+    hue = _check_input(0.1, center=0, bound=(-0.5, 0.5), clip_first_on_zero=False)
+
+    fn_idx = torch.randperm(4)
+    brightness_factor = (
+        None
+        if brightness is None
+        else float(torch.empty(1).uniform_(brightness[0], brightness[1]))
+    )
+    contrast_factor = (
+        None
+        if contrast is None
+        else float(torch.empty(1).uniform_(contrast[0], contrast[1]))
+    )
+    saturation_factor = (
+        None
+        if saturation is None
+        else float(torch.empty(1).uniform_(saturation[0], saturation[1]))
+    )
+    hue_factor = None if hue is None else float(torch.empty(1).uniform_(hue[0], hue[1]))
+
+    def _color_jitter(img):
+        for fn_id in fn_idx:
+            if fn_id == 0 and brightness_factor is not None:
+                img = F.adjust_brightness(img, brightness_factor)
+            elif fn_id == 1 and contrast_factor is not None:
+                img = F.adjust_contrast(img, contrast_factor)
+            elif fn_id == 2 and saturation_factor is not None:
+                img = F.adjust_saturation(img, saturation_factor)
+            elif fn_id == 3 and hue_factor is not None:
+                img = F.adjust_hue(img, hue_factor)
+        return ImgNorm(img)
+
+    return _color_jitter
diff --git a/extern/CUT3R/eval/mv_recon/launch.py b/extern/CUT3R/eval/mv_recon/launch.py
new file mode 100644
index 0000000000000000000000000000000000000000..544e77dbcb512d57afd705482c443d4fca930ce9
--- /dev/null
+++ b/extern/CUT3R/eval/mv_recon/launch.py
@@ -0,0 +1,396 @@
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
+import time
+import torch
+import argparse
+import numpy as np
+import open3d as o3d
+import os.path as osp
+from torch.utils.data import DataLoader
+from add_ckpt_path import add_path_to_dust3r
+from accelerate import Accelerator
+from torch.utils.data._utils.collate import default_collate
+import tempfile
+from tqdm import tqdm
+
+
+def get_args_parser():
+    """Build the CLI parser for the multi-view 3D reconstruction evaluation.
+
+    The parser is created with ``add_help=False`` and is returned unparsed.
+    """
+    # (flag, add_argument keyword arguments), registered in this order.
+    option_table = (
+        ("--weights", {"type": str, "default": "", "help": "ckpt name"}),
+        ("--device", {"type": str, "default": "cuda:0", "help": "device"}),
+        ("--model_name", {"type": str, "default": ""}),
+        (
+            "--conf_thresh",
+            {"type": float, "default": 0.0, "help": "confidence threshold"},
+        ),
+        ("--output_dir", {"type": str, "default": "", "help": "value for outdir"}),
+        ("--size", {"type": int, "default": 512}),
+        ("--revisit", {"type": int, "default": 1, "help": "revisit times"}),
+        ("--freeze", {"action": "store_true"}),
+    )
+    cli = argparse.ArgumentParser("3D Reconstruction evaluation", add_help=False)
+    for flag, options in option_table:
+        cli.add_argument(flag, **options)
+    return cli
+
+
+def main(args):
+    """Evaluate multi-view 3D reconstruction on the 7scenes and NRGBD test splits.
+
+    Each sequence is run through the model, its points are transformed with the
+    first view's camera pose and saved (.npy/.ply) under ``args.output_dir/<dataset>``,
+    then ICP-refined and scored (accuracy, completion, normal agreement).
+    The main process merges the per-process logs into ``logs_all.txt``.
+    """
+    add_path_to_dust3r(args.weights)
+    from eval.mv_recon.data import SevenScenes, NRGBD
+    from eval.mv_recon.utils import accuracy, completion
+
+    if args.size == 512:
+        resolution = (512, 384)
+    elif args.size == 224:
+        resolution = 224
+    else:
+        raise NotImplementedError
+    datasets_all = {
+        "7scenes": SevenScenes(
+            split="test",
+            ROOT="./data/7scenes",
+            resolution=resolution,
+            num_seq=1,
+            full_video=True,
+            kf_every=200,
+        ),  # 20),
+        "NRGBD": NRGBD(
+            split="test",
+            ROOT="./data/neural_rgbd",
+            resolution=resolution,
+            num_seq=1,
+            full_video=True,
+            kf_every=500,
+        ),
+    }
+
+    accelerator = Accelerator()
+    device = accelerator.device
+    model_name = args.model_name
+    if model_name == "ours" or model_name == "cut3r":
+        from dust3r.model import ARCroco3DStereo
+        from eval.mv_recon.criterion import Regr3D_t_ScaleShiftInv, L21
+        from dust3r.utils.geometry import geotrf
+        from copy import deepcopy
+
+        model = ARCroco3DStereo.from_pretrained(args.weights).to(device)
+        model.eval()
+    else:
+        raise NotImplementedError
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    criterion = Regr3D_t_ScaleShiftInv(L21, norm_mode=False, gt_scale=True)
+
+    # Entries of a view that are not moved to the device (built once, outside the loops).
+    ignore_keys = {
+        "depthmap", "dataset", "label", "instance", "idx", "true_shape", "rng"
+    }
+    with torch.no_grad():
+        for name_data, dataset in datasets_all.items():
+            save_path = osp.join(args.output_dir, name_data)
+            os.makedirs(save_path, exist_ok=True)
+            log_file = osp.join(save_path, f"logs_{accelerator.process_index}.txt")
+
+            acc_all = 0
+            acc_all_med = 0
+            comp_all = 0
+            comp_all_med = 0
+            nc1_all = 0
+            nc1_all_med = 0
+            nc2_all = 0
+            nc2_all_med = 0
+
+            fps_all = []
+            time_all = []
+
+            with accelerator.split_between_processes(list(range(len(dataset)))) as idxs:
+                for data_idx in tqdm(idxs):
+                    batch = default_collate([dataset[data_idx]])
+                    for view in batch:
+                        for name in view.keys():  # pseudo_focal
+                            if name in ignore_keys:
+                                continue
+                            if isinstance(view[name], tuple) or isinstance(
+                                view[name], list
+                            ):
+                                view[name] = [
+                                    x.to(device, non_blocking=True) for x in view[name]
+                                ]
+                            else:
+                                view[name] = view[name].to(device, non_blocking=True)
+
+                    if model_name == "ours" or model_name == "cut3r":
+                        revisit = args.revisit
+                        update = not args.freeze
+                        if revisit > 1:
+                            # repeat input for 'revisit' times
+                            new_views = []
+                            for r in range(revisit):
+                                for i in range(len(batch)):
+                                    new_view = deepcopy(batch[i])
+                                    new_view["idx"] = [
+                                        (r * len(batch) + i)
+                                        for _ in range(len(batch[i]["idx"]))
+                                    ]
+                                    new_view["instance"] = [
+                                        str(r * len(batch) + i)
+                                        for _ in range(len(batch[i]["instance"]))
+                                    ]
+                                    if r > 0:
+                                        if not update:
+                                            new_view["update"] = torch.zeros_like(
+                                                batch[i]["update"]
+                                            ).bool()
+                                    new_views.append(new_view)
+                            batch = new_views
+                        with torch.cuda.amp.autocast(enabled=False):
+                            start = time.time()
+                            output = model(batch)
+                            end = time.time()
+                            preds, batch = output.ress, output.views
+                        valid_length = len(preds) // revisit
+                        preds = preds[-valid_length:]
+                        batch = batch[-valid_length:]
+                        fps = len(batch) / (end - start)
+                        print(
+                            f"Finished reconstruction for {name_data} {data_idx+1}/{len(dataset)}, FPS: {fps:.2f}"
+                        )
+                        # Throughput is collected per sequence (it is not reported further below).
+                        fps_all.append(fps)
+                        time_all.append(end - start)
+
+                        # Evaluation
+                        print(f"Evaluation for {name_data} {data_idx+1}/{len(dataset)}")
+                        gt_pts, pred_pts, gt_factor, pr_factor, masks, monitoring = (
+                            criterion.get_all_pts3d_t(batch, preds)
+                        )
+                        pred_scale, gt_scale, pred_shift_z, gt_shift_z = (
+                            monitoring["pred_scale"],
+                            monitoring["gt_scale"],
+                            monitoring["pred_shift_z"],
+                            monitoring["gt_shift_z"],
+                        )
+
+                        in_camera1 = None
+                        pts_all = []
+                        pts_gt_all = []
+                        images_all = []
+                        masks_all = []
+                        conf_all = []
+
+                        for j, view in enumerate(batch):
+                            if in_camera1 is None:
+                                in_camera1 = view["camera_pose"][0].cpu()
+
+                            image = view["img"].permute(0, 2, 3, 1).cpu().numpy()[0]
+                            mask = view["valid_mask"].cpu().numpy()[0]
+
+                            # Predicted points as returned by criterion.get_all_pts3d_t.
+                            pts = pred_pts[j].cpu().numpy()[0]
+                            conf = preds[j]["conf"].cpu().data.numpy()[0]
+                            # NOTE: conf is collected but not used for masking; args.conf_thresh is not applied here.
+
+                            pts_gt = gt_pts[j].detach().cpu().numpy()[0]
+
+                            H, W = image.shape[:2]
+                            cx = W // 2
+                            cy = H // 2
+                            l, t = cx - 112, cy - 112  # central 224x224 window
+                            r, b = cx + 112, cy + 112
+                            image = image[t:b, l:r]
+                            mask = mask[t:b, l:r]
+                            pts = pts[t:b, l:r]
+                            pts_gt = pts_gt[t:b, l:r]
+
+                            #### Align predicted 3D points to the ground truth
+                            pts[..., -1] += gt_shift_z.cpu().numpy().item()
+                            pts = geotrf(in_camera1, pts)
+
+                            pts_gt[..., -1] += gt_shift_z.cpu().numpy().item()
+                            pts_gt = geotrf(in_camera1, pts_gt)
+
+                            images_all.append((image[None, ...] + 1.0) / 2.0)
+                            pts_all.append(pts[None, ...])
+                            pts_gt_all.append(pts_gt[None, ...])
+                            masks_all.append(mask[None, ...])
+                            conf_all.append(conf[None, ...])
+
+                    images_all = np.concatenate(images_all, axis=0)
+                    pts_all = np.concatenate(pts_all, axis=0)
+                    pts_gt_all = np.concatenate(pts_gt_all, axis=0)
+                    masks_all = np.concatenate(masks_all, axis=0)
+
+                    scene_id = view["label"][0].rsplit("/", 1)[0]
+
+                    save_params = {}
+
+                    save_params["images_all"] = images_all
+                    save_params["pts_all"] = pts_all
+                    save_params["pts_gt_all"] = pts_gt_all
+                    save_params["masks_all"] = masks_all
+
+                    np.save(
+                        os.path.join(save_path, f"{scene_id.replace('/', '_')}.npy"),
+                        save_params,
+                    )
+
+                    if "DTU" in name_data:
+                        threshold = 100
+                    else:
+                        threshold = 0.1
+
+                    pts_all_masked = pts_all[masks_all > 0]
+                    pts_gt_all_masked = pts_gt_all[masks_all > 0]
+                    images_all_masked = images_all[masks_all > 0]
+
+                    pcd = o3d.geometry.PointCloud()
+                    pcd.points = o3d.utility.Vector3dVector(
+                        pts_all_masked.reshape(-1, 3)
+                    )
+                    pcd.colors = o3d.utility.Vector3dVector(
+                        images_all_masked.reshape(-1, 3)
+                    )
+                    o3d.io.write_point_cloud(
+                        os.path.join(
+                            save_path, f"{scene_id.replace('/', '_')}-mask.ply"
+                        ),
+                        pcd,
+                    )
+
+                    pcd_gt = o3d.geometry.PointCloud()
+                    pcd_gt.points = o3d.utility.Vector3dVector(
+                        pts_gt_all_masked.reshape(-1, 3)
+                    )
+                    pcd_gt.colors = o3d.utility.Vector3dVector(
+                        images_all_masked.reshape(-1, 3)
+                    )
+                    o3d.io.write_point_cloud(
+                        os.path.join(save_path, f"{scene_id.replace('/', '_')}-gt.ply"),
+                        pcd_gt,
+                    )
+
+                    trans_init = np.eye(4)
+
+                    reg_p2p = o3d.pipelines.registration.registration_icp(
+                        pcd,
+                        pcd_gt,
+                        threshold,
+                        trans_init,
+                        o3d.pipelines.registration.TransformationEstimationPointToPoint(),
+                    )
+
+                    transformation = reg_p2p.transformation
+
+                    pcd = pcd.transform(transformation)
+                    pcd.estimate_normals()
+                    pcd_gt.estimate_normals()
+
+                    gt_normal = np.asarray(pcd_gt.normals)
+                    pred_normal = np.asarray(pcd.normals)
+
+                    acc, acc_med, nc1, nc1_med = accuracy(
+                        pcd_gt.points, pcd.points, gt_normal, pred_normal
+                    )
+                    comp, comp_med, nc2, nc2_med = completion(
+                        pcd_gt.points, pcd.points, gt_normal, pred_normal
+                    )
+                    result_line = (
+                        f"Idx: {scene_id}, Acc: {acc}, Comp: {comp}, NC1: {nc1}, NC2: {nc2} - Acc_med: {acc_med}, Compc_med: {comp_med}, NC1c_med: {nc1_med}, NC2c_med: {nc2_med}"
+                    )
+                    print(result_line)
+                    # Append through a context manager so the log handle is closed after each write.
+                    with open(log_file, "a") as log_f:
+                        print(result_line, file=log_f)
+
+                    acc_all += acc
+                    comp_all += comp
+                    nc1_all += nc1
+                    nc2_all += nc2
+
+                    acc_all_med += acc_med
+                    comp_all_med += comp_med
+                    nc1_all_med += nc1_med
+                    nc2_all_med += nc2_med
+
+                    # release cuda memory
+                    torch.cuda.empty_cache()
+
+            accelerator.wait_for_everyone()
+            # Merge the per-process logs and append the mean of every metric.
+            if accelerator.is_main_process:
+                to_write = ""
+                # Read the log of every launched process; one that handled no sequence wrote none.
+                for i in range(accelerator.num_processes):
+                    if not os.path.exists(osp.join(save_path, f"logs_{i}.txt")):
+                        continue
+                    with open(osp.join(save_path, f"logs_{i}.txt"), "r") as f_sub:
+                        to_write += f_sub.read()
+
+                with open(osp.join(save_path, "logs_all.txt"), "w") as f:
+                    log_data = to_write
+                    metrics = defaultdict(list)
+                    for line in log_data.strip().split("\n"):
+                        match = regex.match(line)
+                        if match:
+                            data = match.groupdict()
+                            # Exclude 'scene_id' from metrics as it's an identifier
+                            for key, value in data.items():
+                                if key != "scene_id":
+                                    metrics[key].append(float(value))
+                            metrics["nc"].append(
+                                (float(data["nc1"]) + float(data["nc2"])) / 2
+                            )
+                            metrics["nc_med"].append(
+                                (float(data["nc1_med"]) + float(data["nc2_med"])) / 2
+                            )
+                    mean_metrics = {
+                        metric: sum(values) / len(values)
+                        for metric, values in metrics.items()
+                    }
+
+                    c_name = "mean"
+                    print_str = f"{c_name.ljust(20)}: "
+                    for m_name in mean_metrics:
+                        print_num = np.mean(mean_metrics[m_name])
+                        print_str = print_str + f"{m_name}: {print_num:.3f} | "
+                    print_str = print_str + "\n"
+                    f.write(to_write + print_str)
+
+
+from collections import defaultdict
+import re
+# One named group per field of the "Idx: ..., Acc: ..." line that main() writes to the per-process logs.
+pattern = r"""
+    Idx:\s*(?P<scene_id>[^,]+),\s*
+    Acc:\s*(?P<acc>[^,]+),\s*
+    Comp:\s*(?P<comp>[^,]+),\s*
+    NC1:\s*(?P<nc1>[^,]+),\s*
+    NC2:\s*(?P<nc2>[^,]+)\s*-\s*
+    Acc_med:\s*(?P<acc_med>[^,]+),\s*
+    Compc_med:\s*(?P<comp_med>[^,]+),\s*
+    NC1c_med:\s*(?P<nc1_med>[^,]+),\s*
+    NC2c_med:\s*(?P<nc2_med>[^,]+)
+"""
+# re.VERBOSE lets the pattern span several lines; its literal whitespace is ignored when matching.
+regex = re.compile(pattern, re.VERBOSE)
+
+
+if __name__ == "__main__":
+    parser = get_args_parser()
+    args = parser.parse_args()
+
+    main(args)
diff --git a/extern/CUT3R/eval/mv_recon/run.sh b/extern/CUT3R/eval/mv_recon/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..77b9d3fedfeec1592edbf6f24ad5b8dee569167f
--- /dev/null
+++ b/extern/CUT3R/eval/mv_recon/run.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# Launch the multi-view reconstruction evaluation (eval/mv_recon/launch.py) through accelerate.
+set -e
+
+workdir='.'
+model_name='ours'
+ckpt_name='cut3r_512_dpt_4_64'
+model_weights="${workdir}/src/${ckpt_name}.pth"
+# Results are written below eval_results/mv_recon/<model_name>_<ckpt_name>.
+output_dir="${workdir}/eval_results/mv_recon/${model_name}_${ckpt_name}"
+echo "$output_dir"
+accelerate launch --num_processes 8 --main_process_port 29501 eval/mv_recon/launch.py \
+    --weights "$model_weights" \
+    --output_dir "$output_dir" \
+    --model_name "$model_name"
\ No newline at end of file
diff --git a/extern/CUT3R/eval/mv_recon/utils.py b/extern/CUT3R/eval/mv_recon/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..546f8535e9d2f9ea25efec4d12b7b655bf940568
--- /dev/null
+++ b/extern/CUT3R/eval/mv_recon/utils.py
@@ -0,0 +1,59 @@
+import numpy as np
+from scipy.spatial import cKDTree as KDTree
+import torch
+
+
+def completion_ratio(gt_points, rec_points, dist_th=0.05):
+    # Fraction of GT points whose nearest reconstructed point is closer than dist_th.
+    nearest_dist, _ = KDTree(rec_points).query(gt_points)
+    within = (nearest_dist < dist_th).astype(np.float32)
+    return np.mean(within)
+
+
+def accuracy(gt_points, rec_points, gt_normals=None, rec_normals=None):
+    """Distance from each reconstructed point to its nearest GT point.
+
+    Returns (mean, median); when both normal arrays are given, also the mean
+    and median of |gt_normal . rec_normal| over the matched pairs.
+    """
+    nn_dist, nn_idx = KDTree(gt_points).query(rec_points, workers=-1)
+    dist_stats = (np.mean(nn_dist), np.median(nn_dist))
+    if gt_normals is None or rec_normals is None:
+        return dist_stats
+
+    # abs() makes the score independent of the sign of either normal.
+    agreement = np.abs(np.sum(gt_normals[nn_idx] * rec_normals, axis=-1))
+    return (*dist_stats, np.mean(agreement), np.median(agreement))
+
+
+def completion(gt_points, rec_points, gt_normals=None, rec_normals=None):
+    """Distance from each GT point to its nearest reconstructed point.
+
+    Returns (mean, median); when both normal arrays are given, also the mean
+    and median of |gt_normal . rec_normal| over the matched pairs.
+    """
+    nn_dist, nn_idx = KDTree(rec_points).query(gt_points, workers=-1)
+    dist_stats = (np.mean(nn_dist), np.median(nn_dist))
+    if gt_normals is None or rec_normals is None:
+        return dist_stats
+    # abs() makes the score independent of the sign of either normal.
+    agreement = np.abs(np.sum(gt_normals * rec_normals[nn_idx], axis=-1))
+    return (*dist_stats, np.mean(agreement), np.median(agreement))
+
+
+def compute_iou(pred_vox, target_vox):
+    """Intersection-over-union of the occupied cells of two voxel grids.
+
+    Both arguments must expose ``get_voxels()`` yielding objects with a
+    ``grid_index``. Raises ZeroDivisionError when both grids are empty.
+    """
+
+    def occupied(grid):
+        # Tuples are hashable, so the cell indices can be compared as sets.
+        return {tuple(np.round(v.grid_index, 4)) for v in grid.get_voxels()}
+
+    pred_cells = occupied(pred_vox)
+    target_cells = occupied(target_vox)
+    shared = pred_cells.intersection(target_cells)
+    combined = pred_cells.union(target_cells)
+    return len(shared) / len(combined)
diff --git a/extern/CUT3R/eval/relpose/evo_utils.py b/extern/CUT3R/eval/relpose/evo_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..513c6d05b6ed6fe47289297aa3fb668097d4a193
--- /dev/null
+++ b/extern/CUT3R/eval/relpose/evo_utils.py
@@ -0,0 +1,427 @@
+import os
+import re
+from copy import deepcopy
+from pathlib import Path
+
+import evo.main_ape as main_ape
+import evo.main_rpe as main_rpe
+import matplotlib.pyplot as plt
+import numpy as np
+from evo.core import sync
+from evo.core.metrics import PoseRelation, Unit
+from evo.core.trajectory import PosePath3D, PoseTrajectory3D
+from evo.tools import file_interface, plot
+from scipy.spatial.transform import Rotation
+from evo.core import metrics
+
+
+def sintel_cam_read(filename):
+    """Read a Sintel ``.cam`` file and return the ``(M, N)`` tuple.
+
+    M is the 3x3 intrinsic matrix, N is the 3x4 extrinsic matrix, so that
+    x = M*N*X, with x being a point in homogeneous image pixel coordinates
+    and X a point in homogeneous world coordinates.
+
+    Raises AssertionError if the leading float32 tag is not 202021.25.
+    """
+    TAG_FLOAT = 202021.25
+    # The context manager closes the handle even when the tag check fails.
+    with open(filename, "rb") as f:
+        check = np.fromfile(f, dtype=np.float32, count=1)[0]
+        assert (
+            check == TAG_FLOAT
+        ), " cam_read:: Wrong tag in flow file (should be: {0}, is: {1}). Big-endian machine? ".format(
+            TAG_FLOAT, check
+        )
+        M = np.fromfile(f, dtype="float64", count=9).reshape((3, 3))
+        N = np.fromfile(f, dtype="float64", count=12).reshape((3, 4))
+    return M, N
+
+
+def load_replica_traj(gt_file):
+    """Load a text file of flattened pose matrices as ``(poses, timestamps)``.
+
+    Every row must hold 12 or 16 numbers; the first 12 are read row-major as
+    the top three rows of a 4x4 pose. ``poses`` is (N, 7) with rows
+    (x, y, z, qw, qx, qy, qz); ``timestamps`` are the frame indices as floats.
+    """
+    flat = np.loadtxt(gt_file)
+    n_cols = flat.shape[1]
+    assert n_cols == 12 or n_cols == 16
+
+    bottom_row = [0, 0, 0, 1]
+    poses = []
+    for r in flat:
+        top_rows = [[r[0], r[1], r[2], r[3]], [r[4], r[5], r[6], r[7]], [r[8], r[9], r[10], r[11]]]
+        poses.append(np.array(top_rows + [bottom_row]))
+
+    timestamps_mat = np.arange(flat.shape[0]).astype(float)
+    # Route the matrices through PosePath3D first, then attach the timestamps.
+    traj = PoseTrajectory3D(
+        poses_se3=PosePath3D(poses_se3=poses).poses_se3, timestamps=timestamps_mat
+    )
+
+    # The quaternion is taken from orientations_quat_wxyz, i.e. kept scalar-first.
+    traj_tum = np.column_stack((traj.positions_xyz, traj.orientations_quat_wxyz))
+
+    return (traj_tum, timestamps_mat)
+
+
+def load_sintel_traj(gt_file):  # './data/sintel/training/camdata_left/alley_2'
+    """Load every ``.cam`` file in the folder ``gt_file`` as a trajectory (after ParticleSfM).
+
+    Returns ``(poses, timestamps)``: poses is (N, 7) with rows
+    (x, y, z, qw, qx, qy, qz) of the inverted extrinsics, with the mean
+    position subtracted; timestamps is (N, 1), taken from the number after
+    the last "_" in each file name.
+    """
+    cam_files = [
+        os.path.join(gt_file, fname)
+        for fname in sorted(os.listdir(gt_file))
+        if fname.endswith(".cam")
+    ]
+    tstamps = [float(p.split("/")[-1][:-4].split("_")[-1]) for p in cam_files]
+
+    rows = []
+    for cam_file in cam_files:
+        extrinsic = sintel_cam_read(cam_file)[1]
+        world2cam = np.concatenate([extrinsic, np.array([[0, 0, 0, 1]])], 0)
+        cam2world = np.linalg.inv(world2cam)
+        qx, qy, qz, qw = Rotation.from_matrix(cam2world[:3, :3]).as_quat()
+        # scipy returns scalar-last quaternions; store them scalar-first.
+        rows.append(np.concatenate([cam2world[:3, -1], np.array([qw, qx, qy, qz])], 0))
+
+    tum_gt_poses = np.stack(rows, 0)
+    # Center the positions on their mean.
+    tum_gt_poses[:, :3] -= np.mean(tum_gt_poses[:, :3], 0, keepdims=True)
+    tt = np.expand_dims(np.stack(tstamps, 0), -1)
+
+    return tum_gt_poses, tt
+
+
+def load_traj(gt_traj_file, traj_format="sintel", skip=0, stride=1, num_frames=None):
+    """Read a trajectory ("replica", "sintel", "tum" or "tartanair") and subsample it.
+    Returns:
+        traj_tum (N, 7): poses as (x,y,z,qw,qx,qy,qz); note the scalar-first quaternion
+        timestamps_mat: one timestamp per kept pose ((N, 1) for "sintel")
+    """
+    if traj_format == "replica":
+        traj_tum, timestamps_mat = load_replica_traj(gt_traj_file)
+    elif traj_format == "sintel":
+        traj_tum, timestamps_mat = load_sintel_traj(gt_traj_file)
+    elif traj_format in ["tum", "tartanair"]:
+        traj = file_interface.read_tum_trajectory_file(gt_traj_file)
+        xyz = traj.positions_xyz
+        quat = traj.orientations_quat_wxyz
+        timestamps_mat = traj.timestamps
+        traj_tum = np.column_stack((xyz, quat))
+    else:
+        raise NotImplementedError  # any other traj_format is rejected
+
+    traj_tum = traj_tum[skip::stride]  # drop the first `skip` poses, then keep every `stride`-th
+    timestamps_mat = timestamps_mat[skip::stride]
+    if num_frames is not None:
+        traj_tum = traj_tum[:num_frames]
+        timestamps_mat = timestamps_mat[:num_frames]
+    return traj_tum, timestamps_mat
+
+
+def update_timestamps(gt_file, traj_format, skip=0, stride=1):
+    """Read the timestamps file that sits next to ``gt_file`` and subsample it.
+
+    Returns ``timestamps[skip::stride]`` for "tum"/"tartanair", else None.
+    """
+    renames = {"tum": ("groundtruth.txt", "rgb.txt"), "tartanair": ("gt_pose.txt", "times.txt")}
+    if traj_format not in renames:
+        return None
+    times_file = gt_file.replace(*renames[traj_format])
+    return load_timestamps(times_file, traj_format)[skip::stride]
+
+
+def load_timestamps(time_file, traj_format="replica"):
+    """Return the first column of ``time_file`` as floats ("tum"/"tartanair" only, else None)."""
+    if traj_format not in ("tum", "tartanair"):
+        return None
+    # Read-only mode, so files on read-only dataset mounts can be opened.
+    with open(time_file, "r") as f:
+        rows = [ln for ln in f if ln.strip() and not ln.startswith("#")]
+    return [float(ln.split(" ")[0]) for ln in rows]
+
+
+def make_traj(args) -> PoseTrajectory3D:
+    """Coerce ``(poses, timestamps)`` or a PoseTrajectory3D into a fresh PoseTrajectory3D."""
+    if not isinstance(args, (tuple, list)):
+        # Already a trajectory: hand back a deep copy of it.
+        assert isinstance(args, PoseTrajectory3D), type(args)
+        return deepcopy(args)
+    poses, stamps = args
+    return PoseTrajectory3D(
+        positions_xyz=poses[:, :3], orientations_quat_wxyz=poses[:, 3:], timestamps=stamps
+    )
+
+
+def eval_metrics(pred_traj, gt_traj=None, seq="", filename="", sample_stride=1):
+    """Compute ATE and 1-frame-step RPE (translation, rotation) RMSE with evo.
+
+    ``pred_traj``/``gt_traj`` are ``(poses, timestamps)`` pairs or PoseTrajectory3D
+    objects. The evo result summaries are written to ``filename`` and
+    ``(ate, rpe_trans, rpe_rot)`` is returned.
+    """
+    if sample_stride > 1:
+        # Subsample into new lists so the caller's sequences are never modified
+        # (this also lets tuples, which make_traj accepts, be passed in).
+        pred_traj = [pred_traj[0][::sample_stride], pred_traj[1][::sample_stride]]
+        if gt_traj is not None:
+            gt_traj = [gt_traj[0][::sample_stride], gt_traj[1][::sample_stride]]
+
+    pred_traj = make_traj(pred_traj)
+
+    if gt_traj is not None:
+        gt_traj = make_traj(gt_traj)
+
+        if pred_traj.timestamps.shape[0] == gt_traj.timestamps.shape[0]:
+            pred_traj.timestamps = gt_traj.timestamps
+        else:
+            print(pred_traj.timestamps.shape[0], gt_traj.timestamps.shape[0])
+
+        gt_traj, pred_traj = sync.associate_trajectories(gt_traj, pred_traj)
+
+    # ATE
+    traj_ref = gt_traj
+    traj_est = pred_traj
+
+    ate_result = main_ape.ape(
+        traj_ref,
+        traj_est,
+        est_name="traj",
+        pose_relation=PoseRelation.translation_part,
+        align=True,
+        correct_scale=True,
+    )
+
+    ate = ate_result.stats["rmse"]
+
+    # RPE rotation and translation
+    delta_list = [1]
+    rpe_rots, rpe_transs = [], []
+    for delta in delta_list:
+        rpe_rots_result = main_rpe.rpe(
+            traj_ref,
+            traj_est,
+            est_name="traj",
+            pose_relation=PoseRelation.rotation_angle_deg,
+            align=True,
+            correct_scale=True,
+            delta=delta,
+            delta_unit=Unit.frames,
+            rel_delta_tol=0.01,
+            all_pairs=True,
+        )
+
+        rot = rpe_rots_result.stats["rmse"]
+        rpe_rots.append(rot)
+
+    for delta in delta_list:
+        rpe_transs_result = main_rpe.rpe(
+            traj_ref,
+            traj_est,
+            est_name="traj",
+            pose_relation=PoseRelation.translation_part,
+            align=True,
+            correct_scale=True,
+            delta=delta,
+            delta_unit=Unit.frames,
+            rel_delta_tol=0.01,
+            all_pairs=True,
+        )
+
+        trans = rpe_transs_result.stats["rmse"]
+        rpe_transs.append(trans)
+
+    rpe_trans, rpe_rot = np.mean(rpe_transs), np.mean(rpe_rots)
+    with open(filename, "w+") as f:
+        f.write(f"Seq: {seq} \n\n")
+        f.write(f"{ate_result}")
+        f.write(f"{rpe_rots_result}")
+        f.write(f"{rpe_transs_result}")
+    print(f"Save results to {filename}")
+    return ate, rpe_trans, rpe_rot
+
+
+def eval_metrics_first_pose_align_last_pose(
+    pred_traj, gt_traj=None, seq="", filename="", figpath="", sample_stride=1
+):
+    """Return the translation APE of the last pose after first-pose alignment.
+
+    The prediction is moved by the rigid transform mapping its first pose onto
+    the first ground-truth pose (no scale fit), plotted to ``figpath``, and the
+    error of the final pose alone is written to ``filename``.
+    """
+    if sample_stride > 1:
+        # Build a new list instead of writing into the caller's sequence.
+        pred_traj = [pred_traj[0][::sample_stride], pred_traj[1][::sample_stride]]
+        if gt_traj is not None:
+            gt_traj = [gt_traj[0][::sample_stride], gt_traj[1][::sample_stride]]
+    pred_traj = make_traj(pred_traj)
+    if gt_traj is not None:
+        gt_traj = make_traj(gt_traj)
+
+        if pred_traj.timestamps.shape[0] == gt_traj.timestamps.shape[0]:
+            pred_traj.timestamps = gt_traj.timestamps
+        else:
+            print(
+                "Different number of poses:",
+                pred_traj.timestamps.shape[0],
+                gt_traj.timestamps.shape[0],
+            )
+
+        gt_traj, pred_traj = sync.associate_trajectories(gt_traj, pred_traj)
+
+    if gt_traj is not None and pred_traj is not None:
+        if len(gt_traj.poses_se3) > 0 and len(pred_traj.poses_se3) > 0:
+            first_gt_pose = gt_traj.poses_se3[0]
+            first_pred_pose = pred_traj.poses_se3[0]
+            T = first_gt_pose @ np.linalg.inv(first_pred_pose)
+            # Apply T to every predicted pose
+            aligned_pred_poses = [T @ pose for pose in pred_traj.poses_se3]
+            aligned_pred_traj = PoseTrajectory3D(
+                poses_se3=aligned_pred_poses,
+                timestamps=np.array(pred_traj.timestamps),
+            )
+            pred_traj = aligned_pred_traj
+        plot_trajectory(
+            pred_traj,
+            gt_traj,
+            title=seq,
+            filename=figpath,
+            align=False,
+            correct_scale=False,
+        )
+
+    if gt_traj is not None and len(gt_traj.poses_se3) > 0:
+        gt_traj = PoseTrajectory3D(
+            poses_se3=[gt_traj.poses_se3[-1]], timestamps=[gt_traj.timestamps[-1]]
+        )
+    if pred_traj is not None and len(pred_traj.poses_se3) > 0:
+        pred_traj = PoseTrajectory3D(
+            poses_se3=[pred_traj.poses_se3[-1]], timestamps=[pred_traj.timestamps[-1]]
+        )
+
+    ate_result = main_ape.ape(
+        gt_traj,
+        pred_traj,
+        est_name="traj",
+        pose_relation=PoseRelation.translation_part,
+        align=False,  # <-- important
+        correct_scale=False,  # <-- important
+    )
+    ate = ate_result.stats["rmse"]
+    with open(filename, "w+") as f:
+        f.write(f"Seq: {seq}\n\n")
+        f.write(f"{ate_result}")
+    print(f"Save results to {filename}")
+
+    return ate
+
+
+def best_plotmode(traj):
+    # Plot on the plane spanned by the two axes with the largest positional variance.
+    axis_order = np.argsort(np.var(traj.positions_xyz, axis=0))
+    return getattr(plot.PlotMode, "xyz"[axis_order[2]] + "xyz"[axis_order[1]])
+
+
+def plot_trajectory(
+    pred_traj, gt_traj=None, title="", filename="", align=True, correct_scale=True
+):
+    """Plot the predicted trajectory (and the ground truth, if given) and export the figure through evo's PlotCollection to ``filename``."""
+    pred_traj = make_traj(pred_traj)
+    if gt_traj is not None:
+        gt_traj = make_traj(gt_traj)
+        if pred_traj.timestamps.shape[0] == gt_traj.timestamps.shape[0]:
+            pred_traj.timestamps = gt_traj.timestamps
+        else:
+            print("WARNING", pred_traj.timestamps.shape[0], gt_traj.timestamps.shape[0])
+        # Equal-length inputs share the GT timestamps above, presumably so the association below can match them.
+        gt_traj, pred_traj = sync.associate_trajectories(gt_traj, pred_traj)
+
+        if align:
+            pred_traj.align(gt_traj, correct_scale=correct_scale)
+
+    plot_collection = plot.PlotCollection("PlotCol")
+    fig = plt.figure(figsize=(8, 8))
+    plot_mode = best_plotmode(gt_traj if (gt_traj is not None) else pred_traj)  # axes chosen from the GT when available
+    ax = plot.prepare_axis(fig, plot_mode)
+    ax.set_title(title)
+    if gt_traj is not None:
+        plot.traj(ax, plot_mode, gt_traj, "--", "gray", "Ground Truth")
+    plot.traj(ax, plot_mode, pred_traj, "-", "blue", "Predicted")
+    plot_collection.add_figure("traj_error", fig)
+    plot_collection.export(filename, confirm_overwrite=False)
+    plt.close(fig=fig)
+    print(f"Saved trajectory to {filename.replace('.png','')}_traj_error.png")
+
+
+def save_trajectory_tum_format(traj, filename):
+    """Write one ``timestamp x y z qw qx qy qz`` line per pose to ``filename``."""
+    traj = make_traj(traj)
+    # NOTE(review): the quaternion is written scalar-first (wxyz); confirm readers expect that order.
+    rows = []
+    for stamp, xyz, quat in zip(traj.timestamps, traj.positions_xyz, traj.orientations_quat_wxyz):
+        rows.append(" ".join(map(str, [stamp, *xyz, *quat[[0, 1, 2, 3]]])) + "\n")
+    Path(filename).write_text("".join(rows))
+    print(f"Saved trajectory to {filename}")
+
+
+def extract_metrics(file_path):
+    """Pull the ATE / RPE-translation / RPE-rotation RMSE values out of a saved result file.
+
+    Any metric whose section is not found is reported as 0.0.
+    """
+    with open(file_path, "r") as file:
+        report = file.read()
+
+    # One pattern per metric, in return order.
+    section_patterns = (
+        r"APE w.r.t. translation part \(m\).*?rmse\s+([0-9.]+)",
+        r"RPE w.r.t. translation part \(m\).*?rmse\s+([0-9.]+)",
+        r"RPE w.r.t. rotation angle in degrees \(deg\).*?rmse\s+([0-9.]+)",
+    )
+    values = []
+    for section_pattern in section_patterns:
+        found = re.search(section_pattern, report, re.DOTALL)
+        # A missing section falls back to 0.0 rather than raising.
+        values.append(0.0 if found is None else float(found.group(1)))
+
+    ate, rpe_trans, rpe_rot = values
+    return ate, rpe_trans, rpe_rot
+
+
+def process_directory(directory):
+    """Collect ``(seq_name, ate, rpe_trans, rpe_rot)`` for every ``*_metric.txt`` under ``directory``.
+
+    Files are visited in sorted order within each folder returned by os.walk.
+    """
+    collected = []
+    for folder, _, names in os.walk(directory):
+        for name in sorted(n for n in names if n.endswith("_metric.txt")):
+            # The sequence name is the file name with "_eval_metric.txt" removed.
+            seq_name = name.replace("_eval_metric.txt", "")
+            metrics = extract_metrics(os.path.join(folder, name))
+            collected.append((seq_name, *metrics))
+    return collected
+
+
+def calculate_averages(results):
+    """Average the ATE, RPE-translation and RPE-rotation columns of ``results``.
+
+    ``results`` holds ``(seq_name, ate, rpe_trans, rpe_rot)`` tuples; an empty
+    list yields ``(0.0, 0.0, 0.0)``.
+    """
+    if len(results) == 0:
+        return 0.0, 0.0, 0.0
+
+    n = len(results)
+    avg_ate = sum(row[1] for row in results) / n
+    avg_rpe_trans = sum(row[2] for row in results) / n
+    avg_rpe_rot = sum(row[3] for row in results) / n
+    return avg_ate, avg_rpe_trans, avg_rpe_rot
diff --git a/extern/CUT3R/eval/relpose/launch.py b/extern/CUT3R/eval/relpose/launch.py
new file mode 100644
index 0000000000000000000000000000000000000000..d405029ead9ad63f701fb2dd95bdba6c6673efd5
--- /dev/null
+++ b/extern/CUT3R/eval/relpose/launch.py
@@ -0,0 +1,449 @@
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
+import math
+import cv2
+import numpy as np
+import torch
+import argparse
+
+from copy import deepcopy
+from eval.relpose.metadata import dataset_metadata
+from eval.relpose.utils import *
+
+from accelerate import PartialState
+from add_ckpt_path import add_path_to_dust3r
+
+from tqdm import tqdm
+
+
+def get_args_parser():
+    """Build the command-line parser for relative-pose evaluation.
+
+    ``--no_crop`` takes an explicit value: 1/true/t/yes/y (any case) mean
+    True, every other string means False.
+    """
+
+    def _flag(text):
+        # ``type=bool`` would map every non-empty string, even "False", to True.
+        return str(text).strip().lower() in ("1", "true", "t", "yes", "y")
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--weights", type=str, default="", help="path to the model weights"
+    )
+    parser.add_argument("--device", type=str, default="cuda", help="pytorch device")
+    parser.add_argument(
+        "--output_dir", type=str, default="", help="value for outdir"
+    )
+    parser.add_argument(
+        "--no_crop", type=_flag, default=True, help="whether to crop input data"
+    )
+    parser.add_argument(
+        "--eval_dataset",
+        type=str,
+        default="sintel",
+        choices=list(dataset_metadata.keys()),
+    )
+    parser.add_argument("--size", type=int, default=224)
+    parser.add_argument(
+        "--pose_eval_stride", default=1, type=int, help="stride for pose evaluation"
+    )
+    parser.add_argument("--shuffle", action="store_true", default=False)
+    parser.add_argument(
+        "--full_seq",
+        action="store_true",
+        default=False,
+        help="use full sequence for pose evaluation",
+    )
+    parser.add_argument(
+        "--seq_list",
+        nargs="+",
+        default=None,
+        help="list of sequences for pose evaluation",
+    )
+    parser.add_argument("--revisit", type=int, default=1)
+    parser.add_argument("--freeze_state", action="store_true", default=False)
+    parser.add_argument("--solve_pose", action="store_true", default=False)
+
+    return parser
+
+
+def eval_pose_estimation(args, model, save_dir=None):
+    """Evaluate ``model`` using the paths registered for ``args.eval_dataset``."""
+
+    dataset_cfg = dataset_metadata.get(args.eval_dataset)
+    image_root, mask_root = dataset_cfg["img_path"], dataset_cfg["mask_path"]
+    avg_ate, avg_rpe_trans, avg_rpe_rot = eval_pose_estimation_dist(
+        args, model, save_dir=save_dir, img_path=image_root, mask_path=mask_root
+    )
+    return avg_ate, avg_rpe_trans, avg_rpe_rot
+
+
+def eval_pose_estimation_dist(args, model, img_path, save_dir=None, mask_path=None):
+    """Run pose inference per sequence (split across processes) and score it.
+
+    For every sequence the predicted trajectory, focals and intrinsics are
+    saved under ``{save_dir}/{seq}``; when a ground-truth trajectory can be
+    loaded, ``{save_dir}/{seq}_eval_metric.txt`` and a trajectory plot are
+    written too.  Out-of-memory and degenerate-alignment errors are logged
+    and the sequence is skipped; any other exception propagates.
+    ``mask_path`` is accepted for interface compatibility but not consumed.
+
+    Returns the ``(ATE, RPE trans, RPE rot)`` averages over all metric files
+    found under ``save_dir``.
+    """
+    from dust3r.inference import inference
+
+    metadata = dataset_metadata.get(args.eval_dataset)
+    anno_path = metadata.get("anno_path", None)
+
+    seq_list = args.seq_list
+    if seq_list is None:
+        if metadata.get("full_seq", False):
+            args.full_seq = True
+        else:
+            seq_list = metadata.get("seq_list", [])
+        if args.full_seq:
+            seq_list = os.listdir(img_path)
+            seq_list = [
+                seq for seq in seq_list if os.path.isdir(os.path.join(img_path, seq))
+            ]
+        seq_list = sorted(seq_list)
+
+    if save_dir is None:
+        save_dir = args.output_dir
+    if save_dir:
+        # The log below is opened even if no per-sequence directory was created yet.
+        os.makedirs(save_dir, exist_ok=True)
+
+    distributed_state = PartialState()
+    model.to(distributed_state.device)
+    device = distributed_state.device
+
+    with distributed_state.split_between_processes(seq_list) as seqs:
+        load_img_size = args.size
+        error_log_path = f"{save_dir}/_error_log_{distributed_state.process_index}.txt"  # Unique log file per process
+        for seq in tqdm(seqs):
+            try:
+                dir_path = metadata["dir_path_func"](img_path, seq)
+
+                # Handle skip_condition
+                skip_condition = metadata.get("skip_condition", None)
+                if skip_condition is not None and skip_condition(save_dir, seq):
+                    continue
+
+                filelist = [
+                    os.path.join(dir_path, name) for name in os.listdir(dir_path)
+                ]
+                filelist.sort()
+                filelist = filelist[:: args.pose_eval_stride]
+
+                views = prepare_input(
+                    filelist,
+                    [True for _ in filelist],
+                    size=load_img_size,
+                    crop=not args.no_crop,
+                    revisit=args.revisit,
+                    update=not args.freeze_state,
+                )
+                outputs, _ = inference(views, model, device)
+
+                (
+                    colors,
+                    pts3ds_self,
+                    pts3ds_other,
+                    conf_self,
+                    conf_other,
+                    cam_dict,
+                    pr_poses,
+                ) = prepare_output(
+                    outputs, revisit=args.revisit, solve_pose=args.solve_pose
+                )
+
+                pred_traj = get_tum_poses(pr_poses)
+                os.makedirs(f"{save_dir}/{seq}", exist_ok=True)
+                save_tum_poses(pr_poses, f"{save_dir}/{seq}/pred_traj.txt")
+                save_focals(cam_dict, f"{save_dir}/{seq}/pred_focal.txt")
+                save_intrinsics(cam_dict, f"{save_dir}/{seq}/pred_intrinsics.txt")
+
+                gt_traj_file = metadata["gt_traj_func"](img_path, anno_path, seq)
+                traj_format = metadata.get("traj_format", None)
+
+                if args.eval_dataset == "sintel":
+                    gt_traj = load_traj(
+                        gt_traj_file=gt_traj_file, stride=args.pose_eval_stride
+                    )
+                elif traj_format is not None:
+                    gt_traj = load_traj(
+                        gt_traj_file=gt_traj_file,
+                        traj_format=traj_format,
+                        stride=args.pose_eval_stride,
+                    )
+                else:
+                    gt_traj = None
+
+                if gt_traj is not None:
+                    ate, rpe_trans, rpe_rot = eval_metrics(
+                        pred_traj,
+                        gt_traj,
+                        seq=seq,
+                        filename=f"{save_dir}/{seq}_eval_metric.txt",
+                    )
+                    plot_trajectory(
+                        pred_traj, gt_traj, title=seq, filename=f"{save_dir}/{seq}.png"
+                    )
+                else:
+                    # No ground truth: zeros are logged and no metric file is written.
+                    ate, rpe_trans, rpe_rot = 0, 0, 0
+
+                # Write to error log after each sequence
+                with open(error_log_path, "a") as f:
+                    f.write(
+                        f"{args.eval_dataset}-{seq: <16} | ATE: {ate:.5f}, RPE trans: {rpe_trans:.5f}, RPE rot: {rpe_rot:.5f}\n"
+                    )
+                    f.write(f"{ate:.5f}\n")
+                    f.write(f"{rpe_trans:.5f}\n")
+                    f.write(f"{rpe_rot:.5f}\n")
+
+            except Exception as e:
+                if "out of memory" in str(e):
+                    # Handle OOM
+                    torch.cuda.empty_cache()  # Clear the CUDA memory
+                    with open(error_log_path, "a") as f:
+                        f.write(
+                            f"OOM error in sequence {seq}, skipping this sequence.\n"
+                        )
+                    print(f"OOM error in sequence {seq}, skipping...")
+                elif "Degenerate covariance rank" in str(
+                    e
+                ) or "Eigenvalues did not converge" in str(e):
+                    # Handle Degenerate covariance rank exception and Eigenvalues did not converge exception
+                    with open(error_log_path, "a") as f:
+                        f.write(f"Exception in sequence {seq}: {str(e)}\n")
+                    print(f"Traj evaluation error in sequence {seq}, skipping.")
+                else:
+                    raise  # Rethrow (original traceback kept) if it's not an expected exception
+
+    distributed_state.wait_for_everyone()
+
+    results = process_directory(save_dir)
+    avg_ate, avg_rpe_trans, avg_rpe_rot = calculate_averages(results)
+
+    # Write the averages to the error log (only on the main process)
+    if distributed_state.is_main_process:
+        with open(f"{save_dir}/_error_log.txt", "a") as f:
+            # Copy the error log from each process to the main error log
+            for i in range(distributed_state.num_processes):
+                if not os.path.exists(f"{save_dir}/_error_log_{i}.txt"):
+                    # A process that logged nothing has no file; later ones may.
+                    continue
+                with open(f"{save_dir}/_error_log_{i}.txt", "r") as f_sub:
+                    f.write(f_sub.read())
+            f.write(
+                f"Average ATE: {avg_ate:.5f}, Average RPE trans: {avg_rpe_trans:.5f}, Average RPE rot: {avg_rpe_rot:.5f}\n"
+            )
+
+    return avg_ate, avg_rpe_trans, avg_rpe_rot
+
+
+if __name__ == "__main__":
+    args = get_args_parser()
+    args = args.parse_args()
+    add_path_to_dust3r(args.weights)  # presumably makes the `dust3r` package importable -- confirm
+    from dust3r.utils.image import load_images_for_eval as load_images
+    from dust3r.post_process import estimate_focal_knowing_depth
+    from dust3r.model import ARCroco3DStereo
+    from dust3r.utils.camera import pose_encoding_to_camera
+    from dust3r.utils.geometry import weighted_procrustes, geotrf
+    # NOTE: these two assignments override the parsed --full_seq / --no_crop values.
+    args.full_seq = False
+    args.no_crop = False
+
+    def recover_cam_params(pts3ds_self, pts3ds_other, conf_self, conf_other):
+        """Solve for per-view poses and estimate focal lengths.
+
+        Returns ``(c2w, focal, pp)``: the transform fitted by
+        ``weighted_procrustes`` between the two point maps, the focal from
+        ``estimate_focal_knowing_depth`` and the principal point, which is
+        fixed at the integer image centre ``(W // 2, H // 2)``.
+        """
+        B, H, W, _ = pts3ds_self.shape
+        centre = torch.tensor([W // 2, H // 2], device=pts3ds_self.device).float()
+        pp = centre.repeat(B, 1).reshape(B, 1, 2)
+        focal = estimate_focal_knowing_depth(pts3ds_self, pp, focal_mode="weiszfeld")
+        # Collapse the H and W axes so every view becomes one list of 3D points.
+        flat_self = pts3ds_self.reshape(B, -1, 3)
+        flat_other = pts3ds_other.reshape(B, -1, 3)
+        # Per-point weight: product of the log-confidences of both predictions.
+        weights = torch.log(conf_self.reshape(B, -1)) * torch.log(
+            conf_other.reshape(B, -1)
+        )
+        c2w = weighted_procrustes(
+            flat_self, flat_other, weights, use_weights=True, return_T=True
+        )
+        return c2w, focal, pp.reshape(B, 2)
+
+    def prepare_input(
+        img_paths,
+        img_mask,
+        size,
+        raymaps=None,
+        raymap_mask=None,
+        revisit=1,
+        update=True,
+        crop=True,
+    ):
+        """Build the list of per-frame view dicts passed to ``inference``.
+
+        Without ray maps every loaded image becomes a view with a NaN-filled
+        ray map.  Otherwise ``img_mask`` / ``raymap_mask`` mark, per slot, which
+        modality is present and the absent one is NaN-filled.  ``revisit > 1``
+        repeats the views; repeats get ``update=False`` if ``update`` is falsy.
+        """
+
+        def as_flag(value):
+            return torch.tensor(value).unsqueeze(0)
+
+        def identity_pose():
+            return torch.from_numpy(np.eye(4).astype(np.float32)).unsqueeze(0)
+
+        images = load_images(img_paths, size=size, crop=crop)
+        views = []
+        if raymaps is None and raymap_mask is None:
+            for idx, image in enumerate(images):
+                rgb = image["img"]
+                nan_rays = torch.full(
+                    (rgb.shape[0], 6, rgb.shape[-2], rgb.shape[-1]), torch.nan
+                )
+                views.append(
+                    {
+                        "img": rgb,
+                        "ray_map": nan_rays,
+                        "true_shape": torch.from_numpy(image["true_shape"]),
+                        "idx": idx,
+                        "instance": str(idx),
+                        "camera_pose": identity_pose(),
+                        "img_mask": as_flag(True),
+                        "ray_mask": as_flag(False),
+                        "update": as_flag(True),
+                        "reset": as_flag(False),
+                    }
+                )
+        else:
+            num_views = len(images) + len(raymaps)
+            assert len(img_mask) == len(raymap_mask) == num_views
+            assert sum(img_mask) == len(images) and sum(raymap_mask) == len(raymaps)
+            img_cursor = ray_cursor = 0
+            for idx in range(num_views):
+                has_img, has_ray = img_mask[idx], raymap_mask[idx]
+                if has_img:
+                    rgb = images[img_cursor]["img"]
+                    true_shape = torch.from_numpy(images[img_cursor]["true_shape"])
+                else:
+                    rgb = torch.full_like(images[0]["img"], torch.nan)
+                    true_shape = torch.from_numpy(
+                        np.int32([raymaps[ray_cursor].shape[1:-1][::-1]])
+                    )
+                if has_ray:
+                    rays = raymaps[ray_cursor]
+                else:
+                    rays = torch.full_like(raymaps[0], torch.nan)
+                views.append(
+                    {
+                        "img": rgb,
+                        "ray_map": rays,
+                        "true_shape": true_shape,
+                        "idx": idx,
+                        "instance": str(idx),
+                        "camera_pose": identity_pose(),
+                        "img_mask": as_flag(has_img),
+                        "ray_mask": as_flag(has_ray),
+                        "update": as_flag(has_img),
+                        "reset": as_flag(False),
+                    }
+                )
+                img_cursor += 1 if has_img else 0
+                ray_cursor += 1 if has_ray else 0
+            assert img_cursor == len(images) and ray_cursor == len(raymaps)
+
+        if revisit > 1:
+            # repeat input for 'revisit' times
+            repeated = []
+            for rep in range(revisit):
+                for offset, view in enumerate(views):
+                    clone = deepcopy(view)
+                    clone["idx"] = rep * len(views) + offset
+                    clone["instance"] = str(rep * len(views) + offset)
+                    if rep > 0 and not update:
+                        clone["update"] = as_flag(False)
+                    repeated.append(clone)
+            return repeated
+        return views
+
+    def prepare_output(outputs, revisit=1, solve_pose=False):
+        """Turn raw inference outputs into point maps, poses and intrinsics.
+
+        Only the last ``len(outputs["pred"]) // revisit`` entries are kept, and
+        ``outputs`` itself is trimmed in place.  Poses are solved from the point
+        maps when ``solve_pose`` is set, otherwise decoded from each
+        prediction's ``camera_pose`` encoding.
+
+        Returns ``(colors, pts3ds_self, pts3ds_other, conf_self, conf_other,
+        cam_dict, pr_poses)``.  ``pts3ds_self`` is a single concatenated tensor;
+        ``pts3ds_other``, ``conf_self`` and ``conf_other`` stay per-view lists.
+        """
+
+        keep = len(outputs["pred"]) // revisit
+        outputs["pred"] = outputs["pred"][-keep:]
+        outputs["views"] = outputs["views"][-keep:]
+        preds = outputs["pred"]
+
+        # Both pose paths start from the same CPU copies of the predictions.
+        self_views = [pred["pts3d_in_self_view"].cpu() for pred in preds]
+        pts3ds_other = [pred["pts3d_in_other_view"].cpu() for pred in preds]
+        conf_self = [pred["conf_self"].cpu() for pred in preds]
+        conf_other = [pred["conf"].cpu() for pred in preds]
+        pts3ds_self = torch.cat(self_views, 0)
+
+        if solve_pose:
+            # Poses, focal and principal point all come from the point maps.
+            pr_poses, focal, pp = recover_cam_params(
+                pts3ds_self,
+                torch.cat(pts3ds_other, 0),
+                torch.cat(conf_self, 0),
+                torch.cat(conf_other, 0),
+            )
+        else:
+            # Poses come from the predicted encoding; pp is the image centre.
+            decoded = [
+                pose_encoding_to_camera(pred["camera_pose"].clone()).cpu()
+                for pred in preds
+            ]
+            pr_poses = torch.cat(decoded, 0)
+
+            B, H, W, _ = pts3ds_self.shape
+            centre = torch.tensor([W // 2, H // 2], device=pts3ds_self.device)
+            pp = centre.float().repeat(B, 1).reshape(B, 2)
+            focal = estimate_focal_knowing_depth(
+                pts3ds_self, pp, focal_mode="weiszfeld"
+            )
+
+        # Colours: first batch item of each prediction, rescaled as 0.5 * (rgb + 1).
+        colors = [0.5 * (pred["rgb"][0] + 1.0) for pred in preds]
+        cam_dict = {
+            "focal": focal.cpu().numpy(),
+            "pp": pp.cpu().numpy(),
+        }
+        return (
+            colors,
+            pts3ds_self,
+            pts3ds_other,
+            conf_self,
+            conf_other,
+            cam_dict,
+            pr_poses,
+        )
+
+    model = ARCroco3DStereo.from_pretrained(args.weights)  # checkpoint path comes from --weights
+    eval_pose_estimation(args, model, save_dir=args.output_dir)  # results are written under --output_dir
diff --git a/extern/CUT3R/eval/relpose/metadata.py b/extern/CUT3R/eval/relpose/metadata.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5d97c5e21c4671a705e311ecf9bb2f2a8c1e516
--- /dev/null
+++ b/extern/CUT3R/eval/relpose/metadata.py
@@ -0,0 +1,233 @@
+import os
+import glob
+from tqdm import tqdm
+
+# Per-dataset evaluation settings (paths, path-building callables, sequence lists), keyed by dataset name.
+dataset_metadata = {
+    "davis": {
+        "img_path": "data/davis/DAVIS/JPEGImages/480p",
+        "mask_path": "data/davis/DAVIS/masked_images/480p",
+        "dir_path_func": lambda img_path, seq: os.path.join(img_path, seq),
+        "gt_traj_func": lambda img_path, anno_path, seq: None,
+        "traj_format": None,
+        "seq_list": None,
+        "full_seq": True,
+        "mask_path_seq_func": lambda mask_path, seq: os.path.join(mask_path, seq),
+        "skip_condition": None,
+        "process_func": None,  # Not used in mono depth estimation
+    },
+    "kitti": {
+        "img_path": "data/kitti/depth_selection/val_selection_cropped/image_gathered",  # Default path
+        "mask_path": None,
+        "dir_path_func": lambda img_path, seq: os.path.join(img_path, seq),
+        "gt_traj_func": lambda img_path, anno_path, seq: None,
+        "traj_format": None,
+        "seq_list": None,
+        "full_seq": True,
+        "mask_path_seq_func": lambda mask_path, seq: None,
+        "skip_condition": None,
+        "process_func": lambda args, img_path: process_kitti(args, img_path),
+    },
+    "bonn": {
+        "img_path": "data/bonn/rgbd_bonn_dataset",
+        "mask_path": None,
+        "dir_path_func": lambda img_path, seq: os.path.join(
+            img_path, f"rgbd_bonn_{seq}", "rgb_110"
+        ),
+        "gt_traj_func": lambda img_path, anno_path, seq: os.path.join(
+            img_path, f"rgbd_bonn_{seq}", "groundtruth_110.txt"
+        ),
+        "traj_format": "tum",
+        "seq_list": ["balloon2", "crowd2", "crowd3", "person_tracking2", "synchronous"],
+        "full_seq": False,
+        "mask_path_seq_func": lambda mask_path, seq: None,
+        "skip_condition": None,
+        "process_func": lambda args, img_path: process_bonn(args, img_path),
+    },
+    "nyu": {  # NOTE(review): lacks the dir_path_func / gt_traj_func keys every other entry defines
+        "img_path": "data/nyu-v2/val/nyu_images",
+        "mask_path": None,
+        "process_func": lambda args, img_path: process_nyu(args, img_path),
+    },
+    "scannet": {
+        "img_path": "data/scannetv2",
+        "mask_path": None,
+        "dir_path_func": lambda img_path, seq: os.path.join(img_path, seq, "color_90"),
+        "gt_traj_func": lambda img_path, anno_path, seq: os.path.join(
+            img_path, seq, "pose_90.txt"
+        ),
+        "traj_format": "replica",
+        "seq_list": None,
+        "full_seq": True,
+        "mask_path_seq_func": lambda mask_path, seq: None,
+        "skip_condition": None,  # lambda save_dir, seq: os.path.exists(os.path.join(save_dir, seq)),
+        "process_func": lambda args, img_path: process_scannet(args, img_path),
+    },
+    "scannet-257": {
+        "img_path": "data/scannetv2_3_257",
+        "mask_path": None,
+        "dir_path_func": lambda img_path, seq: os.path.join(img_path, seq, "color_90"),
+        "gt_traj_func": lambda img_path, anno_path, seq: os.path.join(
+            img_path, seq, "pose_90.txt"
+        ),
+        "traj_format": "replica",
+        "seq_list": None,
+        "full_seq": True,
+        "mask_path_seq_func": lambda mask_path, seq: None,
+        "skip_condition": None,  # lambda save_dir, seq: os.path.exists(os.path.join(save_dir, seq)),
+        "process_func": lambda args, img_path: process_scannet(args, img_path),
+    },
+    "scannet-129": {
+        "img_path": "data/scannetv2_3_129",
+        "mask_path": None,
+        "dir_path_func": lambda img_path, seq: os.path.join(img_path, seq, "color_90"),
+        "gt_traj_func": lambda img_path, anno_path, seq: os.path.join(
+            img_path, seq, "pose_90.txt"
+        ),
+        "traj_format": "replica",
+        "seq_list": None,
+        "full_seq": True,
+        "mask_path_seq_func": lambda mask_path, seq: None,
+        "skip_condition": None,  # lambda save_dir, seq: os.path.exists(os.path.join(save_dir, seq)),
+        "process_func": lambda args, img_path: process_scannet(args, img_path),
+    },
+    "scannet-65": {
+        "img_path": "data/scannetv2_3_65",
+        "mask_path": None,
+        "dir_path_func": lambda img_path, seq: os.path.join(img_path, seq, "color_90"),
+        "gt_traj_func": lambda img_path, anno_path, seq: os.path.join(
+            img_path, seq, "pose_90.txt"
+        ),
+        "traj_format": "replica",
+        "seq_list": None,
+        "full_seq": True,
+        "mask_path_seq_func": lambda mask_path, seq: None,
+        "skip_condition": None,  # lambda save_dir, seq: os.path.exists(os.path.join(save_dir, seq)),
+        "process_func": lambda args, img_path: process_scannet(args, img_path),
+    },
+    "scannet-33": {
+        "img_path": "data/scannetv2_3_33",
+        "mask_path": None,
+        "dir_path_func": lambda img_path, seq: os.path.join(img_path, seq, "color_90"),
+        "gt_traj_func": lambda img_path, anno_path, seq: os.path.join(
+            img_path, seq, "pose_90.txt"
+        ),
+        "traj_format": "replica",
+        "seq_list": None,
+        "full_seq": True,
+        "mask_path_seq_func": lambda mask_path, seq: None,
+        "skip_condition": None,  # lambda save_dir, seq: os.path.exists(os.path.join(save_dir, seq)),
+        "process_func": lambda args, img_path: process_scannet(args, img_path),
+    },
+    "tum": {
+        "img_path": "data/tum",
+        "mask_path": None,
+        "dir_path_func": lambda img_path, seq: os.path.join(img_path, seq, "rgb_90"),
+        "gt_traj_func": lambda img_path, anno_path, seq: os.path.join(
+            img_path, seq, "groundtruth_90.txt"
+        ),
+        "traj_format": "tum",
+        "seq_list": None,
+        "full_seq": True,
+        "mask_path_seq_func": lambda mask_path, seq: None,
+        "skip_condition": None,
+        "process_func": None,
+    },
+    "sintel": {
+        "img_path": "data/sintel/training/final",
+        "anno_path": "data/sintel/training/camdata_left",
+        "mask_path": None,
+        "dir_path_func": lambda img_path, seq: os.path.join(img_path, seq),
+        "gt_traj_func": lambda img_path, anno_path, seq: os.path.join(anno_path, seq),
+        "traj_format": None,
+        "seq_list": [
+            "alley_2",
+            "ambush_4",
+            "ambush_5",
+            "ambush_6",
+            "cave_2",
+            "cave_4",
+            "market_2",
+            "market_5",
+            "market_6",
+            "shaman_3",
+            "sleeping_1",
+            "sleeping_2",
+            "temple_2",
+            "temple_3",
+        ],
+        "full_seq": False,
+        "mask_path_seq_func": lambda mask_path, seq: None,
+        "skip_condition": None,
+        "process_func": lambda args, img_path: process_sintel(args, img_path),
+    },
+}
+
+
+# Define processing functions for each dataset
+def process_kitti(args, img_path):
+    """Yield ``(sorted PNG paths, output dir)`` for each entry under ``img_path``."""
+    for seq_dir in tqdm(sorted(glob.glob(f"{img_path}/*"))):
+        pngs = sorted(glob.glob(f"{seq_dir}/*.png"))
+        yield pngs, f"{args.output_dir}/{os.path.basename(seq_dir)}"
+
+
+def process_bonn(args, img_path):
+    """Yield ``(sorted PNG paths, output dir)`` per Bonn sequence.
+
+    ``args.full_seq`` globs all sub-directories; else ``args.seq_list`` or a default list.
+    """
+    if not args.full_seq:
+        chosen = args.seq_list
+        if chosen is None:
+            chosen = ["balloon2", "crowd2", "crowd3", "person_tracking2", "synchronous"]
+        for seq in tqdm(chosen):
+            pngs = sorted(glob.glob(f"{img_path}/rgbd_bonn_{seq}/rgb_110/*.png"))
+            yield pngs, f"{args.output_dir}/{seq}"
+        return
+    for seq_dir in tqdm(sorted(glob.glob(f"{img_path}/*/"))):
+        pngs = sorted(glob.glob(f"{seq_dir}/rgb/*.png"))
+        yield pngs, f"{args.output_dir}/{os.path.basename(os.path.dirname(seq_dir))}"
+
+
+def process_nyu(args, img_path):
+    """Yield one ``(sorted PNG paths, output dir)`` pair for the whole folder."""
+    pngs = sorted(glob.glob(f"{img_path}/*.png"))
+    yield pngs, f"{args.output_dir}"
+
+
+def process_scannet(args, img_path):
+    """Yield ``(sorted JPG paths in color_90, output dir)`` per sequence folder."""
+    for seq_dir in tqdm(sorted(glob.glob(f"{img_path}/*"))):
+        frames = sorted(glob.glob(f"{seq_dir}/color_90/*.jpg"))
+        out_dir = f"{args.output_dir}/{os.path.basename(seq_dir)}"
+        yield frames, out_dir
+
+
+def process_sintel(args, img_path):
+    """Yield ``(sorted PNG paths, output dir)`` for each Sintel sequence:
+    every sub-directory if ``args.full_seq``, else a fixed list of 14."""
+    if not args.full_seq:
+        fixed_sequences = [
+            "alley_2",
+            "ambush_4",
+            "ambush_5",
+            "ambush_6",
+            "cave_2",
+            "cave_4",
+            "market_2",
+            "market_5",
+            "market_6",
+            "shaman_3",
+            "sleeping_1",
+            "sleeping_2",
+            "temple_2",
+            "temple_3",
+        ]
+        for seq in tqdm(fixed_sequences):
+            pngs = sorted(glob.glob(f"{img_path}/{seq}/*.png"))
+            yield pngs, f"{args.output_dir}/{seq}"
+        return
+    for seq_dir in tqdm(sorted(glob.glob(f"{img_path}/*/"))):
+        pngs = sorted(glob.glob(f"{seq_dir}/*.png"))
+        yield pngs, f"{args.output_dir}/{os.path.basename(os.path.dirname(seq_dir))}"
diff --git a/extern/CUT3R/eval/relpose/run.sh b/extern/CUT3R/eval/relpose/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..be7673edc98b8ae1fcc4e58b6353caa8432b4687
--- /dev/null
+++ b/extern/CUT3R/eval/relpose/run.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Run the relative-pose evaluation (eval/relpose/launch.py) once per dataset.
+set -e
+
+workdir='.'
+model_name='ours'
+ckpt_name='cut3r_512_dpt_4_64'
+model_weights="${workdir}/src/${ckpt_name}.pth"
+datasets=('scannet' 'tum' 'sintel')
+
+# Each dataset writes to ${workdir}/eval_results/relpose/<dataset>_<model_name>.
+for data in "${datasets[@]}"; do
+    output_dir="${workdir}/eval_results/relpose/${data}_${model_name}"
+    echo "$output_dir"
+    accelerate launch --num_processes 8 --main_process_port 29558 eval/relpose/launch.py \
+        --weights "$model_weights" \
+        --output_dir "$output_dir" \
+        --eval_dataset "$data" \
+        --size 512
+done
+
+
diff --git a/extern/CUT3R/eval/relpose/utils.py b/extern/CUT3R/eval/relpose/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..14861ddeb972062eaed66ddbb5de152052079d91
--- /dev/null
+++ b/extern/CUT3R/eval/relpose/utils.py
@@ -0,0 +1,311 @@
+from copy import deepcopy
+import cv2
+
+import numpy as np
+import torch
+import torch.nn as nn
+import roma
+from copy import deepcopy
+import tqdm
+import matplotlib as mpl
+import matplotlib.cm as cm
+import matplotlib.pyplot as plt
+from matplotlib.backends.backend_agg import FigureCanvasAgg
+from scipy.spatial.transform import Rotation
+from eval.relpose.evo_utils import *
+from PIL import Image
+import imageio.v2 as iio
+from matplotlib.figure import Figure
+
+# from checkpoints.dust3r.viz import colorize_np, colorize
+
+
+def todevice(batch, device, callback=None, non_blocking=False):
+    """Recursively transfer tensors/arrays to another device.
+
+    batch: a tensor, numpy array, ``None`` or an arbitrarily nested
+        dict / list / tuple of those; any other leaf is returned unchanged.
+    device: pytorch device, or the string 'numpy' to turn tensors into
+        CPU numpy arrays.
+    callback: optional function applied to ``batch`` and to every nested
+        sub-element before that element is processed.
+    non_blocking: forwarded to ``Tensor.to`` for every tensor that is moved.
+
+    Returns an object with the same nesting as ``batch``.
+    """
+    if callback:
+        batch = callback(batch)
+
+    # Forward ``callback`` and ``non_blocking`` when recursing; otherwise they
+    # would only ever apply to the outermost object.
+    if isinstance(batch, dict):
+        return {
+            k: todevice(v, device, callback, non_blocking) for k, v in batch.items()
+        }
+
+    if isinstance(batch, (tuple, list)):
+        return type(batch)(
+            todevice(x, device, callback, non_blocking) for x in batch
+        )
+
+    x = batch
+    if device == "numpy":
+        if isinstance(x, torch.Tensor):
+            x = x.detach().cpu().numpy()
+    elif x is not None:
+        # numpy arrays are first wrapped as tensors, then moved like any tensor
+        if isinstance(x, np.ndarray):
+            x = torch.from_numpy(x)
+        if torch.is_tensor(x):
+            x = x.to(device, non_blocking=non_blocking)
+    return x
+
+
+to_device = todevice  # alias
+
+
+def to_numpy(x):
+    """Return ``x`` with every tensor in it converted to a numpy array."""
+    return todevice(x, device="numpy")
+
+
+def c2w_to_tumpose(c2w):
+    """
+    Flatten a camera-to-world matrix into a 7-vector pose.
+
+    input: c2w: 4x4 matrix
+    output: array (x y z qw qx qy qz), i.e. translation followed by the
+            rotation as a scalar-first quaternion
+    """
+    pose = to_numpy(c2w)
+    translation = pose[:3, -1]
+    # SciPy returns the quaternion scalar-last (x, y, z, w); rotate it by one
+    # position so that w comes first.
+    quat_xyzw = Rotation.from_matrix(pose[:3, :3]).as_quat()
+    quat_wxyz = np.roll(quat_xyzw, 1)
+    return np.concatenate([translation, quat_wxyz])
+
+
+def get_tum_poses(poses):
+    """
+    Convert a list of 4x4 camera-to-world arrays into ``[tum_poses, stamps]``.
+
+    ``tum_poses`` stacks one 7-vector per pose; ``stamps`` are the frame
+    indices 0..N-1 as floats.
+    """
+    stamps = np.arange(len(poses)).astype(float)
+    stacked = np.stack([c2w_to_tumpose(pose) for pose in poses], axis=0)
+    return [stacked, stamps]
+
+
+def save_tum_poses(poses, path):
+    """Write ``poses`` to ``path`` via ``save_trajectory_tum_format`` and return the stacked poses."""
+    trajectory = get_tum_poses(poses)
+    save_trajectory_tum_format(trajectory, path)
+    tum_poses = trajectory[0]
+    return tum_poses
+
+
+def save_focals(cam_dict, path):
+    """Write ``cam_dict["focal"]`` to a text file (six decimals) and return it."""
+    focal_lengths = cam_dict["focal"]
+    np.savetxt(path, focal_lengths, fmt="%.6f")
+    return focal_lengths
+
+
+def save_intrinsics(cam_dict, path):
+    """Build one 3x3 intrinsics matrix per camera and save them as text.
+
+    The same focal length (``cam_dict["focal"]``) is used for both axes and
+    ``cam_dict["pp"]`` fills the principal point. Each matrix is written as
+    one row of nine values; the un-flattened (N, 3, 3) array is returned.
+    """
+    focal = cam_dict["focal"]
+    num_cams = len(focal)
+    intrinsics = np.tile(np.eye(3), (num_cams, 1, 1))
+    for axis in (0, 1):
+        intrinsics[:, axis, axis] = focal
+    intrinsics[:, :2, 2] = cam_dict["pp"]
+    flattened = intrinsics.reshape(-1, 9)
+    np.savetxt(path, flattened, fmt="%.6f")
+    return intrinsics
+
+
+def save_conf_maps(conf, path):
+    """Save each tensor of ``conf`` as ``<path>/conf_<i>.npy`` and return ``conf``."""
+    for index, conf_map in enumerate(conf):
+        array = conf_map.detach().cpu().numpy()
+        np.save(f"{path}/conf_{index}.npy", array)
+    return conf
+
+
+def save_rgb_imgs(colors, path):
+    """Write every image of ``colors`` to ``<path>/frame_<iiii>.jpg`` and return ``colors``.
+
+    Each image is scaled by 255 and cast to uint8 before writing.
+    # NOTE(review): assumes values are already in [0, 1]; nothing clips them.
+    """
+    for index, frame in enumerate(colors):
+        # The channel order is written as-is; no RGB/BGR swap is performed.
+        pixels = (frame.cpu().numpy() * 255).astype(np.uint8)
+        iio.imwrite(f"{path}/frame_{index:04d}.jpg", pixels)
+    return colors
+
+
+def save_depth_maps(pts3ds_self, path, conf_self=None):
+    """Save colourised depth (and optional confidence) frames plus a GIF.
+
+    For frame ``i`` this writes ``frame_<iiii>.png`` (colourised depth, with
+    the colourised log-confidence appended to its right when ``conf_self`` is
+    given) and ``frame_<iiii>.npy`` (raw depth). All frames are also written
+    to ``_depth_maps.gif``.
+
+    pts3ds_self: iterable of point maps; the last channel of each is used as
+        depth.  # assumes each item is (H, W, 3) - TODO confirm with caller
+    path: existing output directory.
+    conf_self: optional list of confidence tensors, concatenated along dim 0.
+        # presumably each is (1, H, W) - verify against caller
+
+    Returns the stacked raw depth maps. ``torch.stack`` raises if
+    ``pts3ds_self`` is empty.
+    """
+    depth_maps = torch.stack([pts3d_self[..., -1] for pts3d_self in pts3ds_self], 0)
+    # Pass the colour range as plain floats so no (possibly CUDA) tensor is
+    # handed to the numpy/matplotlib code behind colorize().
+    colored_depth = colorize(
+        depth_maps,
+        cmap_name="Spectral_r",
+        range=(float(depth_maps.min()), float(depth_maps.max())),
+        append_cbar=True,
+    )
+    # colorize() squeezes a leading batch dimension of size one; restore it so
+    # a single-frame sequence is iterated per frame rather than per pixel row.
+    if colored_depth.ndim == 3:
+        colored_depth = colored_depth[None]
+
+    colored_conf = None
+    if conf_self is not None:
+        conf_selfs = torch.cat(conf_self, 0)
+        colored_conf = colorize(
+            torch.log(conf_selfs),
+            cmap_name="jet",
+            range=(
+                float(torch.log(conf_selfs.min())),
+                float(torch.log(conf_selfs.max())),
+            ),
+            append_cbar=True,
+        )
+        if colored_conf.ndim == 3:
+            colored_conf = colored_conf[None]
+
+    frames = []
+    for i, depth_map in enumerate(colored_depth):
+        img_path = f"{path}/frame_{(i):04d}.png"
+        if colored_conf is None:
+            to_save = depth_map
+        else:
+            # depth on the left, confidence on the right
+            to_save = torch.cat([depth_map, colored_conf[i]], dim=1)
+        to_save = (to_save * 255).detach().cpu().numpy().astype(np.uint8)
+        iio.imwrite(img_path, to_save)
+        # Build the GIF frame from the in-memory array instead of re-opening
+        # the PNG, which would leave one open file handle per frame.
+        frames.append(Image.fromarray(to_save))
+        np.save(f"{path}/frame_{(i):04d}.npy", depth_maps[i].detach().cpu().numpy())
+
+    frames[0].save(
+        f"{path}/_depth_maps.gif",
+        save_all=True,
+        append_images=frames[1:],
+        duration=100,
+        loop=0,
+    )
+
+    return depth_maps
+
+
+def get_vertical_colorbar(h, vmin, vmax, cmap_name="jet", label=None, cbar_precision=2):
+    """
+    Render a vertical colour bar and return it as an RGB float image.
+
+    :param h: height of the returned image in pixels
+    :param vmin: value mapped to the bottom of the bar
+    :param vmax: value mapped to the top of the bar
+    :param cmap_name: name of the matplotlib colormap
+    :param label: optional label drawn next to the bar
+    :param cbar_precision: number of decimals used for the tick labels
+    :return: float32 array of shape [h, w, 3] with values in [0, 1]
+    """
+    fig = Figure(figsize=(2, 8), dpi=100)
+    fig.subplots_adjust(right=1.5)
+    canvas = FigureCanvasAgg(fig)
+
+    ax = fig.add_subplot(111)
+    # matplotlib.cm.get_cmap was removed in Matplotlib 3.9;
+    # pyplot.get_cmap is available in old and new releases alike.
+    cmap = plt.get_cmap(cmap_name)
+    norm = mpl.colors.Normalize(vmin=vmin, vmax=vmax)
+
+    tick_cnt = 6
+    tick_loc = np.linspace(vmin, vmax, tick_cnt)
+    cb1 = mpl.colorbar.ColorbarBase(
+        ax, cmap=cmap, norm=norm, ticks=tick_loc, orientation="vertical"
+    )
+
+    tick_label = [str(np.round(x, cbar_precision)) for x in tick_loc]
+    if cbar_precision == 0:
+        # drop the trailing ".0" left by rounding to zero decimals
+        tick_label = [x[:-2] for x in tick_label]
+
+    cb1.set_ticklabels(tick_label)
+
+    cb1.ax.tick_params(labelsize=18, rotation=0)
+    if label is not None:
+        cb1.set_label(label)
+
+    # Rasterise the figure and read the RGBA buffer back as an array.
+    canvas.draw()
+    s, (width, height) = canvas.print_to_buffer()
+    im = np.frombuffer(s, np.uint8).reshape((height, width, 4))
+
+    # Drop alpha and rescale to [0, 1].
+    im = im[:, :, :3].astype(np.float32) / 255.0
+    if h != im.shape[0]:
+        # resize to the requested height, keeping the aspect ratio
+        w = int(im.shape[1] / im.shape[0] * h)
+        im = cv2.resize(im, (w, h), interpolation=cv2.INTER_AREA)
+
+    return im
+
+
+def colorize_np(
+    x,
+    cmap_name="jet",
+    mask=None,
+    range=None,
+    append_cbar=False,
+    cbar_in_image=False,
+    cbar_precision=2,
+):
+    """
+    turn a grayscale image into a color image
+    :param x: input grayscale, [H, W]; it is not modified
+    :param cmap_name: the colorization method
+    :param mask: the mask image, [H, W]; pixels outside the mask are drawn white
+    :param range: the range for scaling, automatic if None, [min, max]
+    :param append_cbar: if append the color bar
+    :param cbar_in_image: put the color bar inside the image to keep the output image the same size as the input image
+    :param cbar_precision: number of decimals of the color bar tick labels
+    :return: colorized image, [H, W, 3] (wider when a color bar is appended next to it)
+    """
+    if range is not None:
+        vmin, vmax = range
+    elif mask is not None:
+        masked = x[mask]
+        # smallest non-zero value inside the mask
+        vmin = np.min(masked[np.nonzero(masked)])
+        vmax = np.max(masked)
+        # Work on a copy: the caller's array must not be overwritten.
+        x = x.copy()
+        x[np.logical_not(mask)] = vmin
+    else:
+        vmin, vmax = np.percentile(x, (1, 100))
+        vmax += 1e-6
+
+    # Plain floats keep the arithmetic below independent of whether the
+    # bounds arrived as numpy scalars or 0-d tensors.
+    vmin, vmax = float(vmin), float(vmax)
+    span = vmax - vmin
+    if span == 0:
+        # constant image: avoid 0/0, everything maps to the lowest colour
+        span = 1.0
+
+    x = np.clip(x, vmin, vmax)
+    x = (x - vmin) / span
+
+    # matplotlib.cm.get_cmap was removed in Matplotlib 3.9.
+    cmap = plt.get_cmap(cmap_name)
+    x_new = cmap(x)[:, :, :3]
+
+    if mask is not None:
+        mask = np.float32(mask[:, :, np.newaxis])
+        x_new = x_new * mask + np.ones_like(x_new) * (1.0 - mask)
+
+    if not append_cbar:
+        return x_new
+
+    # Rendering the colour bar is costly, so only do it when it is shown.
+    cbar = get_vertical_colorbar(
+        h=x.shape[0],
+        vmin=vmin,
+        vmax=vmax,
+        cmap_name=cmap_name,
+        cbar_precision=cbar_precision,
+    )
+    if cbar_in_image:
+        x_new[:, -cbar.shape[1] :, :] = cbar
+    else:
+        # 5-pixel black gap between image and colour bar
+        x_new = np.concatenate((x_new, np.zeros_like(x_new[:, :5, :]), cbar), axis=1)
+    return x_new
+
+
+# tensor
+def colorize(
+    x, cmap_name="jet", mask=None, range=None, append_cbar=False, cbar_in_image=False
+):
+    """
+    turn a grayscale image into a color image
+    :param x: torch.Tensor, grayscale image, [H, W] or [B, H, W]
+    :param cmap_name: name of the matplotlib colormap
+    :param mask: torch.Tensor or None, mask image, [H, W] or [B, H, W] or None;
+        it is thresholded at 0.99 and eroded with a 3x3 kernel per image
+    :param range: optional (min, max) used for every image
+    :param append_cbar: if append the color bar
+    :param cbar_in_image: draw the color bar inside the image
+    :return: float tensor on the device of ``x``; a leading batch dimension of
+        size one is squeezed away
+    """
+    device = x.device
+    x = x.detach().cpu().numpy()
+    if mask is not None:
+        mask = mask.detach().cpu().numpy() > 0.99
+
+    if x.ndim == 2:
+        x = x[None]
+        if mask is not None:
+            mask = mask[None]
+
+    kernel = np.ones((3, 3), np.uint8)
+    out = []
+    for i, x_ in enumerate(x):
+        mask_ = None
+        if mask is not None:
+            # Erode the mask of *this* image only; eroding the whole batched
+            # mask would mis-shape it and accumulate erosion across frames.
+            mask_ = cv2.erode(mask[i].astype(np.uint8), kernel, iterations=1).astype(
+                bool
+            )
+
+        colored = colorize_np(x_, cmap_name, mask_, range, append_cbar, cbar_in_image)
+        out.append(torch.from_numpy(colored).to(device).float())
+    out = torch.stack(out).squeeze(0)
+    return out
diff --git a/extern/CUT3R/eval/video_depth/eval_depth.py b/extern/CUT3R/eval/video_depth/eval_depth.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb37e5795459491e5c3ffd404a665cbdf4313293
--- /dev/null
+++ b/extern/CUT3R/eval/video_depth/eval_depth.py
@@ -0,0 +1,385 @@
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
+from eval.video_depth.tools import depth_evaluation, group_by_directory
+import numpy as np
+import cv2
+from tqdm import tqdm
+import glob
+from PIL import Image
+import argparse
+import json
+from eval.video_depth.metadata import dataset_metadata
+
+
+def get_args_parser():
+    """Build the command-line parser of the video-depth evaluation script."""
+    parser = argparse.ArgumentParser()
+
+    # directory that holds the predictions and receives the result file
+    parser.add_argument("--output_dir", type=str, default="", help="value for outdir")
+
+    known_datasets = list(dataset_metadata.keys())
+    parser.add_argument(
+        "--eval_dataset", type=str, default="nyu", choices=known_datasets
+    )
+
+    alignment_modes = ["scale&shift", "scale", "metric"]
+    parser.add_argument(
+        "--align", type=str, default="scale&shift", choices=alignment_modes
+    )
+    return parser
+
+
+# Tag expected in the first float32 of every Sintel ``.dpt`` depth file.
+_SINTEL_TAG_FLOAT = 202021.25
+
+# Sintel sequences evaluated when predictions do not cover the full dataset.
+_SINTEL_SUBSET = (
+    "alley_2",
+    "ambush_4",
+    "ambush_5",
+    "ambush_6",
+    "cave_2",
+    "cave_4",
+    "market_2",
+    "market_5",
+    "market_6",
+    "shaman_3",
+    "sleeping_1",
+    "sleeping_2",
+    "temple_2",
+    "temple_3",
+)
+
+_BONN_SEQS = ("balloon2", "crowd2", "crowd3", "person_tracking2", "synchronous")
+
+# Keyword that selects the alignment mode of depth_evaluation() for each
+# ``--align`` choice.
+_ALIGN_KWARGS = {
+    "scale&shift": {"align_with_lad2": True},
+    "scale": {"align_with_scale": True},
+    "metric": {"metric_scale": True},
+}
+
+
+def _read_sintel_depth(filename):
+    """Read a Sintel ``.dpt`` depth file and return it as a (height, width) float32 array."""
+    # The context manager closes the file even when a check below fails.
+    with open(filename, "rb") as f:
+        check = np.fromfile(f, dtype=np.float32, count=1)[0]
+        assert (
+            check == _SINTEL_TAG_FLOAT
+        ), " depth_read:: Wrong tag in flow file (should be: {0}, is: {1}). Big-endian machine? ".format(
+            _SINTEL_TAG_FLOAT, check
+        )
+        width = np.fromfile(f, dtype=np.int32, count=1)[0]
+        height = np.fromfile(f, dtype=np.int32, count=1)[0]
+        size = width * height
+        assert (
+            width > 0 and height > 0 and size > 1 and size < 100000000
+        ), " depth_read:: Wrong input size (width = {0}, height = {1}).".format(
+            width, height
+        )
+        depth = np.fromfile(f, dtype=np.float32, count=-1).reshape((height, width))
+    return depth
+
+
+def _read_png_depth(filename, scale):
+    """Read a 16-bit PNG depth map, divide it by ``scale`` and mark zero pixels as -1."""
+    with Image.open(filename) as img:
+        depth_png = np.array(img, dtype=int)
+    # make sure we have a proper 16bit depth map here.. not 8bit!
+    assert np.max(depth_png) > 255
+    depth = depth_png.astype(np.float64) / scale
+    depth[depth_png == 0] = -1.0
+    return depth
+
+
+def _evaluate_sequences(args, sequence_pairs, depth_read, **eval_kwargs):
+    """Evaluate every (prediction paths, ground-truth paths) pair and write the averaged metrics.
+
+    Predictions are resized to the ground-truth resolution, scored with
+    ``depth_evaluation`` and averaged with the per-sequence ``valid_pixels``
+    as weights. The result is printed and written to
+    ``<output_dir>/result_<align>.json``.
+    """
+    align_kwargs = _ALIGN_KWARGS[args.align]
+    gathered_depth_metrics = []
+    for pd_pathes, gt_pathes in tqdm(sequence_pairs):
+        gt_depth = np.stack([depth_read(gt_path) for gt_path in gt_pathes], axis=0)
+        # cv2.resize expects the target size as (width, height)
+        target_size = (gt_depth.shape[2], gt_depth.shape[1])
+        pr_depth = np.stack(
+            [
+                cv2.resize(
+                    np.load(pd_path), target_size, interpolation=cv2.INTER_CUBIC
+                )
+                for pd_path in pd_pathes
+            ],
+            axis=0,
+        )
+        depth_results, _error_map, _depth_predict, _depth_gt = depth_evaluation(
+            pr_depth, gt_depth, use_gpu=True, **align_kwargs, **eval_kwargs
+        )
+        gathered_depth_metrics.append(depth_results)
+
+    depth_log_path = f"{args.output_dir}/result_{args.align}.json"
+    weights = [metrics["valid_pixels"] for metrics in gathered_depth_metrics]
+    average_metrics = {
+        key: np.average(
+            [metrics[key] for metrics in gathered_depth_metrics], weights=weights
+        )
+        for key in gathered_depth_metrics[0].keys()
+        if key != "valid_pixels"
+    }
+    print("Average depth evaluation metrics:", average_metrics)
+    with open(depth_log_path, "w") as f:
+        f.write(json.dumps(average_metrics))
+
+
+def main(args):
+    """Score the saved depth predictions of ``args.eval_dataset`` against ground truth.
+
+    Predictions are read from ``<output_dir>/<sequence>/frame*.npy``; ground
+    truth is read from fixed paths below ``data/``. Only "sintel", "bonn" and
+    "kitti" are evaluated here; any other dataset only prints a notice.
+    """
+    if args.eval_dataset == "sintel":
+        pred_pathes = sorted(
+            glob.glob(f"{args.output_dir}/*/frame_*.npy")
+        )  # TODO: update the path to your prediction
+
+        # More than 643 predicted frames is taken to mean that the full
+        # training set (rather than the subset above) was predicted.
+        if len(pred_pathes) > 643:
+            depth_pathes = glob.glob("data/sintel/training/depth/*/*.dpt")
+        else:
+            depth_pathes = []
+            for seq in _SINTEL_SUBSET:
+                depth_pathes += glob.glob(f"data/sintel/training/depth/{seq}/*.dpt")
+        depth_pathes = sorted(depth_pathes)
+
+        grouped_pred_depth = group_by_directory(pred_pathes)
+        grouped_gt_depth = group_by_directory(depth_pathes)
+        sequence_pairs = [
+            (pd_pathes, grouped_gt_depth[key.replace("_pred_depth", "")])
+            for key, pd_pathes in grouped_pred_depth.items()
+        ]
+        _evaluate_sequences(
+            args, sequence_pairs, _read_sintel_depth, max_depth=70, post_clip_max=70
+        )
+    elif args.eval_dataset == "bonn":
+        depth_pathes = []
+        for seq in _BONN_SEQS:
+            depth_pathes += glob.glob(
+                f"data/bonn/rgbd_bonn_dataset/rgbd_bonn_{seq}/depth_110/*.png"
+            )
+        depth_pathes = sorted(depth_pathes)
+        pred_pathes = sorted(
+            glob.glob(f"{args.output_dir}/*/frame*.npy")
+        )  # TODO: update the path to your prediction
+
+        grouped_pred_depth = group_by_directory(pred_pathes)
+        grouped_gt_depth = group_by_directory(depth_pathes, idx=-2)
+        # key[10:] drops the first ten characters of the ground-truth key.
+        # NOTE(review): presumably the "rgbd_bonn_" folder prefix - confirm
+        # against group_by_directory.
+        sequence_pairs = [
+            (grouped_pred_depth[key[10:]], gt_pathes)
+            for key, gt_pathes in grouped_gt_depth.items()
+        ]
+        _evaluate_sequences(
+            args,
+            sequence_pairs,
+            lambda filename: _read_png_depth(filename, 5000.0),
+            max_depth=70,
+        )
+    elif args.eval_dataset == "kitti":
+        depth_pathes = sorted(
+            glob.glob(
+                "data/kitti/depth_selection/val_selection_cropped/groundtruth_depth_gathered/*/*.png"
+            )
+        )
+        pred_pathes = sorted(
+            glob.glob(f"{args.output_dir}/*/frame_*.npy")
+        )  # TODO: update the path to your prediction
+
+        grouped_pred_depth = group_by_directory(pred_pathes)
+        grouped_gt_depth = group_by_directory(depth_pathes)
+        sequence_pairs = [
+            (pd_pathes, grouped_gt_depth[key])
+            for key, pd_pathes in grouped_pred_depth.items()
+        ]
+        _evaluate_sequences(
+            args,
+            sequence_pairs,
+            lambda filename: _read_png_depth(filename, 256.0),
+            max_depth=None,
+        )
+    else:
+        # Previously this case silently did nothing.
+        print(
+            f"No video depth evaluation is implemented for dataset '{args.eval_dataset}'."
+        )
+
+
+if __name__ == "__main__":
+    # Parse the command line and evaluate the selected dataset.
+    cli_parser = get_args_parser()
+    args = cli_parser.parse_args()
+    main(args)
diff --git a/extern/CUT3R/eval/video_depth/launch.py b/extern/CUT3R/eval/video_depth/launch.py
new file mode 100644
index 0000000000000000000000000000000000000000..93a3294c5cd976be5ae6d27d7bc68d79fa33f0d7
--- /dev/null
+++ b/extern/CUT3R/eval/video_depth/launch.py
@@ -0,0 +1,331 @@
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
+import math
+import cv2
+import numpy as np
+import torch
+import argparse
+
+from copy import deepcopy
+from eval.video_depth.metadata import dataset_metadata
+from eval.video_depth.utils import save_depth_maps
+from accelerate import PartialState
+from add_ckpt_path import add_path_to_dust3r
+import time
+from tqdm import tqdm
+
+
+def _str2bool(value):
+    """Parse a command-line boolean such as ``true``/``false``/``1``/``0``.
+
+    ``type=bool`` cannot be used for this: ``bool("False")`` is ``True``
+    because every non-empty string is truthy.
+    """
+    if isinstance(value, bool):
+        return value
+    text = str(value).strip().lower()
+    if text in ("true", "t", "yes", "y", "1"):
+        return True
+    if text in ("false", "f", "no", "n", "0", ""):
+        return False
+    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")
+
+
+def get_args_parser():
+    """Build the command-line parser of the video-depth launch script."""
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--weights",
+        type=str,
+        help="path to the model weights",
+        default="",
+    )
+
+    parser.add_argument("--device", type=str, default="cuda", help="pytorch device")
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="",
+        help="value for outdir",
+    )
+    # Still takes an explicit value (``--no_crop False``), but now parses it
+    # correctly instead of treating every non-empty string as True.
+    parser.add_argument(
+        "--no_crop", type=_str2bool, default=True, help="whether to crop input data"
+    )
+
+    parser.add_argument(
+        "--eval_dataset",
+        type=str,
+        default="sintel",
+        choices=list(dataset_metadata.keys()),
+    )
+    parser.add_argument("--size", type=int, default=224)
+
+    parser.add_argument(
+        "--pose_eval_stride", default=1, type=int, help="stride for pose evaluation"
+    )
+    parser.add_argument(
+        "--full_seq",
+        action="store_true",
+        default=False,
+        help="use full sequence for pose evaluation",
+    )
+    parser.add_argument(
+        "--seq_list",
+        nargs="+",
+        default=None,
+        help="list of sequences for pose evaluation",
+    )
+    return parser
+
+
+def eval_pose_estimation(args, model, save_dir=None):
+    """Look up the image and mask paths of ``args.eval_dataset`` and run ``eval_pose_estimation_dist``.
+
+    Returns the three values produced by ``eval_pose_estimation_dist``.
+    """
+    dataset_info = dataset_metadata.get(args.eval_dataset)
+    images_root = dataset_info["img_path"]
+    masks_root = dataset_info["mask_path"]
+
+    ate, rpe_trans, rpe_rot = eval_pose_estimation_dist(
+        args, model, save_dir=save_dir, img_path=images_root, mask_path=masks_root
+    )
+    return ate, rpe_trans, rpe_rot
+
+
+def eval_pose_estimation_dist(args, model, img_path, save_dir=None, mask_path=None):
+    """Run the model on every selected sequence and save its depth maps.
+
+    The sequences are split between the processes of the current
+    ``accelerate`` state. For each sequence the frames in its directory are
+    loaded (every ``args.pose_eval_stride``-th file), passed through
+    ``inference`` and the resulting depth / confidence maps are written to
+    ``<save_dir>/<seq>`` with ``save_depth_maps``.
+
+    args: parsed command-line arguments; ``args.full_seq`` is set to True
+        when the dataset metadata requests full-sequence evaluation.
+    model: model to evaluate; it is moved to the process device.
+    img_path: root directory of the dataset images.
+    save_dir: output directory, ``args.output_dir`` when None.
+    mask_path: accepted for interface compatibility; not used here.
+
+    Out-of-memory errors and the two known trajectory-evaluation errors are
+    logged to ``<save_dir>/_error_log_<process_index>.txt`` and the sequence
+    is skipped; any other exception is re-raised.
+
+    Always returns ``(None, None, None)``: no pose metrics are computed.
+
+    # NOTE(review): ``prepare_input`` and ``prepare_output`` are looked up as
+    # module globals; only ``prepare_input`` is visible here, under the
+    # ``__main__`` guard, so this function presumably works only when the
+    # file is run as a script - confirm.
+    """
+    from dust3r.inference import inference
+
+    metadata = dataset_metadata.get(args.eval_dataset)
+
+    seq_list = args.seq_list
+    if seq_list is None:
+        if metadata.get("full_seq", False):
+            args.full_seq = True
+        else:
+            seq_list = metadata.get("seq_list", [])
+        if args.full_seq:
+            # every sub-directory of img_path is a sequence
+            seq_list = [
+                seq
+                for seq in os.listdir(img_path)
+                if os.path.isdir(os.path.join(img_path, seq))
+            ]
+        seq_list = sorted(seq_list)
+
+    if save_dir is None:
+        save_dir = args.output_dir
+    if save_dir:
+        # The error log below lives directly in save_dir; create it up front
+        # so that logging a failure cannot itself fail on a missing directory.
+        os.makedirs(save_dir, exist_ok=True)
+
+    distributed_state = PartialState()
+    model.to(distributed_state.device)
+    device = distributed_state.device
+
+    with distributed_state.split_between_processes(seq_list) as seqs:
+        load_img_size = args.size
+        # NOTE(review): only --size 512 is accepted - confirm why.
+        assert load_img_size == 512
+        error_log_path = f"{save_dir}/_error_log_{distributed_state.process_index}.txt"  # Unique log file per process
+        for seq in tqdm(seqs):
+            try:
+                dir_path = metadata["dir_path_func"](img_path, seq)
+
+                # Handle skip_condition
+                skip_condition = metadata.get("skip_condition", None)
+                if skip_condition is not None and skip_condition(save_dir, seq):
+                    continue
+
+                filelist = sorted(
+                    os.path.join(dir_path, name) for name in os.listdir(dir_path)
+                )
+                filelist = filelist[:: args.pose_eval_stride]
+
+                views = prepare_input(
+                    filelist,
+                    [True for _ in filelist],
+                    size=load_img_size,
+                    crop=not args.no_crop,
+                )
+                outputs, _ = inference(views, model, device)
+
+                # only the self-view point maps and confidences are needed
+                (
+                    _colors,
+                    pts3ds_self,
+                    _pts3ds_other,
+                    conf_self,
+                    _conf_other,
+                    _cam_dict,
+                    _pr_poses,
+                ) = prepare_output(outputs)
+
+                os.makedirs(f"{save_dir}/{seq}", exist_ok=True)
+                save_depth_maps(pts3ds_self, f"{save_dir}/{seq}", conf_self=conf_self)
+
+            except Exception as e:
+                message = str(e)
+                if "out of memory" in message:
+                    # Handle OOM
+                    torch.cuda.empty_cache()  # Clear the CUDA memory
+                    with open(error_log_path, "a") as f:
+                        f.write(
+                            f"OOM error in sequence {seq}, skipping this sequence.\n"
+                        )
+                    print(f"OOM error in sequence {seq}, skipping...")
+                elif (
+                    "Degenerate covariance rank" in message
+                    or "Eigenvalues did not converge" in message
+                ):
+                    # Handle Degenerate covariance rank exception and Eigenvalues did not converge exception
+                    with open(error_log_path, "a") as f:
+                        f.write(f"Exception in sequence {seq}: {str(e)}\n")
+                    print(f"Traj evaluation error in sequence {seq}, skipping.")
+                else:
+                    raise  # Rethrow if it's not an expected exception
+    return None, None, None
+
+
+if __name__ == "__main__":
+    args = get_args_parser()
+    args = args.parse_args()
+    add_path_to_dust3r(args.weights)
+    from dust3r.utils.image import load_images_for_eval as load_images
+    from dust3r.post_process import estimate_focal_knowing_depth
+    from dust3r.model import ARCroco3DStereo
+    from dust3r.utils.camera import pose_encoding_to_camera
+
+    if args.eval_dataset == "sintel":
+        args.full_seq = True
+    else:
+        args.full_seq = False
+    args.no_crop = True
+
+    def prepare_input(
+        img_paths,
+        img_mask,
+        size,
+        raymaps=None,
+        raymap_mask=None,
+        revisit=1,
+        update=True,
+        crop=True,
+    ):
+        """Build the list of per-view dicts fed to the model.
+
+        Images are read with load_images(img_paths, size=size, crop=crop). Without
+        raymaps/raymap_mask every view is an image paired with an all-NaN ray map of
+        shape (img.shape[0], 6, img.shape[-2], img.shape[-1]). Otherwise img_mask and
+        raymap_mask say, per view, whether the next image and/or the next ray map is
+        used; the missing modality is filled with NaNs. When revisit > 1 the whole
+        list is deep-copied revisit times with consecutive "idx"/"instance" values,
+        and update=False clears the "update" flag of every copy after the first pass.
+        """
+        def flag(value):
+            return torch.tensor(value).unsqueeze(0)
+
+        def identity_pose():
+            return torch.from_numpy(np.eye(4).astype(np.float32)).unsqueeze(0)
+
+        images = load_images(img_paths, size=size, crop=crop)
+        views = []
+        if raymaps is None and raymap_mask is None:
+            for i, image in enumerate(images):
+                rgb = image["img"]
+                nan_shape = (rgb.shape[0], 6, rgb.shape[-2], rgb.shape[-1])
+                views.append(
+                    {
+                        "img": rgb,
+                        "ray_map": torch.full(nan_shape, torch.nan),
+                        "true_shape": torch.from_numpy(image["true_shape"]),
+                        "idx": i,
+                        "instance": str(i),
+                        "camera_pose": identity_pose(),
+                        "img_mask": flag(True),
+                        "ray_mask": flag(False),
+                        "update": flag(True),
+                        "reset": flag(False),
+                    }
+                )
+        else:
+            num_views = len(images) + len(raymaps)
+            assert len(img_mask) == len(raymap_mask) == num_views
+            assert sum(img_mask) == len(images) and sum(raymap_mask) == len(raymaps)
+
+            j = k = 0  # cursors into images and raymaps
+            for i in range(num_views):
+                has_img, has_ray = img_mask[i], raymap_mask[i]
+                rgb = images[j]["img"] if has_img else torch.full_like(images[0]["img"], torch.nan)
+                ray_map = raymaps[k] if has_ray else torch.full_like(raymaps[0], torch.nan)
+                if has_img:
+                    true_shape = torch.from_numpy(images[j]["true_shape"])
+                else:
+                    # No image for this view: take the shape from the ray map's dims.
+                    true_shape = torch.from_numpy(np.int32([raymaps[k].shape[1:-1][::-1]]))
+                views.append(
+                    {
+                        "img": rgb,
+                        "ray_map": ray_map,
+                        "true_shape": true_shape,
+                        "idx": i,
+                        "instance": str(i),
+                        "camera_pose": identity_pose(),
+                        "img_mask": flag(has_img),
+                        "ray_mask": flag(has_ray),
+                        "update": flag(has_img),
+                        "reset": flag(False),
+                    }
+                )
+                if has_img:
+                    j += 1
+                if has_ray:
+                    k += 1
+            assert j == len(images) and k == len(raymaps)
+
+        if revisit > 1:
+            # repeat input for 'revisit' times
+            total = len(views)
+            repeated = []
+            for r in range(revisit):
+                for i, view in enumerate(views):
+                    clone = deepcopy(view)
+                    clone["idx"] = r * total + i
+                    clone["instance"] = str(r * total + i)
+                    if r > 0 and not update:
+                        clone["update"] = flag(False)
+                    repeated.append(clone)
+            return repeated
+        return views
+
+    def prepare_output(outputs, revisit=1):
+        """Unpack model outputs into CPU tensors plus estimated camera intrinsics.
+
+        Only the last len(outputs["pred"]) // revisit entries are used, and
+        outputs["pred"] / outputs["views"] are trimmed in place to those entries.
+        The focal length comes from estimate_focal_knowing_depth ("weiszfeld" mode).
+        Returns (colors, pts3ds_self, pts3ds_other, conf_self, conf_other, cam_dict,
+        pr_poses); cam_dict holds "focal" and "pp" as numpy arrays.
+        """
+        keep = len(outputs["pred"]) // revisit
+        outputs["pred"] = outputs["pred"][-keep:]
+        outputs["views"] = outputs["views"][-keep:]
+        preds = outputs["pred"]
+
+        pts3ds_self = torch.cat([p["pts3d_in_self_view"].cpu() for p in preds], 0)
+        pts3ds_other = [p["pts3d_in_other_view"].cpu() for p in preds]
+        conf_self = [p["conf_self"].cpu() for p in preds]
+        conf_other = [p["conf"].cpu() for p in preds]
+        pr_poses = torch.cat(
+            [pose_encoding_to_camera(p["camera_pose"].clone()).cpu() for p in preds], 0
+        )
+
+        # Principal point fixed at (W // 2, H // 2) for each of the B frames.
+        B, H, W, _ = pts3ds_self.shape
+        centre = torch.tensor([W // 2, H // 2], device=pts3ds_self.device).float()
+        pp = centre.repeat(B, 1).reshape(B, 2)
+        focal = estimate_focal_knowing_depth(pts3ds_self, pp, focal_mode="weiszfeld")
+
+        # First "rgb" entry of every prediction, rescaled as 0.5 * (x + 1).
+        colors = [0.5 * (p["rgb"][0] + 1.0) for p in preds]
+        cam_dict = {"focal": focal.cpu().numpy(), "pp": pp.cpu().numpy()}
+
+        return (
+            colors,
+            pts3ds_self,
+            pts3ds_other,
+            conf_self,
+            conf_other,
+            cam_dict,
+            pr_poses,
+        )
+
+    model = ARCroco3DStereo.from_pretrained(args.weights)
+    eval_pose_estimation(args, model, save_dir=args.output_dir)
diff --git a/extern/CUT3R/eval/video_depth/metadata.py b/extern/CUT3R/eval/video_depth/metadata.py
new file mode 100644
index 0000000000000000000000000000000000000000..48e30d9751af2aafd9c84b2dc059a9ee160a4475
--- /dev/null
+++ b/extern/CUT3R/eval/video_depth/metadata.py
@@ -0,0 +1,177 @@
+import os
+import glob
+from tqdm import tqdm
+
+# Per-dataset settings keyed by dataset name; the process_* helpers are defined below, the lambdas look them up at call time.
+dataset_metadata = {
+    "davis": {
+        "img_path": "data/davis/DAVIS/JPEGImages/480p",
+        "mask_path": "data/davis/DAVIS/masked_images/480p",
+        "dir_path_func": lambda img_path, seq: os.path.join(img_path, seq),
+        "gt_traj_func": lambda img_path, anno_path, seq: None,
+        "traj_format": None,
+        "seq_list": None,
+        "full_seq": True,
+        "mask_path_seq_func": lambda mask_path, seq: os.path.join(mask_path, seq),
+        "skip_condition": None,
+        "process_func": None,  # Not used in mono depth estimation
+    },
+    "kitti": {
+        "img_path": "data/kitti/depth_selection/val_selection_cropped/image_gathered",  # Default path
+        "mask_path": None,
+        "dir_path_func": lambda img_path, seq: os.path.join(img_path, seq),
+        "gt_traj_func": lambda img_path, anno_path, seq: None,
+        "traj_format": None,
+        "seq_list": None,
+        "full_seq": True,
+        "mask_path_seq_func": lambda mask_path, seq: None,
+        "skip_condition": None,
+        "process_func": lambda args, img_path: process_kitti(args, img_path),
+    },
+    "bonn": {
+        "img_path": "data/bonn/rgbd_bonn_dataset",
+        "mask_path": None,
+        "dir_path_func": lambda img_path, seq: os.path.join(
+            img_path, f"rgbd_bonn_{seq}", "rgb_110"
+        ),
+        "gt_traj_func": lambda img_path, anno_path, seq: os.path.join(
+            img_path, f"rgbd_bonn_{seq}", "groundtruth_110.txt"
+        ),
+        "traj_format": "tum",
+        "seq_list": ["balloon2", "crowd2", "crowd3", "person_tracking2", "synchronous"],
+        "full_seq": False,
+        "mask_path_seq_func": lambda mask_path, seq: None,
+        "skip_condition": None,
+        "process_func": lambda args, img_path: process_bonn(args, img_path),
+    },
+    "nyu": {
+        "img_path": "data/nyu-v2/val/nyu_images",
+        "mask_path": None,
+        "process_func": lambda args, img_path: process_nyu(args, img_path),
+    },
+    "scannet": {
+        "img_path": "data/scannetv2",
+        "mask_path": None,
+        "dir_path_func": lambda img_path, seq: os.path.join(img_path, seq, "color_90"),
+        "gt_traj_func": lambda img_path, anno_path, seq: os.path.join(
+            img_path, seq, "pose_90.txt"
+        ),
+        "traj_format": "replica",
+        "seq_list": None,
+        "full_seq": True,
+        "mask_path_seq_func": lambda mask_path, seq: None,
+        "skip_condition": None,  # lambda save_dir, seq: os.path.exists(os.path.join(save_dir, seq)),
+        "process_func": lambda args, img_path: process_scannet(args, img_path),
+    },
+    "tum": {
+        "img_path": "data/tum",
+        "mask_path": None,
+        "dir_path_func": lambda img_path, seq: os.path.join(img_path, seq, "rgb_90"),
+        "gt_traj_func": lambda img_path, anno_path, seq: os.path.join(
+            img_path, seq, "groundtruth_90.txt"
+        ),
+        "traj_format": "tum",
+        "seq_list": None,
+        "full_seq": True,
+        "mask_path_seq_func": lambda mask_path, seq: None,
+        "skip_condition": None,
+        "process_func": None,
+    },
+    "sintel": {
+        "img_path": "data/sintel/training/final",
+        "anno_path": "data/sintel/training/camdata_left",
+        "mask_path": None,
+        "dir_path_func": lambda img_path, seq: os.path.join(img_path, seq),
+        "gt_traj_func": lambda img_path, anno_path, seq: os.path.join(anno_path, seq),
+        "traj_format": None,
+        "seq_list": [
+            "alley_2",
+            "ambush_4",
+            "ambush_5",
+            "ambush_6",
+            "cave_2",
+            "cave_4",
+            "market_2",
+            "market_5",
+            "market_6",
+            "shaman_3",
+            "sleeping_1",
+            "sleeping_2",
+            "temple_2",
+            "temple_3",
+        ],
+        "full_seq": False,
+        "mask_path_seq_func": lambda mask_path, seq: None,
+        "skip_condition": None,
+        "process_func": lambda args, img_path: process_sintel(args, img_path),
+    },
+}
+
+
+# Define processing functions for each dataset
+def process_kitti(args, img_path):
+    """Yield (sorted PNG paths, output dir) for each entry directly under img_path."""
+    for seq_dir in tqdm(sorted(glob.glob(f"{img_path}/*"))):
+        frames = sorted(glob.glob(f"{seq_dir}/*.png"))
+        yield frames, f"{args.output_dir}/{os.path.basename(seq_dir)}"
+
+
+def process_bonn(args, img_path):
+    """Yield (sorted PNG paths, output dir) per Bonn sequence.
+
+    With args.full_seq every sub-directory of img_path is used (frames from rgb/);
+    otherwise args.seq_list, or a built-in default list, is read from rgb_110/.
+    """
+    if args.full_seq:
+        for seq_dir in tqdm(sorted(glob.glob(f"{img_path}/*/"))):
+            seq_name = os.path.basename(os.path.dirname(seq_dir))
+            yield sorted(glob.glob(f"{seq_dir}/rgb/*.png")), f"{args.output_dir}/{seq_name}"
+        return
+    default = ["balloon2", "crowd2", "crowd3", "person_tracking2", "synchronous"]
+    sequences = default if args.seq_list is None else args.seq_list
+    for seq in tqdm(sequences):
+        frames = sorted(glob.glob(f"{img_path}/rgbd_bonn_{seq}/rgb_110/*.png"))
+        yield frames, f"{args.output_dir}/{seq}"
+
+
+def process_nyu(args, img_path):
+    """Yield a single (sorted PNG paths found in img_path, args.output_dir as str) pair."""
+    frames = sorted(glob.glob(f"{img_path}/*.png"))
+    yield frames, f"{args.output_dir}"
+
+
+def process_scannet(args, img_path):
+    """Yield (sorted JPEG paths from <entry>/color_90, output dir) per entry of img_path."""
+    for seq_dir in tqdm(sorted(glob.glob(f"{img_path}/*"))):
+        frames = sorted(glob.glob(f"{seq_dir}/color_90/*.jpg"))
+        out_dir = f"{args.output_dir}/{os.path.basename(seq_dir)}"
+        yield frames, out_dir
+
+
+def process_sintel(args, img_path):
+    """Yield (sorted PNG paths, output dir) per Sintel sequence: every sub-directory of
+    img_path when args.full_seq is set, otherwise only the fixed sequences listed below."""
+    if args.full_seq:
+        for seq_dir in tqdm(sorted(glob.glob(f"{img_path}/*/"))):
+            seq_name = os.path.basename(os.path.dirname(seq_dir))
+            yield sorted(glob.glob(f"{seq_dir}/*.png")), f"{args.output_dir}/{seq_name}"
+        return
+    sequences = [
+        "alley_2",
+        "ambush_4",
+        "ambush_5",
+        "ambush_6",
+        "cave_2",
+        "cave_4",
+        "market_2",
+        "market_5",
+        "market_6",
+        "shaman_3",
+        "sleeping_1",
+        "sleeping_2",
+        "temple_2",
+        "temple_3",
+    ]
+    for seq in tqdm(sequences):
+        frames = sorted(glob.glob(f"{img_path}/{seq}/*.png"))
+        yield frames, f"{args.output_dir}/{seq}"
diff --git a/extern/CUT3R/eval/video_depth/run.sh b/extern/CUT3R/eval/video_depth/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2cd8a68cc58361a255f4716bea14e7293606b859
--- /dev/null
+++ b/extern/CUT3R/eval/video_depth/run.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# For each dataset: run launch.py under accelerate, then eval_depth.py on the same output_dir.
+set -e
+# Paths are relative to workdir; the weights are read from src/<ckpt_name>.pth.
+workdir='.'
+model_name='ours'
+ckpt_name='cut3r_512_dpt_4_64'
+model_weights="${workdir}/src/${ckpt_name}.pth"
+datasets=('sintel' 'bonn' 'kitti')
+# Each dataset uses eval_results/video_depth/<dataset>_<model_name> as its output_dir.
+for data in "${datasets[@]}"; do
+    output_dir="${workdir}/eval_results/video_depth/${data}_${model_name}"
+    echo "$output_dir"
+    accelerate launch --num_processes 4  eval/video_depth/launch.py \
+        --weights "$model_weights" \
+        --output_dir "$output_dir" \
+        --eval_dataset "$data" \
+        --size 512
+    python eval/video_depth/eval_depth.py \
+    --output_dir "$output_dir" \
+    --eval_dataset "$data" \
+    --align "scale"
+done
diff --git a/extern/CUT3R/eval/video_depth/tools.py b/extern/CUT3R/eval/video_depth/tools.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6786fa6f25def0110ce22dbb7d44a7a08c952c8
--- /dev/null
+++ b/extern/CUT3R/eval/video_depth/tools.py
@@ -0,0 +1,399 @@
+import torch
+import numpy as np
+import cv2
+import glob
+import argparse
+from pathlib import Path
+from tqdm import tqdm
+from copy import deepcopy
+from scipy.optimize import minimize
+import os
+from collections import defaultdict
+
+
+def group_by_directory(pathes, idx=-1):
+    """
+    Groups file paths by one "/"-separated component of their containing directory.
+
+    Parameters:
+    - pathes (list): List of file paths.
+    - idx (int): Index into os.path.dirname(path).split("/"); -1 (default) is the immediate parent directory.
+    Returns:
+    - dict: A defaultdict(list) mapping that directory component to the list of paths sharing it.
+    """
+    grouped_pathes = defaultdict(list)
+
+    for path in pathes:
+        # Pick the grouping component from the directory part of the path
+        dir_name = os.path.dirname(path).split("/")[idx]
+        grouped_pathes[dir_name].append(path)
+
+    return grouped_pathes
+
+
+def depth2disparity(depth, return_mask=False):
+    """Return 1/depth where depth > 0, else 0 (plus that mask if return_mask); tensor or ndarray only."""
+    if isinstance(depth, torch.Tensor):
+        disparity = torch.zeros_like(depth)
+    elif isinstance(depth, np.ndarray):
+        disparity = np.zeros_like(depth)
+    else:  # previously fell through and failed later with an UnboundLocalError
+        raise TypeError(f"depth must be a torch.Tensor or np.ndarray, got {type(depth).__name__}")
+    positive = depth > 0
+    disparity[positive] = 1.0 / depth[positive]
+    return (disparity, positive) if return_mask else disparity
+
+
+def absolute_error_loss(params, predicted_depth, ground_truth_depth):
+    """Return sum(|s * predicted_depth + t - ground_truth_depth|) for params = (s, t).
+
+    Inputs are numpy arrays (np.abs / np.sum are applied to them).
+    """
+    scale, shift = params
+    return np.sum(np.abs(scale * predicted_depth + shift - ground_truth_depth))
+
+
+def absolute_value_scaling(predicted_depth, ground_truth_depth, s=1, t=0):
+    """Fit scale s and shift t minimising sum(|s * pred + t - gt|).
+
+    Both inputs are moved to CPU, converted to numpy and flattened; the given (s, t)
+    is the starting point for scipy.optimize.minimize (default method). Returns the
+    fitted (s, t).
+    """
+    pred_flat = predicted_depth.cpu().numpy().reshape(-1)
+    gt_flat = ground_truth_depth.cpu().numpy().reshape(-1)
+
+    # absolute_error_loss receives the parameter vector followed by the two arrays.
+    fit = minimize(absolute_error_loss, [s, t], args=(pred_flat, gt_flat))
+    s, t = fit.x
+    return s, t
+
+
+def absolute_value_scaling2(
+    predicted_depth,
+    ground_truth_depth,
+    s_init=1.0,
+    t_init=0.0,
+    lr=1e-4,
+    max_iters=1000,
+    tol=1e-6,
+):
+    """Fit scale s and shift t minimising sum(|s * pred + t - gt|) with Adam.
+
+    Args:
+        predicted_depth (torch.Tensor): Prediction; s and t take its device and dtype.
+        ground_truth_depth (torch.Tensor): Target of the alignment.
+        s_init (float): Starting value of the scale.
+        t_init (float): Starting value of the shift.
+        lr (float): Learning rate passed to torch.optim.Adam.
+        max_iters (int): Upper bound on the number of optimiser steps.
+        tol (float): Stop once the loss differs from the previous one by less than this.
+
+    Returns:
+        tuple: (s, t) as Python floats.
+    """
+
+    def make_param(value):
+        # One-element leaf tensor matching the prediction's device and dtype.
+        return torch.tensor(
+            [value],
+            requires_grad=True,
+            device=predicted_depth.device,
+            dtype=predicted_depth.dtype,
+        )
+
+    s = make_param(s_init)
+    t = make_param(t_init)
+    optimizer = torch.optim.Adam([s, t], lr=lr)
+
+    last_loss = None
+    for _ in range(max_iters):
+        optimizer.zero_grad()
+        # Sum of absolute residuals after applying the current scale and shift.
+        loss = torch.sum(torch.abs(s * predicted_depth + t - ground_truth_depth))
+        loss.backward()
+        optimizer.step()
+
+        # The step above is taken before the convergence test; the first iteration
+        # never stops early because there is no previous loss yet.
+        if last_loss is not None and torch.abs(last_loss - loss) < tol:
+            break
+        last_loss = loss.item()
+
+    return s.detach().item(), t.detach().item()
+
+
+def depth_evaluation(
+    predicted_depth_original,
+    ground_truth_depth_original,
+    max_depth=80,
+    custom_mask=None,
+    post_clip_min=None,
+    post_clip_max=None,
+    pre_clip_min=None,
+    pre_clip_max=None,
+    align_with_lstsq=False,
+    align_with_lad=False,
+    align_with_lad2=False,
+    metric_scale=False,
+    lr=1e-4,
+    max_iters=1000,
+    use_gpu=False,
+    align_with_scale=False,
+    disp_input=False,
+):
+    """
+    Align a predicted depth map to the ground truth and compute depth metrics.
+
+    Args:
+        predicted_depth_original, ground_truth_depth_original (np.ndarray or torch.Tensor): depth maps.
+        max_depth (float or None): pixels are valid when 0 < gt < max_depth (gt > 0 if None).
+        custom_mask: optional boolean mask (gt shape) restricting the pixels that are scored.
+        metric_scale / align_with_lstsq / align_with_lad / align_with_lad2 / align_with_scale:
+            alignment mode, tested in this order; if none is set, median scaling is used.
+
+    Returns:
+        tuple: (metrics dict, relative-error map, aligned prediction map, masked gt map).
+    """
+    if isinstance(predicted_depth_original, np.ndarray):
+        predicted_depth_original = torch.from_numpy(predicted_depth_original)
+    if isinstance(ground_truth_depth_original, np.ndarray):
+        ground_truth_depth_original = torch.from_numpy(ground_truth_depth_original)
+    if custom_mask is not None and isinstance(custom_mask, np.ndarray):
+        custom_mask = torch.from_numpy(custom_mask)
+
+    # if the dimension is 3, flatten to 2d along the batch dimension
+    if predicted_depth_original.dim() == 3:
+        _, h, w = predicted_depth_original.shape
+        predicted_depth_original = predicted_depth_original.view(-1, w)
+        ground_truth_depth_original = ground_truth_depth_original.view(-1, w)
+        if custom_mask is not None:
+            custom_mask = custom_mask.view(-1, w)
+
+    # put to device
+    if use_gpu:
+        predicted_depth_original = predicted_depth_original.cuda()
+        ground_truth_depth_original = ground_truth_depth_original.cuda()
+
+    # Filter out depths greater than max_depth
+    if max_depth is not None:
+        mask = (ground_truth_depth_original > 0) & (
+            ground_truth_depth_original < max_depth
+        )
+    else:
+        mask = ground_truth_depth_original > 0
+    predicted_depth = predicted_depth_original[mask]
+    ground_truth_depth = ground_truth_depth_original[mask]
+
+    # Clip the depth values
+    if pre_clip_min is not None:
+        predicted_depth = torch.clamp(predicted_depth, min=pre_clip_min)
+    if pre_clip_max is not None:
+        predicted_depth = torch.clamp(predicted_depth, max=pre_clip_max)
+
+    if disp_input:  # align the pred to gt in the disparity space
+        real_gt = ground_truth_depth.clone()
+        ground_truth_depth = 1 / (ground_truth_depth + 1e-8)
+
+    # various alignment methods
+    if metric_scale:
+        predicted_depth = predicted_depth
+    elif align_with_lstsq:
+        # Convert to numpy for lstsq
+        predicted_depth_np = predicted_depth.cpu().numpy().reshape(-1, 1)
+        ground_truth_depth_np = ground_truth_depth.cpu().numpy().reshape(-1, 1)
+
+        # Add a column of ones for the shift term
+        A = np.hstack([predicted_depth_np, np.ones_like(predicted_depth_np)])
+
+        # Solve for scale (s) and shift (t) using least squares
+        result = np.linalg.lstsq(A, ground_truth_depth_np, rcond=None)
+        s, t = result[0][0], result[0][1]
+
+        # convert to torch tensor
+        s = torch.tensor(s, device=predicted_depth_original.device)
+        t = torch.tensor(t, device=predicted_depth_original.device)
+
+        # Apply scale and shift
+        predicted_depth = s * predicted_depth + t
+    elif align_with_lad:
+        s, t = absolute_value_scaling(
+            predicted_depth,
+            ground_truth_depth,
+            s=torch.median(ground_truth_depth) / torch.median(predicted_depth),
+        )
+        predicted_depth = s * predicted_depth + t
+    elif align_with_lad2:
+        s_init = (
+            torch.median(ground_truth_depth) / torch.median(predicted_depth)
+        ).item()
+        s, t = absolute_value_scaling2(
+            predicted_depth,
+            ground_truth_depth,
+            s_init=s_init,
+            lr=lr,
+            max_iters=max_iters,
+        )
+        predicted_depth = s * predicted_depth + t
+    elif align_with_scale:
+        # Initial scale factor 's': ratio of the NaN-ignoring means of gt and prediction
+        dot_pred_gt = torch.nanmean(ground_truth_depth)
+        dot_pred_pred = torch.nanmean(predicted_depth)
+        s = dot_pred_gt / dot_pred_pred
+
+        # Iterative reweighted least squares using the Weiszfeld method
+        for _ in range(10):
+            # Compute residuals between scaled predictions and ground truth
+            residuals = s * predicted_depth - ground_truth_depth
+            abs_residuals = (
+                residuals.abs() + 1e-8
+            )  # Add small constant to avoid division by zero
+
+            # Compute weights inversely proportional to the residuals
+            weights = 1.0 / abs_residuals
+
+            # Update 's' using weighted sums
+            weighted_dot_pred_gt = torch.sum(
+                weights * predicted_depth * ground_truth_depth
+            )
+            weighted_dot_pred_pred = torch.sum(weights * predicted_depth**2)
+            s = weighted_dot_pred_gt / weighted_dot_pred_pred
+
+        # Optionally clip 's' to prevent extreme scaling
+        s = s.clamp(min=1e-3)
+
+        # Detach 's' if you want to stop gradients from flowing through it
+        s = s.detach()
+
+        # Apply the scale factor to the predicted depth
+        predicted_depth = s * predicted_depth
+
+    else:
+        # Align the predicted depth with the ground truth using median scaling
+        scale_factor = torch.median(ground_truth_depth) / torch.median(predicted_depth)
+        predicted_depth *= scale_factor
+
+    if disp_input:
+        # convert back to depth
+        ground_truth_depth = real_gt
+        predicted_depth = depth2disparity(predicted_depth)
+
+    # Clip the predicted depth values
+    if post_clip_min is not None:
+        predicted_depth = torch.clamp(predicted_depth, min=post_clip_min)
+    if post_clip_max is not None:
+        predicted_depth = torch.clamp(predicted_depth, max=post_clip_max)
+
+    if custom_mask is not None:
+        assert custom_mask.shape == ground_truth_depth_original.shape
+        mask_within_mask = custom_mask.to(mask.device)[mask]  # index on mask's device (CUDA when use_gpu)
+        predicted_depth = predicted_depth[mask_within_mask]
+        ground_truth_depth = ground_truth_depth[mask_within_mask]
+
+    # Calculate the metrics
+    abs_rel = torch.mean(
+        torch.abs(predicted_depth - ground_truth_depth) / ground_truth_depth
+    ).item()
+    sq_rel = torch.mean(
+        ((predicted_depth - ground_truth_depth) ** 2) / ground_truth_depth
+    ).item()
+
+    # Root mean squared error
+    rmse = torch.sqrt(torch.mean((predicted_depth - ground_truth_depth) ** 2)).item()
+
+    # Clip the depth values to avoid log(0)
+    predicted_depth = torch.clamp(predicted_depth, min=1e-5)
+    log_rmse = torch.sqrt(
+        torch.mean((torch.log(predicted_depth) - torch.log(ground_truth_depth)) ** 2)
+    ).item()
+
+    # Calculate the accuracy thresholds
+    max_ratio = torch.maximum(
+        predicted_depth / ground_truth_depth, ground_truth_depth / predicted_depth
+    )
+    threshold_0 = torch.mean((max_ratio < 1.0).float()).item()
+    threshold_1 = torch.mean((max_ratio < 1.25).float()).item()
+    threshold_2 = torch.mean((max_ratio < 1.25**2).float()).item()
+    threshold_3 = torch.mean((max_ratio < 1.25**3).float()).item()
+
+    # Compute the depth error parity map
+    if metric_scale:
+        predicted_depth_original = predicted_depth_original
+        if disp_input:
+            predicted_depth_original = depth2disparity(predicted_depth_original)
+        depth_error_parity_map = (
+            torch.abs(predicted_depth_original - ground_truth_depth_original)
+            / ground_truth_depth_original
+        )
+    elif align_with_lstsq or align_with_lad or align_with_lad2:
+        predicted_depth_original = predicted_depth_original * s + t
+        if disp_input:
+            predicted_depth_original = depth2disparity(predicted_depth_original)
+        depth_error_parity_map = (
+            torch.abs(predicted_depth_original - ground_truth_depth_original)
+            / ground_truth_depth_original
+        )
+    elif align_with_scale:
+        predicted_depth_original = predicted_depth_original * s
+        if disp_input:
+            predicted_depth_original = depth2disparity(predicted_depth_original)
+        depth_error_parity_map = (
+            torch.abs(predicted_depth_original - ground_truth_depth_original)
+            / ground_truth_depth_original
+        )
+    else:
+        predicted_depth_original = predicted_depth_original * scale_factor
+        if disp_input:
+            predicted_depth_original = depth2disparity(predicted_depth_original)
+        depth_error_parity_map = (
+            torch.abs(predicted_depth_original - ground_truth_depth_original)
+            / ground_truth_depth_original
+        )
+
+    # Reshape the depth_error_parity_map back to the original image size
+    depth_error_parity_map_full = torch.zeros_like(ground_truth_depth_original)
+    depth_error_parity_map_full = torch.where(
+        mask, depth_error_parity_map, depth_error_parity_map_full
+    )
+
+    predict_depth_map_full = predicted_depth_original
+    gt_depth_map_full = torch.zeros_like(ground_truth_depth_original)
+    gt_depth_map_full = torch.where(
+        mask, ground_truth_depth_original, gt_depth_map_full
+    )
+
+    num_valid_pixels = (
+        torch.sum(mask).item()
+        if custom_mask is None
+        else torch.sum(mask_within_mask).item()
+    )
+    if num_valid_pixels == 0:
+        (
+            abs_rel,
+            sq_rel,
+            rmse,
+            log_rmse,
+            threshold_0,
+            threshold_1,
+            threshold_2,
+            threshold_3,
+        ) = (0, 0, 0, 0, 0, 0, 0, 0)
+
+    results = {
+        "Abs Rel": abs_rel,
+        "Sq Rel": sq_rel,
+        "RMSE": rmse,
+        "Log RMSE": log_rmse,
+        "δ < 1.": threshold_0,
+        "δ < 1.25": threshold_1,
+        "δ < 1.25^2": threshold_2,
+        "δ < 1.25^3": threshold_3,
+        "valid_pixels": num_valid_pixels,
+    }
+
+    return (
+        results,
+        depth_error_parity_map_full,
+        predict_depth_map_full,
+        gt_depth_map_full,
+    )
diff --git a/extern/CUT3R/eval/video_depth/utils.py b/extern/CUT3R/eval/video_depth/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b34c421c2b2138ed62f8627f5f37c2eb6c63a771
--- /dev/null
+++ b/extern/CUT3R/eval/video_depth/utils.py
@@ -0,0 +1,236 @@
+from copy import deepcopy
+import cv2
+
+import numpy as np
+import torch
+import torch.nn as nn
+import roma
+from copy import deepcopy
+import tqdm
+import matplotlib as mpl
+import matplotlib.cm as cm
+import matplotlib.pyplot as plt
+from matplotlib.backends.backend_agg import FigureCanvasAgg
+from scipy.spatial.transform import Rotation
+from PIL import Image
+import imageio.v2 as iio
+from matplotlib.figure import Figure
+
+
+def save_focals(cam_dict, path):
+    """Write cam_dict["focal"] to the text file path (format %.6f) and return it."""
+    focal_values = cam_dict["focal"]
+    np.savetxt(path, focal_values, fmt="%.6f")
+    return focal_values
+
+
+def save_intrinsics(cam_dict, path):
+    """Build one 3x3 matrix per entry of cam_dict["focal"] (focal on both diagonal slots,
+    cam_dict["pp"] in the last column), save them flattened to 9 values per row, return (N, 3, 3)."""
+    intrinsics = np.tile(np.eye(3), (len(cam_dict["focal"]), 1, 1))
+    intrinsics[:, 0, 0] = intrinsics[:, 1, 1] = cam_dict["focal"]
+    intrinsics[:, :2, 2] = cam_dict["pp"]
+    np.savetxt(path, intrinsics.reshape(-1, 9), fmt="%.6f")
+    return intrinsics
+
+
+def save_conf_maps(conf, path):
+    for index, conf_map in enumerate(conf):  # one <path>/conf_<index>.npy per element
+        np.save(f"{path}/conf_{index}.npy", conf_map.detach().cpu().numpy())
+    return conf
+
+
+def save_rgb_imgs(colors, path):
+    """Write each image in colors to <path>/frame_<i:04d>.jpg and return colors.
+
+    Values are multiplied by 255 and cast to uint8; no channel reordering is performed.
+    """
+    for index, frame in enumerate(colors):
+        iio.imwrite(f"{path}/frame_{index:04d}.jpg", (frame.cpu().numpy() * 255).astype(np.uint8))
+    return colors
+
+
+def save_depth_maps(pts3ds_self, path, conf_self=None):
+    """Write per-frame depth visualisations (PNG) and raw depth arrays (.npy) to path.
+
+    Depth is the last channel of each entry of pts3ds_self. All frames share one
+    colour range (global min/max) and get a colour bar appended. When conf_self is
+    given, the log-confidence rendered with the "jet" colormap is concatenated to the
+    depth image along dim 1 before saving.
+
+    Files are named frame_<i:04d>.png and frame_<i:04d>.npy. Returns the stacked depth maps.
+    """
+    depth_maps = torch.stack([pts3d_self[..., -1] for pts3d_self in pts3ds_self], 0)
+    colored_depth = colorize(
+        depth_maps,
+        cmap_name="Spectral_r",
+        range=(depth_maps.min(), depth_maps.max()),
+        append_cbar=True,
+    )
+
+    colored_conf = None
+    if conf_self is not None:
+        conf_selfs = torch.concat(conf_self, 0)
+        colored_conf = colorize(
+            torch.log(conf_selfs),
+            cmap_name="jet",
+            range=(torch.log(conf_selfs.min()), torch.log(conf_selfs.max())),
+            append_cbar=True,
+        )
+
+    for i, depth_map in enumerate(colored_depth):
+        to_save = depth_map
+        if colored_conf is not None:
+            to_save = torch.cat([depth_map, colored_conf[i]], dim=1)
+        to_save = (to_save * 255).detach().cpu().numpy().astype(np.uint8)
+        iio.imwrite(f"{path}/frame_{i:04d}.png", to_save)
+        # The written PNG is deliberately not re-opened with PIL: the original kept one
+        # lazily loaded Image (and its open file handle) per frame in an unused list.
+        np.save(f"{path}/frame_{i:04d}.npy", depth_maps[i].detach().cpu().numpy())
+
+    return depth_maps
+
+
+def get_vertical_colorbar(h, vmin, vmax, cmap_name="jet", label=None, cbar_precision=2):
+    """
+    Render a vertical colour bar as an RGB float image of height h.
+
+    :param h: output height in pixels (the rendering is resized to it, keeping aspect ratio)
+    :param vmin: lower bound of the colour scale
+    :param vmax: upper bound of the colour scale
+    :param cmap_name: matplotlib colormap name
+    :param label: optional label set on the colour bar
+    :param cbar_precision: decimals kept in the 6 tick labels
+    :return: float32 array of shape (h, w, 3) with values in [0, 1]
+    """
+    fig = Figure(figsize=(2, 8), dpi=100)
+    fig.subplots_adjust(right=1.5)
+    canvas = FigureCanvasAgg(fig)
+
+    # plt.get_cmap replaces matplotlib.cm.get_cmap, which was removed in matplotlib 3.9.
+    ax = fig.add_subplot(111)
+    cmap = plt.get_cmap(cmap_name)
+    norm = mpl.colors.Normalize(vmin=vmin, vmax=vmax)
+
+    tick_cnt = 6
+    tick_loc = np.linspace(vmin, vmax, tick_cnt)
+    cb1 = mpl.colorbar.ColorbarBase(
+        ax, cmap=cmap, norm=norm, ticks=tick_loc, orientation="vertical"
+    )
+
+    tick_label = [str(np.round(x, cbar_precision)) for x in tick_loc]
+    if cbar_precision == 0:
+        tick_label = [x[:-2] for x in tick_label]  # drop the last two characters (".0")
+
+    cb1.set_ticklabels(tick_label)
+
+    cb1.ax.tick_params(labelsize=18, rotation=0)
+    if label is not None:
+        cb1.set_label(label)
+
+    canvas.draw()
+    s, (width, height) = canvas.print_to_buffer()
+
+    # The buffer holds 4 channels per pixel; keep the first three and scale to [0, 1].
+    im = np.frombuffer(s, np.uint8).reshape((height, width, 4))
+    im = im[:, :, :3].astype(np.float32) / 255.0
+    if h != im.shape[0]:
+        w = int(im.shape[1] / im.shape[0] * h)
+        im = cv2.resize(im, (w, h), interpolation=cv2.INTER_AREA)
+
+    return im
+
+
+def colorize_np(
+    x,
+    cmap_name="jet",
+    mask=None,
+    range=None,
+    append_cbar=False,
+    cbar_in_image=False,
+    cbar_precision=2,
+):
+    """
+    turn a grayscale image into a color image
+    :param x: input grayscale, [H, W]
+    :param cmap_name: the colorization method (matplotlib colormap name)
+    :param mask: the mask image, [H, W]; pixels outside it are rendered white
+    :param range: the range for scaling, automatic if None, [min, max]
+    :param append_cbar: if append the color bar
+    :param cbar_in_image: put the color bar inside the image to keep the output image the same size as the input image
+    :param cbar_precision: number of decimals of the color bar tick labels
+    :return: colorized image, [H, W', 3] (W' > W when the bar is appended beside the image)
+    """
+    if range is not None:
+        vmin, vmax = range
+    elif mask is not None:
+        valid = x[mask]
+        vmin = np.min(valid[np.nonzero(valid)])
+        vmax = np.max(valid)
+        # Fill masked-out pixels on a copy so the caller's array is left untouched.
+        x = x.copy()
+        x[np.logical_not(mask)] = vmin
+    else:
+        vmin, vmax = np.percentile(x, (1, 100))
+        vmax += 1e-6
+
+    x = np.clip(x, vmin, vmax)
+    x = (x - vmin) / (vmax - vmin)
+
+    # plt.get_cmap replaces matplotlib.cm.get_cmap, which was removed in matplotlib 3.9.
+    cmap = plt.get_cmap(cmap_name)
+    x_new = cmap(x)[:, :, :3]
+
+    if mask is not None:
+        mask = np.float32(mask[:, :, np.newaxis])
+        x_new = x_new * mask + np.ones_like(x_new) * (1.0 - mask)
+
+    if not append_cbar:
+        return x_new
+
+    # Render the color bar only when it is actually used.
+    cbar = get_vertical_colorbar(
+        h=x.shape[0],
+        vmin=vmin,
+        vmax=vmax,
+        cmap_name=cmap_name,
+        cbar_precision=cbar_precision,
+    )
+
+    if cbar_in_image:
+        x_new[:, -cbar.shape[1] :, :] = cbar
+        return x_new
+    # Otherwise place the bar to the right of the image, after 5 zero-valued columns.
+    return np.concatenate((x_new, np.zeros_like(x_new[:, :5, :]), cbar), axis=1)
+
+
+# tensor
+def colorize(
+    x, cmap_name="jet", mask=None, range=None, append_cbar=False, cbar_in_image=False
+):
+    """
+    turn a grayscale image (or a batch of them) into a color image via colorize_np
+    :param x: torch.Tensor, grayscale image, [H, W] or [B, H, W]
+    :param mask: torch.Tensor or None, mask image, [H, W] or [B, H, W] or None; values > 0.99 are valid
+    :return: float tensor on x's device; torch.stack(...).squeeze(0) drops a batch dimension of size 1
+    """
+    device = x.device
+    x = x.detach().cpu().numpy()
+    if mask is not None:
+        mask = mask.detach().cpu().numpy() > 0.99
+        kernel = np.ones((3, 3), np.uint8)
+    if x.ndim == 2:
+        x = x[None]
+        if mask is not None:
+            mask = mask[None]
+
+    out = []
+    for i, x_ in enumerate(x):
+        # Erode only this image's mask (the batch mask used to be re-eroded every iteration).
+        mask_ = None
+        if mask is not None:
+            mask_ = cv2.erode(mask[i].astype(np.uint8), kernel, iterations=1).astype(bool)
+        x_ = colorize_np(x_, cmap_name, mask_, range, append_cbar, cbar_in_image)
+        out.append(torch.from_numpy(x_).to(device).float())
+    out = torch.stack(out).squeeze(0)
+    return out
diff --git a/extern/CUT3R/requirements.txt b/extern/CUT3R/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c4ad28b0401fbdfe269baa90293554f4712d71ce
--- /dev/null
+++ b/extern/CUT3R/requirements.txt
@@ -0,0 +1,23 @@
+numpy==1.26.4
+torch
+torchvision
+roma
+gradio
+matplotlib
+tqdm
+opencv-python
+scipy
+einops
+trimesh
+tensorboard
+pyglet<2
+huggingface-hub[torch]>=0.22
+viser
+gradio
+lpips
+hydra-core
+pillow==10.3.0
+h5py
+accelerate
+transformers
+scikit-learn
\ No newline at end of file
diff --git a/extern/CUT3R/src/croco/croco-stereo-flow-demo.ipynb b/extern/CUT3R/src/croco/croco-stereo-flow-demo.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..2b00a7607ab5f82d1857041969bfec977e56b3e0
--- /dev/null
+++ b/extern/CUT3R/src/croco/croco-stereo-flow-demo.ipynb
@@ -0,0 +1,191 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "9bca0f41",
+   "metadata": {},
+   "source": [
+    "# Simple inference example with CroCo-Stereo or CroCo-Flow"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "80653ef7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Copyright (C) 2022-present Naver Corporation. All rights reserved.\n",
+    "# Licensed under CC BY-NC-SA 4.0 (non-commercial use only)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4f033862",
+   "metadata": {},
+   "source": [
+    "First download the model(s) of your choice by running\n",
+    "```\n",
+    "bash stereoflow/download_model.sh crocostereo.pth\n",
+    "bash stereoflow/download_model.sh crocoflow.pth\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1fb2e392",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import numpy as np\n",
+    "from PIL import Image\n",
+    "use_gpu = torch.cuda.is_available() and torch.cuda.device_count()>0\n",
+    "device = torch.device('cuda:0' if use_gpu else 'cpu')\n",
+    "import matplotlib.pylab as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e0e25d77",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from stereoflow.test import _load_model_and_criterion\n",
+    "from stereoflow.engine import tiled_pred\n",
+    "from stereoflow.datasets_stereo import img_to_tensor, vis_disparity\n",
+    "from stereoflow.datasets_flow import flowToColor\n",
+    "tile_overlap=0.7 # recommended value, higher value can be slightly better but slower"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "86a921f5",
+   "metadata": {},
+   "source": [
+    "### CroCo-Stereo example"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "64e483cb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "image1 = np.asarray(Image.open('<path_to_left_image>'))\n",
+    "image2 = np.asarray(Image.open('<path_to_right_image>'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f0d04303",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model, _, cropsize, with_conf, task, tile_conf_mode = _load_model_and_criterion('stereoflow_models/crocostereo.pth', None, device)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "47dc14b5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "im1 = img_to_tensor(image1).to(device).unsqueeze(0)\n",
+    "im2 = img_to_tensor(image2).to(device).unsqueeze(0)\n",
+    "with torch.inference_mode():\n",
+    "    pred, _, _ = tiled_pred(model, None, im1, im2, None, conf_mode=tile_conf_mode, overlap=tile_overlap, crop=cropsize, with_conf=with_conf, return_time=False)\n",
+    "pred = pred.squeeze(0).squeeze(0).cpu().numpy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "583b9f16",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.imshow(vis_disparity(pred))\n",
+    "plt.axis('off')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d2df5d70",
+   "metadata": {},
+   "source": [
+    "### CroCo-Flow example"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9ee257a7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "image1 = np.asarray(Image.open('<path_to_first_image>'))\n",
+    "image2 = np.asarray(Image.open('<path_to_second_image>'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d5edccf0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model, _, cropsize, with_conf, task, tile_conf_mode = _load_model_and_criterion('stereoflow_models/crocoflow.pth', None, device)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b19692c3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "im1 = img_to_tensor(image1).to(device).unsqueeze(0)\n",
+    "im2 = img_to_tensor(image2).to(device).unsqueeze(0)\n",
+    "with torch.inference_mode():\n",
+    "    pred, _, _ = tiled_pred(model, None, im1, im2, None, conf_mode=tile_conf_mode, overlap=tile_overlap, crop=cropsize, with_conf=with_conf, return_time=False)\n",
+    "pred = pred.squeeze(0).permute(1,2,0).cpu().numpy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "26f79db3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.imshow(flowToColor(pred))\n",
+    "plt.axis('off')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/extern/CUT3R/src/croco/datasets/__init__.py b/extern/CUT3R/src/croco/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/extern/CUT3R/src/croco/datasets/crops/README.MD b/extern/CUT3R/src/croco/datasets/crops/README.MD
new file mode 100644
index 0000000000000000000000000000000000000000..47ddabebb177644694ee247ae878173a3a16644f
--- /dev/null
+++ b/extern/CUT3R/src/croco/datasets/crops/README.MD
@@ -0,0 +1,104 @@
+## Generation of crops from the real datasets
+
+The instructions below allow you to generate the crops used for pre-training CroCo v2 from the following real-world datasets: ARKitScenes, MegaDepth, 3DStreetView and IndoorVL.
+
+### Download the metadata of the crops to generate 
+
+First, download the metadata and put them in `./data/`:
+```
+mkdir -p data
+cd data/
+wget https://download.europe.naverlabs.com/ComputerVision/CroCo/data/crop_metadata.zip
+unzip crop_metadata.zip
+rm crop_metadata.zip
+cd ..
+```
+
+### Prepare the original datasets 
+
+Second, download the original datasets in `./data/original_datasets/`.
+```
+mkdir -p data/original_datasets
+```
+
+##### ARKitScenes
+
+Download the `raw` dataset from https://github.com/apple/ARKitScenes/blob/main/DATA.md and put it in `./data/original_datasets/ARKitScenes/`.
+The resulting file structure should be like:
+```
+./data/original_datasets/ARKitScenes/
+└───Training
+    └───40753679
+     │  │   ultrawide
+     │  │   ...
+     └───40753686
+     │   
+      ...
+```
+
+##### MegaDepth
+
+Download `MegaDepth v1 Dataset` from https://www.cs.cornell.edu/projects/megadepth/ and put it in `./data/original_datasets/MegaDepth/`.
+The resulting file structure should be like:
+
+```
+./data/original_datasets/MegaDepth/
+└───0000
+│   └───images
+│    │      │   1000557903_87fa96b8a4_o.jpg
+│    │      └ ...
+│    └─── ...
+└───0001
+│   │   
+│   └ ...
+└─── ...
+```
+
+##### 3DStreetView
+
+Download `3D_Street_View` dataset from https://github.com/amir32002/3D_Street_View and put it in `./data/original_datasets/3DStreetView/`.
+The resulting file structure should be like:
+
+``` 
+./data/original_datasets/3DStreetView/
+└───dataset_aligned
+│   └───0002
+│    │      │   0000002_0000001_0000002_0000001.jpg
+│    │      └ ...
+│    └─── ...
+└───dataset_unaligned
+│   └───0003
+│    │      │   0000003_0000001_0000002_0000001.jpg
+│    │      └ ...
+│    └─── ...
+```
+
+##### IndoorVL
+
+Download the `IndoorVL` datasets using [Kapture](https://github.com/naver/kapture).
+
+```
+pip install kapture
+mkdir -p ./data/original_datasets/IndoorVL
+cd ./data/original_datasets/IndoorVL
+kapture_download_dataset.py update
+kapture_download_dataset.py install  "HyundaiDepartmentStore_*"
+kapture_download_dataset.py install  "GangnamStation_*"
+cd -
+```
+
+### Extract the crops
+
+Now, extract the crops for each of the datasets:
+```
+for dataset in ARKitScenes MegaDepth 3DStreetView IndoorVL; 
+do 
+  python3 datasets/crops/extract_crops_from_images.py --crops ./data/crop_metadata/${dataset}/crops_release.txt --root-dir ./data/original_datasets/${dataset}/ --output-dir ./data/${dataset}_crops/ --imsize 256 --nthread 8 --max-subdir-levels 5 --ideal-number-pairs-in-dir 500;
+done
+```
+
+##### Note for IndoorVL
+
+Due to some legal issues, we can only release 144,228 pairs out of the 1,593,689 pairs used in the paper.
+To account for it in terms of number of pre-training iterations, the pre-training command in this repository uses 125 training epochs including 12 warm-up epochs and learning rate cosine schedule of 250, instead of 100, 10 and 200 respectively.
+The impact on the performance is negligible.
diff --git a/extern/CUT3R/src/croco/datasets/crops/extract_crops_from_images.py b/extern/CUT3R/src/croco/datasets/crops/extract_crops_from_images.py
new file mode 100644
index 0000000000000000000000000000000000000000..870cf9f9690bfc53f10a59293aabc16da127b02e
--- /dev/null
+++ b/extern/CUT3R/src/croco/datasets/crops/extract_crops_from_images.py
@@ -0,0 +1,183 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# Extracting crops for pre-training
+# --------------------------------------------------------
+
+import os
+import argparse
+from tqdm import tqdm
+from PIL import Image
+import functools
+from multiprocessing import Pool
+import math
+
+
+def arg_parser():
+    """Build the command-line parser of the crop extraction script.
+
+    ``--crops``, ``--root-dir``, ``--output-dir`` and ``--nthread`` are
+    required; the remaining options have defaults.
+    """
+    parser = argparse.ArgumentParser(
+        "Generate cropped image pairs from image crop list"
+    )
+
+    # One (flag, add_argument keywords) entry per option, in --help order.
+    options = (
+        ("--crops", dict(type=str, required=True, help="crop file")),
+        ("--root-dir", dict(type=str, required=True, help="root directory")),
+        ("--output-dir", dict(type=str, required=True, help="output directory")),
+        ("--imsize", dict(type=int, default=256, help="size of the crops")),
+        (
+            "--nthread",
+            dict(type=int, required=True, help="number of simultaneous threads"),
+        ),
+        (
+            "--max-subdir-levels",
+            dict(type=int, default=5, help="maximum number of subdirectories"),
+        ),
+        (
+            "--ideal-number-pairs-in-dir",
+            dict(type=int, default=500, help="number of pairs stored in a dir"),
+        ),
+    )
+    for flag, keywords in options:
+        parser.add_argument(flag, **keywords)
+    return parser
+
+
+def main(args):
+    """Extract every crop listed in ``args.crops`` and write a listing file.
+
+    The crops are spread over a directory tree whose depth is derived from the
+    number of crops and ``args.ideal_number_pairs_in_dir``, capped at
+    ``args.max_subdir_levels``. The relative path of each saved pair is written
+    to ``listing.txt`` inside ``args.output_dir``.
+
+    :param args: parsed command-line arguments, see ``arg_parser``
+    """
+    listing_path = os.path.join(args.output_dir, "listing.txt")
+
+    print(f"Loading list of crops ... ({args.nthread} threads)")
+    crops, num_crops_to_generate = load_crop_file(args.crops)
+
+    print(f"Preparing jobs ({len(crops)} candidate image pairs)...")
+    # math.log fails for 0 crops, yields 0 levels for 1 crop and divides by
+    # zero for a base of 1; a single directory level covers all these cases.
+    if num_crops_to_generate > 1 and args.ideal_number_pairs_in_dir > 1:
+        num_levels = min(
+            math.ceil(math.log(num_crops_to_generate, args.ideal_number_pairs_in_dir)),
+            args.max_subdir_levels,
+        )
+    else:
+        num_levels = 1
+    # Also protects the division below against --max-subdir-levels <= 0
+    num_levels = max(1, num_levels)
+    num_pairs_in_dir = math.ceil(num_crops_to_generate ** (1 / num_levels))
+
+    jobs = prepare_jobs(crops, num_levels, num_pairs_in_dir)
+    del crops
+
+    os.makedirs(args.output_dir, exist_ok=True)
+    pool = Pool(args.nthread) if args.nthread > 1 else None
+    mmap = pool.imap_unordered if pool is not None else map
+    call = functools.partial(save_image_crops, args)
+
+    print(f"Generating cropped images to {args.output_dir} ...")
+    try:
+        with open(listing_path, "w") as listing:
+            listing.write("# pair_path\n")
+            for results in tqdm(mmap(call, jobs), total=len(jobs)):
+                for path in results:
+                    listing.write(f"{path}\n")
+    finally:
+        # Release the worker processes whether or not the extraction succeeded
+        if pool is not None:
+            pool.terminate()
+            pool.join()
+    print("Finished writing listing to", listing_path)
+
+
+def load_crop_file(path):
+    """Parse a crop metadata file.
+
+    Lines starting with ``#`` and blank lines are ignored. A line with fewer
+    than 8 ``", "``-separated fields is read as ``img1, img2, rotation`` and
+    starts a new image pair. A line with 8 integer fields
+    ``l1, r1, t1, b1, l2, r2, t2, b2`` adds one crop to the most recent pair,
+    stored as two ``(left, top, right, bottom)`` rectangles.
+
+    :param path: path of the crop file
+    :return: ``(pairs, num_crops_to_generate)`` where each pair is
+        ``(img1, img2, rotation, [(rect1, rect2), ...])``
+    """
+    # Context manager so the file handle is closed deterministically
+    with open(path) as f:
+        data = f.read().splitlines()
+    pairs = []
+    num_crops_to_generate = 0
+    for line in tqdm(data):
+        # A blank line would otherwise fail the 3-field unpacking below
+        if not line.strip() or line.startswith("#"):
+            continue
+        line = line.split(", ")
+        if len(line) < 8:
+            img1, img2, rotation = line
+            pairs.append((img1, img2, int(rotation), []))
+        else:
+            l1, r1, t1, b1, l2, r2, t2, b2 = map(int, line)
+            rect1, rect2 = (l1, t1, r1, b1), (l2, t2, r2, b2)
+            pairs[-1][-1].append((rect1, rect2))
+            num_crops_to_generate += 1
+    return pairs, num_crops_to_generate
+
+
+def prepare_jobs(pairs, num_levels, num_pairs_in_dir):
+    """Turn the parsed image pairs into one job per pair.
+
+    Every crop gets a running global index and an output path made of
+    ``num_levels`` hexadecimal components: the leading ones come from
+    successive divisions of the index by decreasing powers of
+    ``num_pairs_in_dir``, the last one is the full index itself.
+
+    :param pairs: list of ``(img1, img2, rotation, crops)`` tuples
+    :param num_levels: number of path components
+    :param num_pairs_in_dir: base used to split the index into components
+    :return: list of ``((img1, img2), rotation, crops, paths)`` tuples
+    """
+    powers = [num_pairs_in_dir**level for level in reversed(range(num_levels))]
+
+    def get_path(idx):
+        components = []
+        remainder = idx
+        for divisor in powers[: max(num_levels - 1, 0)]:
+            components.append(remainder // divisor)
+            remainder = remainder % divisor
+        # The final component is the untouched global index
+        components.append(idx)
+        return "/".join(hex(component)[2:] for component in components)
+
+    jobs = []
+    next_index = 0
+    for img1, img2, rotation, crops in tqdm(pairs):
+        if -60 <= rotation <= 60:
+            rotation = 0  # most likely not a true rotation
+        count = len(crops)
+        paths = [get_path(i) for i in range(next_index, next_index + count)]
+        next_index += count
+        jobs.append(((img1, img2), rotation, crops, paths))
+    return jobs
+
+
+def load_image(path):
+    """Open ``path`` and return it converted to RGB.
+
+    Any failure is reported on stdout and re-raised as ``OSError`` so callers
+    can skip the image with a single ``except OSError`` clause.
+
+    :param path: path of the image file
+    :raises OSError: if the image cannot be opened or converted
+    """
+    try:
+        return Image.open(path).convert("RGB")
+    except Exception as e:
+        print("skipping", path, e)
+        # Keep the original error attached as __cause__ for debugging
+        raise OSError(f"cannot load image {path}") from e
+
+
+def save_image_crops(args, data):
+    """Crop, resize, rotate and save all crops of one image pair.
+
+    :param args: parsed command-line arguments; ``root_dir``, ``output_dir``
+        and ``imsize`` are read
+    :param data: one job, ``((img1, img2), rotation, crops, paths)``
+    :return: list of the relative paths that were saved; empty if one of the
+        two images could not be loaded
+    """
+    image_names, rot, crops, paths = data
+    try:
+        img1, img2 = [
+            load_image(os.path.join(args.root_dir, name)) for name in image_names
+        ]
+    except OSError:
+        return []
+
+    tgt_size = (args.imsize, args.imsize)
+    tgt_area = tgt_size[0] * tgt_size[1]
+
+    def prepare_crop(img, rect, rot=0):
+        cropped = img.crop(rect)
+
+        # Strong downscaling uses LANCZOS, everything else BICUBIC
+        width, height = cropped.size
+        if width * height > 4 * tgt_area:
+            interp = Image.Resampling.LANCZOS
+        else:
+            interp = Image.Resampling.BICUBIC
+        resized = cropped.resize(tgt_size, resample=interp)
+
+        # Snap the rotation to the nearest multiple of 90 degrees
+        rot90 = (round(rot / 90) % 4) * 90
+        if rot90 == 90:
+            return resized.transpose(Image.Transpose.ROTATE_90)
+        if rot90 == 180:
+            return resized.transpose(Image.Transpose.ROTATE_180)
+        if rot90 == 270:
+            return resized.transpose(Image.Transpose.ROTATE_270)
+        return resized
+
+    saved = []
+    for (rect1, rect2), path in zip(crops, paths):
+        # Only the second image of the pair is rotated
+        crop1 = prepare_crop(img1, rect1)
+        crop2 = prepare_crop(img2, rect2, rot)
+
+        fullpath1 = os.path.join(args.output_dir, path + "_1.jpg")
+        fullpath2 = os.path.join(args.output_dir, path + "_2.jpg")
+        os.makedirs(os.path.dirname(fullpath1), exist_ok=True)
+
+        # Never overwrite an already existing crop
+        assert not os.path.isfile(fullpath1), fullpath1
+        assert not os.path.isfile(fullpath2), fullpath2
+        crop1.save(fullpath1)
+        crop2.save(fullpath2)
+        saved.append(path)
+
+    return saved
+
+
+if __name__ == "__main__":
+    # Script entry point: parse the command line and run the crop extraction.
+    args = arg_parser().parse_args()
+    main(args)
diff --git a/extern/CUT3R/src/croco/datasets/habitat_sim/README.MD b/extern/CUT3R/src/croco/datasets/habitat_sim/README.MD
new file mode 100644
index 0000000000000000000000000000000000000000..a505781ff9eb91bce7f1d189e848f8ba1c560940
--- /dev/null
+++ b/extern/CUT3R/src/croco/datasets/habitat_sim/README.MD
@@ -0,0 +1,76 @@
+## Generation of synthetic image pairs using Habitat-Sim
+
+These instructions allow you to generate pre-training pairs from the Habitat simulator.
+As we did not save metadata of the pairs used in the original paper, they are not strictly the same, but these data use the same setting and are equivalent.
+
+### Download Habitat-Sim scenes
+Download Habitat-Sim scenes:
+- Download links can be found here: https://github.com/facebookresearch/habitat-sim/blob/main/DATASETS.md
+- We used scenes from the HM3D, habitat-test-scenes, Replica, ReplicaCad and ScanNet datasets.
+- Please put the scenes under `./data/habitat-sim-data/scene_datasets/` following the structure below, or manually update the paths in `paths.py`.
+```
+./data/
+└──habitat-sim-data/
+   └──scene_datasets/
+      ├──hm3d/
+      ├──gibson/
+      ├──habitat-test-scenes/
+      ├──replica_cad_baked_lighting/
+      ├──replica_cad/
+      ├──ReplicaDataset/
+      └──scannet/
+```
+
+### Image pairs generation
+We provide metadata to generate reproducible image pairs for pretraining and validation.
+Experiments described in the paper used similar data, but whose generation was not reproducible at the time.
+
+Specifications:
+- 256x256 resolution images, with a 60-degree field of view.
+- Up to 1000 image pairs per scene.
+- Number of scenes considered/number of image pairs per dataset:
+  - Scannet: 1097 scenes / 985 209 pairs
+  - HM3D:
+    - hm3d/train: 800 scenes / 800k pairs
+    - hm3d/val: 100 scenes / 100k pairs
+    - hm3d/minival: 10 scenes / 10k pairs
+  - habitat-test-scenes: 3 scenes / 3k pairs
+  - replica_cad_baked_lighting: 13 scenes / 13k pairs
+
+- Scenes from hm3d/val and hm3d/minival pairs were not used for the pre-training but kept for validation purposes.
+
+Download metadata and extract it:
+```bash
+mkdir -p data/habitat_release_metadata/
+cd data/habitat_release_metadata/
+wget https://download.europe.naverlabs.com/ComputerVision/CroCo/data/habitat_release_metadata/multiview_habitat_metadata.tar.gz
+tar -xvf multiview_habitat_metadata.tar.gz
+cd ../..
+# Location of the metadata
+METADATA_DIR="./data/habitat_release_metadata/multiview_habitat_metadata"
+```
+
+Generate image pairs from metadata:
+- The following command will print a list of commandlines to generate image pairs for each scene:
+```bash
+# Target output directory
+PAIRS_DATASET_DIR="./data/habitat_release/"
+python datasets/habitat_sim/generate_from_metadata_files.py --input_dir=$METADATA_DIR --output_dir=$PAIRS_DATASET_DIR
+```
+- One can launch several such commands in parallel, e.g. using GNU Parallel:
+```bash
+python datasets/habitat_sim/generate_from_metadata_files.py --input_dir=$METADATA_DIR --output_dir=$PAIRS_DATASET_DIR | parallel -j 16
+```
+
+## Metadata generation
+
+Image pairs were randomly sampled using the following commands, whose outputs contain randomness and are thus not exactly reproducible:
+```bash
+# Print commandlines to generate image pairs from the different scenes available.
+PAIRS_DATASET_DIR=MY_CUSTOM_PATH
+python datasets/habitat_sim/generate_multiview_images.py --list_commands --output_dir=$PAIRS_DATASET_DIR
+
+# Once a dataset is generated, pack metadata files for reproducibility.
+METADATA_DIR=MY_CUSTOM_PATH
+python datasets/habitat_sim/pack_metadata_files.py $PAIRS_DATASET_DIR  $METADATA_DIR
+```
diff --git a/extern/CUT3R/src/croco/datasets/habitat_sim/__init__.py b/extern/CUT3R/src/croco/datasets/habitat_sim/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/extern/CUT3R/src/croco/datasets/habitat_sim/generate_from_metadata.py b/extern/CUT3R/src/croco/datasets/habitat_sim/generate_from_metadata.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bbfbc6bec23e182baed2c4eedf0535fbc6aaa97
--- /dev/null
+++ b/extern/CUT3R/src/croco/datasets/habitat_sim/generate_from_metadata.py
@@ -0,0 +1,125 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+"""
+Script to generate image pairs for a given scene reproducing poses provided in a metadata file.
+"""
+import os
+from datasets.habitat_sim.multiview_habitat_sim_generator import (
+    MultiviewHabitatSimGenerator,
+)
+from datasets.habitat_sim.paths import SCENES_DATASET
+import argparse
+import quaternion
+import PIL.Image
+import cv2
+import json
+from tqdm import tqdm
+
+
+def generate_multiview_images_from_metadata(
+    metadata_filename,
+    output_dir,
+    overload_params=None,
+    scene_datasets_paths=None,
+    exist_ok=False,
+):
+    """
+    Generate images from a metadata file for reproducibility purposes.
+
+    :param metadata_filename: JSON metadata file to reproduce
+    :param output_dir: directory receiving the images and a copy of the metadata
+    :param overload_params: optional dict of metadata entries to override;
+        ``None`` means no override
+    :param scene_datasets_paths: optional ``{label: path}`` mapping; a scene,
+        navmesh or dataset config path starting with ``label`` is relocated
+        under ``path``
+    :param exist_ok: forwarded to ``os.makedirs`` for ``output_dir``
+    """
+    # None instead of a shared mutable default argument
+    if overload_params is None:
+        overload_params = {}
+
+    # Reorder paths by decreasing label length, so that the longest matching label is tested first
+    if scene_datasets_paths is not None:
+        scene_datasets_paths = dict(
+            sorted(scene_datasets_paths.items(), key=lambda x: len(x[0]), reverse=True)
+        )
+
+    with open(metadata_filename, "r") as f:
+        input_metadata = json.load(f)
+    metadata = dict()
+    for key, value in input_metadata.items():
+        # Optionally replace some paths
+        if key in ("scene_dataset_config_file", "scene", "navmesh") and value != "":
+            if scene_datasets_paths is not None:
+                for dataset_label, dataset_path in scene_datasets_paths.items():
+                    if value.startswith(dataset_label):
+                        value = os.path.normpath(
+                            os.path.join(
+                                dataset_path, os.path.relpath(value, dataset_label)
+                            )
+                        )
+                        break
+        metadata[key] = value
+
+    # Overload some parameters
+    metadata.update(overload_params)
+
+    # Everything except these bookkeeping entries is passed to the generator
+    generation_entries = {
+        key: value
+        for key, value in metadata.items()
+        if key not in ("multiviews", "output_dir", "generate_depth")
+    }
+    generate_depth = metadata["generate_depth"]
+
+    os.makedirs(output_dir, exist_ok=exist_ok)
+
+    generator = MultiviewHabitatSimGenerator(**generation_entries)
+    try:
+        # Generate views
+        for idx_label, data in tqdm(metadata["multiviews"].items()):
+            positions = data["positions"]
+            orientations = data["orientations"]
+            n = len(positions)
+            for oidx in range(n):
+                observation = generator.render_viewpoint(
+                    positions[oidx], quaternion.from_float_array(orientations[oidx])
+                )
+                observation_label = f"{oidx + 1}"  # Leonid is indexing starting from 1
+                # Color image saved using PIL
+                img = PIL.Image.fromarray(observation["color"][:, :, :3])
+                filename = os.path.join(
+                    output_dir, f"{idx_label}_{observation_label}.jpeg"
+                )
+                img.save(filename)
+                if generate_depth:
+                    # Depth image as EXR file
+                    filename = os.path.join(
+                        output_dir, f"{idx_label}_{observation_label}_depth.exr"
+                    )
+                    cv2.imwrite(
+                        filename,
+                        observation["depth"],
+                        [cv2.IMWRITE_EXR_TYPE, cv2.IMWRITE_EXR_TYPE_HALF],
+                    )
+                    # Camera parameters
+                    camera_params = {
+                        key: observation[key].tolist()
+                        for key in ("camera_intrinsics", "R_cam2world", "t_cam2world")
+                    }
+                    filename = os.path.join(
+                        output_dir,
+                        f"{idx_label}_{observation_label}_camera_params.json",
+                    )
+                    with open(filename, "w") as f:
+                        json.dump(camera_params, f)
+        # Save metadata
+        with open(os.path.join(output_dir, "metadata.json"), "w") as f:
+            json.dump(metadata, f)
+    finally:
+        # Always release the simulator, even if rendering or writing failed
+        generator.close()
+
+
+if __name__ == "__main__":
+    # Script entry point: regenerate the images of one scene from its metadata file.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--metadata_filename", required=True)
+    parser.add_argument("--output_dir", required=True)
+    args = parser.parse_args()
+
+    # Scene paths stored in the metadata are relocated using SCENES_DATASET;
+    # an already existing output directory is accepted.
+    generate_multiview_images_from_metadata(
+        metadata_filename=args.metadata_filename,
+        output_dir=args.output_dir,
+        scene_datasets_paths=SCENES_DATASET,
+        overload_params=dict(),
+        exist_ok=True,
+    )
diff --git a/extern/CUT3R/src/croco/datasets/habitat_sim/generate_from_metadata_files.py b/extern/CUT3R/src/croco/datasets/habitat_sim/generate_from_metadata_files.py
new file mode 100644
index 0000000000000000000000000000000000000000..2376957e0578726a98515220167e86fbecc2d72d
--- /dev/null
+++ b/extern/CUT3R/src/croco/datasets/habitat_sim/generate_from_metadata_files.py
@@ -0,0 +1,36 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+"""
+Script generating commandlines to generate image pairs from metadata files.
+"""
+import os
+import glob
+from tqdm import tqdm
+import argparse
+
+if __name__ == "__main__":
+    # Local import: only this entry point needs shell quoting.
+    import shlex
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input_dir", required=True)
+    parser.add_argument("--output_dir", required=True)
+    parser.add_argument(
+        "--prefix",
+        default="",
+        help="Commanline prefix, useful e.g. to setup environment.",
+    )
+    args = parser.parse_args()
+
+    # Lazily walk the input tree for every metadata.json file
+    input_metadata_filenames = glob.iglob(
+        f"{args.input_dir}/**/metadata.json", recursive=True
+    )
+
+    for metadata_filename in tqdm(input_metadata_filenames):
+        # Mirror the relative location of the metadata file under output_dir
+        output_dir = os.path.join(
+            args.output_dir,
+            os.path.relpath(os.path.dirname(metadata_filename), args.input_dir),
+        )
+        # Do not process the scene if the metadata file already exists
+        if os.path.exists(os.path.join(output_dir, "metadata.json")):
+            continue
+        # The printed line is meant to be executed by a shell, so the paths are
+        # quoted; the prefix is emitted verbatim because it is shell code itself.
+        commandline = (
+            f"{args.prefix}python datasets/habitat_sim/generate_from_metadata.py"
+            f" --metadata_filename={shlex.quote(metadata_filename)}"
+            f" --output_dir={shlex.quote(output_dir)}"
+        )
+        print(commandline)
diff --git a/extern/CUT3R/src/croco/datasets/habitat_sim/generate_multiview_images.py b/extern/CUT3R/src/croco/datasets/habitat_sim/generate_multiview_images.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf16062135dfbaeb38ff2ad91c33bcab50cb98aa
--- /dev/null
+++ b/extern/CUT3R/src/croco/datasets/habitat_sim/generate_multiview_images.py
@@ -0,0 +1,231 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+import os
+from tqdm import tqdm
+import argparse
+import PIL.Image
+import numpy as np
+import json
+from datasets.habitat_sim.multiview_habitat_sim_generator import (
+    MultiviewHabitatSimGenerator,
+    NoNaviguableSpaceError,
+)
+from datasets.habitat_sim.paths import list_scenes_available
+import cv2
+import quaternion
+import shutil
+
+
+def generate_multiview_images_for_scene(
+    scene_dataset_config_file,
+    scene,
+    navmesh,
+    output_dir,
+    views_count,
+    size,
+    exist_ok=False,
+    generate_depth=False,
+    **kwargs,
+):
+    """
+    Generate tuples of overlapping views for a given scene.
+
+    Images (and optionally depth maps and camera parameters) are written to
+    ``output_dir`` together with a ``metadata.json`` file. That file is
+    rewritten every 10 samples, and an existing one makes the generation
+    restart from the number of multiviews it already records.
+
+    :param scene_dataset_config_file: forwarded to the generator and stored in the metadata
+    :param scene: forwarded to the generator and stored in the metadata
+    :param navmesh: forwarded to the generator and stored in the metadata
+    :param output_dir: destination directory
+    :param views_count: forwarded to the generator and stored in the metadata
+    :param size: number of multiview samples to generate
+    :param exist_ok: when False, an existing ``output_dir`` skips the scene
+    :param generate_depth: generate depth images and camera parameters.
+    :param kwargs: extra generator parameters, also stored in the metadata
+    """
+    if os.path.exists(output_dir) and not exist_ok:
+        print(f"Scene {scene}: data already generated. Ignoring generation.")
+        return
+    generator = None
+    try:
+        print(f"Scene {scene}: {size} multiview acquisitions to generate...")
+        os.makedirs(output_dir, exist_ok=exist_ok)
+
+        metadata_filename = os.path.join(output_dir, "metadata.json")
+
+        metadata_template = dict(
+            scene_dataset_config_file=scene_dataset_config_file,
+            scene=scene,
+            navmesh=navmesh,
+            views_count=views_count,
+            size=size,
+            generate_depth=generate_depth,
+            **kwargs,
+        )
+        metadata_template["multiviews"] = dict()
+
+        if os.path.exists(metadata_filename):
+            print("Metadata file already exists:", metadata_filename)
+            print("Loading already generated metadata file...")
+            with open(metadata_filename, "r") as f:
+                metadata = json.load(f)
+
+            # A resumed run must use the same parameters as the interrupted one
+            for key in metadata_template.keys():
+                if key != "multiviews":
+                    assert (
+                        metadata_template[key] == metadata[key]
+                    ), f"existing file is inconsistent with the input parameters:\nKey: {key}\nmetadata: {metadata[key]}\ntemplate: {metadata_template[key]}."
+        else:
+            print("No temporary file found. Starting generation from scratch...")
+            metadata = metadata_template
+
+        starting_id = len(metadata["multiviews"])
+        print(f"Starting generation from index {starting_id}/{size}...")
+        if starting_id >= size:
+            print("Generation already done.")
+            return
+
+        generator = MultiviewHabitatSimGenerator(
+            scene_dataset_config_file=scene_dataset_config_file,
+            scene=scene,
+            navmesh=navmesh,
+            views_count=views_count,
+            size=size,
+            **kwargs,
+        )
+
+        for idx in tqdm(range(starting_id, size)):
+            # Generate / re-generate the observations
+            try:
+                data = generator[idx]
+                observations = data["observations"]
+                positions = data["positions"]
+                orientations = data["orientations"]
+
+                idx_label = f"{idx:08}"
+                for oidx, observation in enumerate(observations):
+                    observation_label = (
+                        f"{oidx + 1}"  # Leonid is indexing starting from 1
+                    )
+                    # Color image saved using PIL
+                    img = PIL.Image.fromarray(observation["color"][:, :, :3])
+                    filename = os.path.join(
+                        output_dir, f"{idx_label}_{observation_label}.jpeg"
+                    )
+                    img.save(filename)
+                    if generate_depth:
+                        # Depth image as EXR file
+                        filename = os.path.join(
+                            output_dir, f"{idx_label}_{observation_label}_depth.exr"
+                        )
+                        cv2.imwrite(
+                            filename,
+                            observation["depth"],
+                            [cv2.IMWRITE_EXR_TYPE, cv2.IMWRITE_EXR_TYPE_HALF],
+                        )
+                        # Camera parameters
+                        camera_params = {
+                            key: observation[key].tolist()
+                            for key in (
+                                "camera_intrinsics",
+                                "R_cam2world",
+                                "t_cam2world",
+                            )
+                        }
+                        filename = os.path.join(
+                            output_dir,
+                            f"{idx_label}_{observation_label}_camera_params.json",
+                        )
+                        with open(filename, "w") as f:
+                            json.dump(camera_params, f)
+                metadata["multiviews"][idx_label] = {
+                    "positions": positions.tolist(),
+                    "orientations": orientations.tolist(),
+                    "covisibility_ratios": data["covisibility_ratios"].tolist(),
+                    "valid_fractions": data["valid_fractions"].tolist(),
+                    "pairwise_visibility_ratios": data[
+                        "pairwise_visibility_ratios"
+                    ].tolist(),
+                }
+            except RecursionError:
+                print(
+                    "Recursion error: unable to sample observations for this scene. We will stop there."
+                )
+                break
+
+            # Regularly save a temporary metadata file, in case we need to restart the generation
+            if idx % 10 == 0:
+                with open(metadata_filename, "w") as f:
+                    json.dump(metadata, f)
+
+        # Save metadata
+        with open(metadata_filename, "w") as f:
+            json.dump(metadata, f)
+    except NoNaviguableSpaceError:
+        # Deliberately non-fatal, but reported so the skipped scene is visible
+        print(f"Scene {scene}: NoNaviguableSpaceError raised, skipping this scene.")
+    finally:
+        # Release the simulator on every exit path once it has been created
+        if generator is not None:
+            generator.close()
+
+
+def create_commandline(scene_data, generate_depth, exist_ok=False):
+    """
+    Build the single-line command that generates one scene with this script.
+
+    ``scene_data`` must expose ``scene``, ``scene_dataset_config_file``,
+    ``navmesh`` and ``output_dir``; a value that is ``None`` or empty is
+    emitted as a pair of double quotes. The two boolean flags are emitted as
+    0/1 integers and all whitespace runs are collapsed to single spaces.
+    """
+
+    def quote_if_empty(val):
+        return '""' if val is None or val == "" else val
+
+    pieces = [f"python {__file__}"]
+    pieces.append(f"--scene {quote_if_empty(scene_data.scene)}")
+    pieces.append(
+        "--scene_dataset_config_file "
+        f"{quote_if_empty(scene_data.scene_dataset_config_file)}"
+    )
+    pieces.append(f"--navmesh {quote_if_empty(scene_data.navmesh)}")
+    pieces.append(f"--output_dir {quote_if_empty(scene_data.output_dir)}")
+    pieces.append(f"--generate_depth {int(generate_depth)}")
+    pieces.append(f"--exist_ok {int(exist_ok)}")
+
+    # Normalise whitespace over the whole command, interpolated values included
+    return " ".join(" ".join(pieces).split())
+
+
+if __name__ == "__main__":
+    # Files and directories created below get group write permission
+    os.umask(2)
+
+    # The usage example names this script and includes the required --output_dir
+    parser = argparse.ArgumentParser(
+        description="""Example of use -- listing commands to generate data for scenes available:
+    > python datasets/habitat_sim/generate_multiview_images.py --list_commands --output_dir <output_dir>
+    """
+    )
+
+    parser.add_argument("--output_dir", type=str, required=True)
+    parser.add_argument(
+        "--list_commands", action="store_true", help="list commandlines to run if true"
+    )
+    parser.add_argument("--scene", type=str, default="")
+    parser.add_argument("--scene_dataset_config_file", type=str, default="")
+    parser.add_argument("--navmesh", type=str, default="")
+
+    # Booleans are passed as 0/1 integers on the command line
+    parser.add_argument("--generate_depth", type=int, default=1)
+    parser.add_argument("--exist_ok", type=int, default=0)
+
+    # Fixed generation settings shared by every scene
+    kwargs = dict(resolution=(256, 256), hfov=60, views_count=2, size=1000)
+
+    args = parser.parse_args()
+    generate_depth = bool(args.generate_depth)
+    exist_ok = bool(args.exist_ok)
+
+    if args.list_commands:
+        # Listing scenes available...
+        scenes_data = list_scenes_available(base_output_dir=args.output_dir)
+
+        for scene_data in scenes_data:
+            print(
+                create_commandline(
+                    scene_data, generate_depth=generate_depth, exist_ok=exist_ok
+                )
+            )
+    else:
+        if args.scene == "" or args.output_dir == "":
+            print("Missing scene or output dir argument!")
+            print(parser.format_help())
+        else:
+            generate_multiview_images_for_scene(
+                scene=args.scene,
+                scene_dataset_config_file=args.scene_dataset_config_file,
+                navmesh=args.navmesh,
+                output_dir=args.output_dir,
+                exist_ok=exist_ok,
+                generate_depth=generate_depth,
+                **kwargs,
+            )
diff --git a/extern/CUT3R/src/croco/datasets/habitat_sim/multiview_habitat_sim_generator.py b/extern/CUT3R/src/croco/datasets/habitat_sim/multiview_habitat_sim_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..b073407ec169be0674cbd33a1197731ec0dd3be3
--- /dev/null
+++ b/extern/CUT3R/src/croco/datasets/habitat_sim/multiview_habitat_sim_generator.py
@@ -0,0 +1,501 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+import os
+import numpy as np
+import quaternion
+import habitat_sim
+import json
+from sklearn.neighbors import NearestNeighbors
+import cv2
+
+# OpenCV to habitat camera convention transformation
+R_OPENCV2HABITAT = np.stack(
+    (habitat_sim.geo.RIGHT, -habitat_sim.geo.UP, habitat_sim.geo.FRONT), axis=0
+)  # rows are the Habitat RIGHT, -UP and FRONT vectors
+R_HABITAT2OPENCV = R_OPENCV2HABITAT.T  # transpose; presumably the inverse mapping (assumes orthonormal axes -- confirm)
+DEG2RAD = np.pi / 180  # multiply an angle in degrees by this to get radians
+
+
+def compute_camera_intrinsics(height, width, hfov):  # hfov is in degrees (converted to radians below)
+    f = width / 2 / np.tan(hfov / 2 * np.pi / 180)  # focal length in pixels, derived from the horizontal field of view
+    cu, cv = width / 2, height / 2  # principal point placed at the image centre
+    return f, cu, cv
+
+
+def compute_camera_pose_opencv_convention(camera_position, camera_orientation):  # orientation is a quaternion
+    R_cam2world = quaternion.as_rotation_matrix(camera_orientation) @ R_OPENCV2HABITAT  # convention change applied first, then the camera rotation
+    t_cam2world = np.asarray(camera_position)  # the camera position is used unchanged as the translation
+    return R_cam2world, t_cam2world
+
+
+def compute_pointmap(depthmap, hfov):
+    """Compute a HxWx3 pointmap in camera frame from a HxW depth map (hfov in degrees)."""
+    height, width = depthmap.shape
+    f, cu, cv = compute_camera_intrinsics(height, width, hfov)
+    # Back-project every pixel; the depth value is used directly as the z coordinate
+    z_cam = depthmap
+    u, v = np.meshgrid(range(width), range(height))  # u: column index, v: row index (both HxW)
+    x_cam = (u - cu) / f * z_cam
+    y_cam = (v - cv) / f * z_cam
+    X_cam = np.stack((x_cam, y_cam, z_cam), axis=-1)
+    return X_cam
+
+
+def compute_pointcloud(depthmap, hfov, camera_position, camera_rotation):
+    """Return an Nx3 world-frame point cloud built from the depth-map pixels whose depth is non-zero."""
+    R_cam2world, t_cam2world = compute_camera_pose_opencv_convention(
+        camera_position, camera_rotation
+    )
+
+    X_cam = compute_pointmap(depthmap=depthmap, hfov=hfov)
+    valid_mask = X_cam[:, :, 2] != 0.0  # a depth of exactly 0 is treated as invalid
+
+    X_cam = X_cam.reshape(-1, 3)[valid_mask.flatten()]
+    X_world = X_cam @ R_cam2world.T + t_cam2world.reshape(1, 3)  # row-vector form of R @ x + t
+    return X_world
+
+
+def compute_pointcloud_overlaps_scikit(
+    pointcloud1, pointcloud2, distance_threshold, compute_symmetric=False
+):
+    """
+    Count the points of pointcloud1 whose nearest neighbour in pointcloud2 is closer than distance_threshold (and the converse when compute_symmetric is set).
+    """
+    nbrs = NearestNeighbors(n_neighbors=1, algorithm="kd_tree").fit(pointcloud2)
+    distances, indices = nbrs.kneighbors(pointcloud1)  # indices are not used afterwards
+    intersection1 = np.count_nonzero(distances.flatten() < distance_threshold)
+
+    data = {"intersection1": intersection1, "size1": len(pointcloud1)}  # counts, not ratios
+    if compute_symmetric:
+        nbrs = NearestNeighbors(n_neighbors=1, algorithm="kd_tree").fit(pointcloud1)  # same query with the two clouds swapped
+        distances, indices = nbrs.kneighbors(pointcloud2)
+        intersection2 = np.count_nonzero(distances.flatten() < distance_threshold)
+        data["intersection2"] = intersection2
+        data["size2"] = len(pointcloud2)
+
+    return data
+
+
+def _append_camera_parameters(observation, hfov, camera_location, camera_rotation):
+    """
+    Add camera parameters to the observation dictionary produced by Habitat-Sim.
+    Modifies the dictionary in place (keys: camera_intrinsics, t_cam2world, R_cam2world); returns nothing.
+    """
+    R_cam2world, t_cam2world = compute_camera_pose_opencv_convention(
+        camera_location, camera_rotation
+    )
+    height, width = observation["depth"].shape  # image size is read from the depth observation
+    f, cu, cv = compute_camera_intrinsics(height, width, hfov)
+    K = np.asarray([[f, 0, cu], [0, f, cv], [0, 0, 1.0]])  # 3x3 intrinsics, same focal length on both axes
+    observation["camera_intrinsics"] = K
+    observation["t_cam2world"] = t_cam2world
+    observation["R_cam2world"] = R_cam2world
+
+
+def look_at(eye, center, up, return_cam2world=True):
+    """
+    Return the camera pose (R, t) looking at a given center point from eye.
+    Analogous to the gluLookAt function, using OpenCV camera convention.
+    """
+    z = center - eye  # viewing direction becomes the camera z axis
+    z /= np.linalg.norm(z, axis=-1, keepdims=True)
+    y = -up  # camera y axis points opposite to the given up vector
+    y = y - np.sum(y * z, axis=-1, keepdims=True) * z  # remove the component along z so that y is orthogonal to it
+    y /= np.linalg.norm(y, axis=-1, keepdims=True)
+    x = np.cross(y, z, axis=-1)
+
+    if return_cam2world:
+        R = np.stack((x, y, z), axis=-1)  # camera axes stored as columns
+        t = eye
+    else:
+        # World to camera transformation
+        # Transposed matrix
+        R = np.stack((x, y, z), axis=-2)
+        t = -np.einsum("...ij, ...j", R, eye)
+    return R, t
+
+
+def look_at_for_habitat(eye, center, up, return_cam2world=True):  # NOTE(review): return_cam2world is accepted but never used
+    R, t = look_at(eye, center, up)  # always the cam2world pose (look_at's default)
+    orientation = quaternion.from_rotation_matrix(R @ R_OPENCV2HABITAT.T)  # reverse of the product in compute_camera_pose_opencv_convention
+    return orientation, t
+
+
+def generate_orientation_noise(pan_range, tilt_range, roll_range):  # each range is a (low, high) pair in degrees
+    return (  # product of three random rotations: about UP (pan), RIGHT (tilt) and FRONT (roll)
+        quaternion.from_rotation_vector(
+            np.random.uniform(*pan_range) * DEG2RAD * habitat_sim.geo.UP
+        )
+        * quaternion.from_rotation_vector(
+            np.random.uniform(*tilt_range) * DEG2RAD * habitat_sim.geo.RIGHT
+        )
+        * quaternion.from_rotation_vector(
+            np.random.uniform(*roll_range) * DEG2RAD * habitat_sim.geo.FRONT
+        )
+    )
+
+
+class NoNaviguableSpaceError(RuntimeError):  # raised when no navmesh could be loaded or computed for a scene
+    def __init__(self, *args):  # only forwards its arguments to RuntimeError
+        super().__init__(*args)
+
+
+class MultiviewHabitatSimGenerator:
+    def __init__(
+        self,
+        scene,
+        navmesh,
+        scene_dataset_config_file,
+        resolution=(240, 320),  # presumably (height, width) -- confirm against the Habitat sensor spec
+        views_count=2,  # number of views per sample, reference view included
+        hfov=60,  # horizontal field of view, in degrees
+        gpu_id=0,
+        size=10000,  # value reported by __len__
+        minimum_covisibility=0.5,
+        transform=None,  # optional callable applied to the sample dict in __getitem__
+    ):
+        self.scene = scene
+        self.navmesh = navmesh
+        self.scene_dataset_config_file = scene_dataset_config_file
+        self.resolution = resolution
+        self.views_count = views_count
+        assert self.views_count >= 1
+        self.hfov = hfov
+        self.gpu_id = gpu_id
+        self.size = size
+        self.transform = transform
+
+        # Noise added to camera orientation: (low, high) ranges in degrees
+        self.pan_range = (-3, 3)
+        self.tilt_range = (-10, 10)
+        self.roll_range = (-5, 5)
+
+        # Height range to sample cameras (offset added along UP above the navmesh point)
+        self.height_range = (1.2, 1.8)
+
+        # Random steps between the camera views
+        self.random_steps_count = 5
+        self.random_step_variance = 2.0  # used as a multiplier of a standard-normal draw
+
+        # Minimum fraction of the scene which should be valid (well defined depth)
+        self.minimum_valid_fraction = 0.7
+
+        # Distance threshold below which two points of different views count as overlapping
+        self.distance_threshold = 0.05
+        # Minimum covisibility of a view point cloud with respect to the reference view to be kept.
+        self.minimum_covisibility = minimum_covisibility
+
+        # Maximum number of retries.
+        self.max_attempts_count = 100
+
+        self.seed = None
+        self._lazy_initialization()  # despite its name, this runs immediately at construction time
+
+    def _lazy_initialization(self):
+        """Seed numpy, then create the simulator, its sensors, the navmesh and the agent; a no-op once self.seed is set."""
+        if self.seed is None:
+            # Re-seed numpy generator (lazy seeding, to deal with multiprocessing properly)
+            np.random.seed()
+            self.seed = np.random.randint(2**32 - 1)
+            sim_cfg = habitat_sim.SimulatorConfiguration()
+            sim_cfg.scene_id = self.scene
+            if (
+                self.scene_dataset_config_file is not None
+                and self.scene_dataset_config_file != ""
+            ):
+                sim_cfg.scene_dataset_config_file = self.scene_dataset_config_file
+            sim_cfg.random_seed = self.seed
+            sim_cfg.load_semantic_mesh = False
+            sim_cfg.gpu_device_id = self.gpu_id
+
+            depth_sensor_spec = habitat_sim.CameraSensorSpec()
+            depth_sensor_spec.uuid = "depth"
+            depth_sensor_spec.sensor_type = habitat_sim.SensorType.DEPTH
+            depth_sensor_spec.resolution = self.resolution
+            depth_sensor_spec.hfov = self.hfov
+            depth_sensor_spec.position = [0.0, 0.0, 0]
+            # The depth sensor orientation is left at its default value.
+
+            rgb_sensor_spec = habitat_sim.CameraSensorSpec()
+            rgb_sensor_spec.uuid = "color"
+            rgb_sensor_spec.sensor_type = habitat_sim.SensorType.COLOR
+            rgb_sensor_spec.resolution = self.resolution
+            rgb_sensor_spec.hfov = self.hfov
+            rgb_sensor_spec.position = [0.0, 0.0, 0]
+            agent_cfg = habitat_sim.agent.AgentConfiguration(
+                sensor_specifications=[rgb_sensor_spec, depth_sensor_spec]
+            )
+
+            cfg = habitat_sim.Configuration(sim_cfg, [agent_cfg])
+            self.sim = habitat_sim.Simulator(cfg)
+            if self.navmesh is not None and self.navmesh != "":
+                # Use pre-computed navmesh when available (usually better than those generated automatically)
+                self.sim.pathfinder.load_nav_mesh(self.navmesh)
+
+            if not self.sim.pathfinder.is_loaded:
+                # Try to compute a navmesh
+                navmesh_settings = habitat_sim.NavMeshSettings()
+                navmesh_settings.set_defaults()
+                self.sim.recompute_navmesh(self.sim.pathfinder, navmesh_settings, True)
+
+            # Ensure that the navmesh is not empty
+            if not self.sim.pathfinder.is_loaded:
+                raise NoNaviguableSpaceError(
+                    f"No naviguable location (scene: {self.scene} -- navmesh: {self.navmesh})"
+                )
+
+            self.agent = self.sim.initialize_agent(agent_id=0)
+
+    def close(self):  # explicitly close the underlying Habitat simulator
+        self.sim.close()
+
+    def __del__(self):  # NOTE(review): assumes self.sim exists; it is only assigned inside _lazy_initialization
+        self.sim.close()
+
+    def __len__(self):  # nominal length only: __getitem__ ignores its index and samples at random
+        return self.size
+
+    def sample_random_viewpoint(self):
+        """Sample a random viewpoint using the navmesh; returns (position, orientation, nav_point)."""
+        nav_point = self.sim.pathfinder.get_random_navigable_point()
+
+        # Sample a random viewpoint height, added along UP above the navigable point
+        viewpoint_height = np.random.uniform(*self.height_range)
+        viewpoint_position = nav_point + viewpoint_height * habitat_sim.geo.UP
+        viewpoint_orientation = quaternion.from_rotation_vector(
+            np.random.uniform(0, 2 * np.pi) * habitat_sim.geo.UP
+        ) * generate_orientation_noise(self.pan_range, self.tilt_range, self.roll_range)  # random heading about UP, then pan/tilt/roll noise
+        return viewpoint_position, viewpoint_orientation, nav_point
+
+    def sample_other_random_viewpoint(self, observed_point, nav_point):
+        """Sample a viewpoint near nav_point that looks at observed_point; returns (position, rotation, other_nav_point)."""
+        other_nav_point = nav_point
+
+        walk_directions = self.random_step_variance * np.asarray([1, 0, 1])  # steps along the first and third axes only
+        for i in range(self.random_steps_count):  # short random walk; each step is snapped back onto the navmesh
+            temp = self.sim.pathfinder.snap_point(
+                other_nav_point + walk_directions * np.random.normal(size=3)
+            )
+            # Snapping may return nan when it fails
+            if not np.isnan(temp[0]):
+                other_nav_point = temp
+
+        other_viewpoint_height = np.random.uniform(*self.height_range)
+        other_viewpoint_position = (
+            other_nav_point + other_viewpoint_height * habitat_sim.geo.UP
+        )
+
+        # Set viewing direction towards the central point
+        rotation, position = look_at_for_habitat(
+            eye=other_viewpoint_position,
+            center=observed_point,
+            up=habitat_sim.geo.UP,
+            return_cam2world=True,
+        )
+        rotation = rotation * generate_orientation_noise(
+            self.pan_range, self.tilt_range, self.roll_range
+        )
+        return position, rotation, other_nav_point
+
+    def is_other_pointcloud_overlapping(self, ref_pointcloud, other_pointcloud):
+        """Check a view against the reference one; returns (is_valid, valid_fraction, covisibility)."""
+        # Fraction of the image pixels that produced a point in the other view
+        pixels_count = self.resolution[0] * self.resolution[1]
+        valid_fraction = len(other_pointcloud) / pixels_count
+        assert valid_fraction <= 1.0 and valid_fraction >= 0.0
+        overlap = compute_pointcloud_overlaps_scikit(
+            ref_pointcloud,
+            other_pointcloud,
+            self.distance_threshold,
+            compute_symmetric=True,
+        )
+        covisibility = min(  # the smaller of the two directional overlaps, each relative to the pixel count
+            overlap["intersection1"] / pixels_count,
+            overlap["intersection2"] / pixels_count,
+        )
+        is_valid = (valid_fraction >= self.minimum_valid_fraction) and (
+            covisibility >= self.minimum_covisibility
+        )
+        return is_valid, valid_fraction, covisibility
+
+    def is_other_viewpoint_overlapping(
+        self, ref_pointcloud, observation, position, rotation
+    ):
+        """Same check as is_other_pointcloud_overlapping, starting from a rendered observation and its camera pose."""
+        # Build the world-frame point cloud of the observation from its depth map
+        other_pointcloud = compute_pointcloud(
+            observation["depth"], self.hfov, position, rotation
+        )
+        return self.is_other_pointcloud_overlapping(ref_pointcloud, other_pointcloud)
+
+    def render_viewpoint(self, viewpoint_position, viewpoint_orientation):  # returns the sensor observations dict for that pose
+        agent_state = habitat_sim.AgentState()
+        agent_state.position = viewpoint_position
+        agent_state.rotation = viewpoint_orientation
+        self.agent.set_state(agent_state)  # move the agent to the requested pose before rendering
+        viewpoint_observations = self.sim.get_sensor_observations(agent_ids=0)
+        _append_camera_parameters(  # adds camera_intrinsics, t_cam2world and R_cam2world in place
+            viewpoint_observations, self.hfov, viewpoint_position, viewpoint_orientation
+        )
+        return viewpoint_observations
+
+    def __getitem__(self, useless_idx):  # the index is ignored: every call samples a new random group of views
+        ref_position, ref_orientation, nav_point = self.sample_random_viewpoint()
+        ref_observations = self.render_viewpoint(ref_position, ref_orientation)
+        # Extract point cloud
+        ref_pointcloud = compute_pointcloud(
+            depthmap=ref_observations["depth"],
+            hfov=self.hfov,
+            camera_position=ref_position,
+            camera_rotation=ref_orientation,
+        )
+
+        pixels_count = self.resolution[0] * self.resolution[1]
+        ref_valid_fraction = len(ref_pointcloud) / pixels_count
+        assert ref_valid_fraction <= 1.0 and ref_valid_fraction >= 0.0
+        if ref_valid_fraction < self.minimum_valid_fraction:
+            # Retry with a new reference view. The recursion is unbounded, so this
+            # should produce a recursion error at some point when something is very wrong.
+            return self[0]
+        # Pick a reference observed point: the mean of the reference point cloud
+        observed_point = np.mean(ref_pointcloud, axis=0)
+
+        # Add the first image as reference
+        viewpoints_observations = [ref_observations]
+        viewpoints_covisibility = [ref_valid_fraction]
+        viewpoints_positions = [ref_position]
+        viewpoints_orientations = [quaternion.as_float_array(ref_orientation)]
+        viewpoints_clouds = [ref_pointcloud]
+        viewpoints_valid_fractions = [ref_valid_fraction]
+
+        for _ in range(self.views_count - 1):
+            # Generate another viewpoint using some dummy random walk
+            successful_sampling = False
+            for sampling_attempt in range(self.max_attempts_count):
+                position, rotation, _ = self.sample_other_random_viewpoint(
+                    observed_point, nav_point
+                )
+                # Observation
+                other_viewpoint_observations = self.render_viewpoint(position, rotation)
+                other_pointcloud = compute_pointcloud(
+                    other_viewpoint_observations["depth"], self.hfov, position, rotation
+                )
+
+                is_valid, valid_fraction, covisibility = (
+                    self.is_other_pointcloud_overlapping(
+                        ref_pointcloud, other_pointcloud
+                    )
+                )
+                if is_valid:
+                    successful_sampling = True
+                    break
+            if not successful_sampling:
+                print("WARNING: Maximum number of attempts reached.")
+                # Dirty hack, try using a novel original viewpoint
+                return self[0]
+            viewpoints_observations.append(other_viewpoint_observations)
+            viewpoints_covisibility.append(covisibility)
+            viewpoints_positions.append(position)
+            viewpoints_orientations.append(
+                quaternion.as_float_array(rotation)
+            )  # WXYZ convention for the quaternion encoding.
+            viewpoints_clouds.append(other_pointcloud)
+            viewpoints_valid_fractions.append(valid_fraction)
+
+        # Estimate relations between all pairs of images
+        pairwise_visibility_ratios = np.ones(
+            (len(viewpoints_observations), len(viewpoints_observations))
+        )
+        for i in range(len(viewpoints_observations)):
+            pairwise_visibility_ratios[i, i] = viewpoints_valid_fractions[i]  # diagonal: valid fraction of the view itself
+            for j in range(i + 1, len(viewpoints_observations)):
+                overlap = compute_pointcloud_overlaps_scikit(
+                    viewpoints_clouds[i],
+                    viewpoints_clouds[j],
+                    self.distance_threshold,
+                    compute_symmetric=True,
+                )
+                pairwise_visibility_ratios[i, j] = (
+                    overlap["intersection1"] / pixels_count
+                )
+                pairwise_visibility_ratios[j, i] = (
+                    overlap["intersection2"] / pixels_count
+                )
+
+        # covisibility_ratios are relative to image 0 (the reference view)
+        data = {
+            "observations": viewpoints_observations,
+            "positions": np.asarray(viewpoints_positions),
+            "orientations": np.asarray(viewpoints_orientations),
+            "covisibility_ratios": np.asarray(viewpoints_covisibility),
+            "valid_fractions": np.asarray(viewpoints_valid_fractions, dtype=float),
+            "pairwise_visibility_ratios": np.asarray(
+                pairwise_visibility_ratios, dtype=float
+            ),
+        }
+
+        if self.transform is not None:
+            data = self.transform(data)
+        return data
+
+    def generate_random_spiral_trajectory(
+        self,
+        images_count=100,
+        max_radius=0.5,
+        half_turns=5,
+        use_constant_orientation=False,
+    ):
+        """
+        Return (images, all_valid): the colour images of a spiral trajectory around a random viewpoint, and whether every view passed the overlap check.
+        Useful to generate nice visualisations.
+        Use an even number of half turns to get a nice "C1-continuous" loop effect.
+        """
+        ref_position, ref_orientation, navpoint = self.sample_random_viewpoint()
+        ref_observations = self.render_viewpoint(ref_position, ref_orientation)
+        ref_pointcloud = compute_pointcloud(
+            depthmap=ref_observations["depth"],
+            hfov=self.hfov,
+            camera_position=ref_position,
+            camera_rotation=ref_orientation,
+        )
+        pixels_count = self.resolution[0] * self.resolution[1]
+        if len(ref_pointcloud) / pixels_count < self.minimum_valid_fraction:
+            # Dirty hack: ensure that the valid part of the image is significant (retries recursively, without bound)
+            return self.generate_random_spiral_trajectory(
+                images_count, max_radius, half_turns, use_constant_orientation
+            )
+
+        # Pick an observed point in the point cloud (its mean)
+        observed_point = np.mean(ref_pointcloud, axis=0)
+        ref_R, ref_t = compute_camera_pose_opencv_convention(
+            ref_position, ref_orientation
+        )
+
+        images = []
+        is_valid = []
+        # Spiral trajectory in the x/y plane of the reference camera frame
+        for i, alpha in enumerate(np.linspace(0, 1, images_count)):
+            r = max_radius * np.abs(
+                np.sin(alpha * np.pi)
+            )  # Increase then decrease the radius
+            theta = alpha * half_turns * np.pi
+            x = r * np.cos(theta)
+            y = r * np.sin(theta)
+            z = 0.0
+            position = (
+                ref_position + (ref_R @ np.asarray([x, y, z]).reshape(3, 1)).flatten()
+            )
+            if use_constant_orientation:
+                orientation = ref_orientation
+            else:
+                # trajectory looking at a mean point in front of the ref observation
+                orientation, position = look_at_for_habitat(
+                    eye=position, center=observed_point, up=habitat_sim.geo.UP
+                )
+            observations = self.render_viewpoint(position, orientation)
+            images.append(observations["color"][..., :3])  # keep the first three colour channels only
+            _is_valid, valid_fraction, iou = self.is_other_viewpoint_overlapping(
+                ref_pointcloud, observations, position, orientation
+            )
+            is_valid.append(_is_valid)
+        return images, np.all(is_valid)
diff --git a/extern/CUT3R/src/croco/datasets/habitat_sim/pack_metadata_files.py b/extern/CUT3R/src/croco/datasets/habitat_sim/pack_metadata_files.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bd8234dfaa491d5f25f7c778406255116a8b392
--- /dev/null
+++ b/extern/CUT3R/src/croco/datasets/habitat_sim/pack_metadata_files.py
@@ -0,0 +1,80 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+"""
+Utility script to pack metadata files of the dataset in order to be able to re-generate it elsewhere.
+"""
+import os
+import glob
+from tqdm import tqdm
+import shutil
+import json
+from datasets.habitat_sim.paths import *
+import argparse
+import collections
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("input_dir")
+    parser.add_argument("output_dir")
+    args = parser.parse_args()
+
+    input_dirname = args.input_dir
+    output_dirname = args.output_dir
+
+    input_metadata_filenames = glob.iglob(
+        f"{input_dirname}/**/metadata.json", recursive=True
+    )
+
+    images_count = collections.defaultdict(lambda: 0)  # multiview count per dataset (per split for hm3d)
+
+    os.makedirs(output_dirname)  # no exist_ok: fails if the output directory already exists
+    for input_filename in tqdm(input_metadata_filenames):
+        # Ignore metadata files that list no multiviews
+        with open(input_filename, "r") as f:
+            original_metadata = json.load(f)
+            if (
+                "multiviews" not in original_metadata
+                or len(original_metadata["multiviews"]) == 0
+            ):
+                print("No views in", input_filename)
+                continue
+
+        relpath = os.path.relpath(input_filename, input_dirname)
+        print(relpath)
+
+        # Copy metadata, while replacing scene paths by generic keys depending on the dataset, for portability.
+        # Data paths are sorted by decreasing length to avoid potential bugs due to paths starting by the same string pattern.
+        scenes_dataset_paths = dict(
+            sorted(SCENES_DATASET.items(), key=lambda x: len(x[1]), reverse=True)
+        )
+        metadata = dict()
+        for key, value in original_metadata.items():
+            if key in ("scene_dataset_config_file", "scene", "navmesh") and value != "":
+                known_path = False
+                for dataset, dataset_path in scenes_dataset_paths.items():
+                    if value.startswith(dataset_path):
+                        value = os.path.join(
+                            dataset, os.path.relpath(value, dataset_path)
+                        )
+                        known_path = True
+                        break
+                if not known_path:
+                    raise KeyError("Unknown path:" + value)
+            metadata[key] = value
+
+        # Compile some general statistics while packing data
+        scene_split = metadata["scene"].split("/")
+        upper_level = (
+            "/".join(scene_split[:2]) if scene_split[0] == "hm3d" else scene_split[0]
+        )
+        images_count[upper_level] += len(metadata["multiviews"])
+
+        output_filename = os.path.join(output_dirname, relpath)  # mirror the input directory layout
+        os.makedirs(os.path.dirname(output_filename), exist_ok=True)
+        with open(output_filename, "w") as f:
+            json.dump(metadata, f)
+
+    # Print statistics
+    print("Images count:")
+    for upper_level, count in images_count.items():
+        print(f"- {upper_level}: {count}")
diff --git a/extern/CUT3R/src/croco/datasets/habitat_sim/paths.py b/extern/CUT3R/src/croco/datasets/habitat_sim/paths.py
new file mode 100644
index 0000000000000000000000000000000000000000..87389fcff93d220d6f205dc21119da3c56c3abb9
--- /dev/null
+++ b/extern/CUT3R/src/croco/datasets/habitat_sim/paths.py
@@ -0,0 +1,179 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+"""
+Paths to Habitat-Sim scenes
+"""
+
+import os
+import json
+import collections
+from tqdm import tqdm
+
+
+# Hardcoded paths to the different scene datasets, keyed by dataset name (relative to the working directory)
+SCENES_DATASET = {
+    "hm3d": "./data/habitat-sim-data/scene_datasets/hm3d/",
+    "gibson": "./data/habitat-sim-data/scene_datasets/gibson/",
+    "habitat-test-scenes": "./data/habitat-sim/scene_datasets/habitat-test-scenes/",
+    "replica_cad_baked_lighting": "./data/habitat-sim/scene_datasets/replica_cad_baked_lighting/",
+    "replica_cad": "./data/habitat-sim/scene_datasets/replica_cad/",
+    "replica": "./data/habitat-sim/scene_datasets/ReplicaDataset/",
+    "scannet": "./data/habitat-sim/scene_datasets/scannet/",
+}
+
+SceneData = collections.namedtuple(  # one scene to process: its input files and its output directory
+    "SceneData", ["scene_dataset_config_file", "scene", "navmesh", "output_dir"]
+)
+
+
+def list_replicacad_scenes(base_output_dir, base_path=SCENES_DATASET["replica_cad"]):  # returns a list of SceneData
+    scene_dataset_config_file = os.path.join(
+        base_path, "replicaCAD.scene_dataset_config.json"
+    )
+    scenes = [f"apt_{i}" for i in range(6)] + ["empty_stage"]
+    navmeshes = [f"navmeshes/apt_{i}_static_furniture.navmesh" for i in range(6)] + [
+        "empty_stage.navmesh"
+    ]  # same order as `scenes`: entry idx is the navmesh of scene idx
+    scenes_data = []
+    for idx in range(len(scenes)):
+        output_dir = os.path.join(base_output_dir, "ReplicaCAD", scenes[idx])
+        # Add scene
+        data = SceneData(
+            scene_dataset_config_file=scene_dataset_config_file,
+            scene=scenes[idx] + ".scene_instance.json",
+            navmesh=os.path.join(base_path, navmeshes[idx]),
+            output_dir=output_dir,
+        )
+        scenes_data.append(data)
+    return scenes_data
+
+
+def list_replica_cad_baked_lighting_scenes(
+    base_output_dir, base_path=SCENES_DATASET["replica_cad_baked_lighting"]
+):
+    """List the 5 x 21 ReplicaCAD baked-lighting scenes as SceneData entries (no navmesh)."""
+    scene_dataset_config_file = os.path.join(
+        base_path, "replicaCAD_baked.scene_dataset_config.json"
+    )
+    # Staging index j varies slowest, scene index i fastest.
+    scenes = [
+        f"Baked_sc{i}_staging_{j:02}" for j in range(21) for i in range(5)
+    ]
+    scenes_data = []
+    for scene in scenes:
+        # No pre-computed navmesh is referenced for these scenes, hence navmesh="".
+        output_dir = os.path.join(base_output_dir, "replica_cad_baked_lighting", scene)
+        data = SceneData(
+            scene_dataset_config_file=scene_dataset_config_file,
+            scene=scene,
+            navmesh="",
+            output_dir=output_dir,
+        )
+        scenes_data.append(data)
+    return scenes_data
+
+
+def list_replica_scenes(base_output_dir, base_path):  # one SceneData per entry of os.listdir(base_path)
+    scenes_data = []
+    for scene_id in os.listdir(base_path):
+        scene = os.path.join(base_path, scene_id, "mesh.ply")
+        navmesh = os.path.join(
+            base_path, scene_id, "habitat/mesh_preseg_semantic.navmesh"
+        )  # Not sure if I should use it
+        scene_dataset_config_file = ""
+        output_dir = os.path.join(base_output_dir, scene_id)
+        # Add scene (unconditionally: no existence or exist_ok filtering is done here)
+        data = SceneData(
+            scene_dataset_config_file=scene_dataset_config_file,
+            scene=scene,
+            navmesh=navmesh,
+            output_dir=output_dir,
+        )
+        scenes_data.append(data)
+    return scenes_data
+
+
+def list_scenes(base_output_dir, base_path):
+    """
+    Generic method walking a base_path folder recursively and returning one SceneData per .glb scene found.
+    """
+    scenes_data = []
+    for root, dirs, files in os.walk(base_path, followlinks=True):
+        folder_scenes_data = []
+        for file in files:
+            name, ext = os.path.splitext(file)
+            if ext == ".glb":
+                scene = os.path.join(root, name + ".glb")
+                navmesh = os.path.join(root, name + ".navmesh")  # navmesh expected next to the scene, with the same stem
+                if not os.path.exists(navmesh):
+                    navmesh = ""
+                relpath = os.path.relpath(root, base_path)
+                output_dir = os.path.abspath(
+                    os.path.join(base_output_dir, relpath, name)
+                )
+                data = SceneData(
+                    scene_dataset_config_file="",
+                    scene=scene,
+                    navmesh=navmesh,
+                    output_dir=output_dir,
+                )
+                folder_scenes_data.append(data)
+
+        # Specific check for HM3D:
+        # When two meshes xxxx.basis.glb and xxxx.glb are present, use the 'basis' version.
+        basis_scenes = [
+            data.scene[: -len(".basis.glb")]
+            for data in folder_scenes_data
+            if data.scene.endswith(".basis.glb")
+        ]
+        if len(basis_scenes) != 0:
+            folder_scenes_data = [
+                data
+                for data in folder_scenes_data
+                if not (data.scene[: -len(".glb")] in basis_scenes)
+            ]  # drops xxxx.glb whenever xxxx.basis.glb is in the same folder
+
+        scenes_data.extend(folder_scenes_data)
+    return scenes_data
+
+
+def list_scenes_available(base_output_dir, scenes_dataset_paths=SCENES_DATASET):
+    """Return the SceneData of every scene found across the supported datasets, with outputs under base_output_dir."""
+    scenes_data = []
+    # HM3D
+    for split in ("minival", "train", "val", "examples"):
+        scenes_data += list_scenes(
+            base_output_dir=os.path.join(base_output_dir, f"hm3d/{split}/"),
+            base_path=f"{scenes_dataset_paths['hm3d']}/{split}",
+        )
+
+    # Gibson
+    scenes_data += list_scenes(
+        base_output_dir=os.path.join(base_output_dir, "gibson"),
+        base_path=scenes_dataset_paths["gibson"],
+    )
+
+    # Habitat test scenes (just a few)
+    scenes_data += list_scenes(
+        base_output_dir=os.path.join(base_output_dir, "habitat-test-scenes"),
+        base_path=scenes_dataset_paths["habitat-test-scenes"],
+    )
+
+    # ReplicaCAD (baked lighting)
+    scenes_data += list_replica_cad_baked_lighting_scenes(
+        base_output_dir=base_output_dir
+    )
+
+    # ScanNet
+    scenes_data += list_scenes(
+        base_output_dir=os.path.join(base_output_dir, "scannet"),
+        base_path=scenes_dataset_paths["scannet"],
+    )
+
+    # Replica: accumulate the result like every other dataset instead of discarding it
+    scenes_data += list_replica_scenes(
+        base_output_dir=os.path.join(base_output_dir, "replica"),
+        base_path=scenes_dataset_paths["replica"],
+    )
+    return scenes_data
diff --git a/extern/CUT3R/src/croco/datasets/pairs_dataset.py b/extern/CUT3R/src/croco/datasets/pairs_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..066bb9510332255edd211f98f2beb6670abff4f9
--- /dev/null
+++ b/extern/CUT3R/src/croco/datasets/pairs_dataset.py
@@ -0,0 +1,162 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+import os
+from torch.utils.data import Dataset
+from PIL import Image
+
+from datasets.transforms import get_pair_transforms
+
+
+def load_image(impath):
+    return Image.open(impath)
+
+
+def load_pairs_from_cache_file(fname, root=""):
+    assert os.path.isfile(
+        fname
+    ), "cannot parse pairs from {:s}, file does not exist".format(fname)
+    with open(fname, "r") as fid:
+        lines = fid.read().strip().splitlines()
+    pairs = [
+        (os.path.join(root, l.split()[0]), os.path.join(root, l.split()[1]))
+        for l in lines
+    ]
+    return pairs
+
+
+def load_pairs_from_list_file(fname, root=""):
+    assert os.path.isfile(
+        fname
+    ), "cannot parse pairs from {:s}, file does not exist".format(fname)
+    with open(fname, "r") as fid:
+        lines = fid.read().strip().splitlines()
+    pairs = [
+        (os.path.join(root, l + "_1.jpg"), os.path.join(root, l + "_2.jpg"))
+        for l in lines
+        if not l.startswith("#")
+    ]
+    return pairs
+
+
+def write_cache_file(fname, pairs, root=""):
+    if len(root) > 0:
+        if not root.endswith("/"):
+            root += "/"
+        assert os.path.isdir(root)
+    s = ""
+    for im1, im2 in pairs:
+        if len(root) > 0:
+            assert im1.startswith(root), im1
+            assert im2.startswith(root), im2
+        s += "{:s} {:s}\n".format(im1[len(root) :], im2[len(root) :])
+    with open(fname, "w") as fid:
+        fid.write(s[:-1])
+
+
+def parse_and_cache_all_pairs(dname, data_dir="./data/"):
+    if dname == "habitat_release":
+        dirname = os.path.join(data_dir, "habitat_release")
+        assert os.path.isdir(dirname), (
+            "cannot find folder for habitat_release pairs: " + dirname
+        )
+        cache_file = os.path.join(dirname, "pairs.txt")
+        assert not os.path.isfile(cache_file), (
+            "cache file already exists: " + cache_file
+        )
+
+        print("Parsing pairs for dataset: " + dname)
+        pairs = []
+        for root, dirs, files in os.walk(dirname):
+            if "val" in root:
+                continue
+            dirs.sort()
+            pairs += [
+                (
+                    os.path.join(root, f),
+                    os.path.join(root, f[: -len("_1.jpeg")] + "_2.jpeg"),
+                )
+                for f in sorted(files)
+                if f.endswith("_1.jpeg")
+            ]
+        print("Found {:,} pairs".format(len(pairs)))
+        print("Writing cache to: " + cache_file)
+        write_cache_file(cache_file, pairs, root=dirname)
+
+    else:
+        raise NotImplementedError("Unknown dataset: " + dname)
+
+
+def dnames_to_image_pairs(dnames, data_dir="./data/"):
+    """
+    dnames: list of datasets with image pairs, separated by +
+    """
+    all_pairs = []
+    for dname in dnames.split("+"):
+        if dname == "habitat_release":
+            dirname = os.path.join(data_dir, "habitat_release")
+            assert os.path.isdir(dirname), (
+                "cannot find folder for habitat_release pairs: " + dirname
+            )
+            cache_file = os.path.join(dirname, "pairs.txt")
+            assert os.path.isfile(cache_file), (
+                "cannot find cache file for habitat_release pairs, please first create the cache file, see instructions. "
+                + cache_file
+            )
+            pairs = load_pairs_from_cache_file(cache_file, root=dirname)
+        elif dname in ["ARKitScenes", "MegaDepth", "3DStreetView", "IndoorVL"]:
+            dirname = os.path.join(data_dir, dname + "_crops")
+            assert os.path.isdir(
+                dirname
+            ), "cannot find folder for {:s} pairs: {:s}".format(dname, dirname)
+            list_file = os.path.join(dirname, "listing.txt")
+            assert os.path.isfile(
+                list_file
+            ), "cannot find list file for {:s} pairs, see instructions. {:s}".format(
+                dname, list_file
+            )
+            pairs = load_pairs_from_list_file(list_file, root=dirname)
+        print("  {:s}: {:,} pairs".format(dname, len(pairs)))
+        all_pairs += pairs
+    if "+" in dnames:
+        print(" Total: {:,} pairs".format(len(all_pairs)))
+    return all_pairs
+
+
+class PairsDataset(Dataset):
+
+    def __init__(
+        self, dnames, trfs="", totensor=True, normalize=True, data_dir="./data/"
+    ):
+        super().__init__()
+        self.image_pairs = dnames_to_image_pairs(dnames, data_dir=data_dir)
+        self.transforms = get_pair_transforms(
+            transform_str=trfs, totensor=totensor, normalize=normalize
+        )
+
+    def __len__(self):
+        return len(self.image_pairs)
+
+    def __getitem__(self, index):
+        im1path, im2path = self.image_pairs[index]
+        im1 = load_image(im1path)
+        im2 = load_image(im2path)
+        if self.transforms is not None:
+            im1, im2 = self.transforms(im1, im2)
+        return im1, im2
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        prog="Computing and caching list of pairs for a given dataset"
+    )
+    parser.add_argument(
+        "--data_dir", default="./data/", type=str, help="path where data are stored"
+    )
+    parser.add_argument(
+        "--dataset", default="habitat_release", type=str, help="name of the dataset"
+    )
+    args = parser.parse_args()
+    parse_and_cache_all_pairs(dname=args.dataset, data_dir=args.data_dir)
diff --git a/extern/CUT3R/src/croco/datasets/transforms.py b/extern/CUT3R/src/croco/datasets/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..5dc89dd1092293f63035afd70e9ef9f907696f44
--- /dev/null
+++ b/extern/CUT3R/src/croco/datasets/transforms.py
@@ -0,0 +1,135 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+import torch
+import torchvision.transforms
+import torchvision.transforms.functional as F
+
+# "Pair": apply a transform on a pair
+# "Both": apply the exact same transform to both images
+
+
+class ComposePair(torchvision.transforms.Compose):
+    def __call__(self, img1, img2):
+        for t in self.transforms:
+            img1, img2 = t(img1, img2)
+        return img1, img2
+
+
+class NormalizeBoth(torchvision.transforms.Normalize):
+    def forward(self, img1, img2):
+        img1 = super().forward(img1)
+        img2 = super().forward(img2)
+        return img1, img2
+
+
+class ToTensorBoth(torchvision.transforms.ToTensor):
+    def __call__(self, img1, img2):
+        img1 = super().__call__(img1)
+        img2 = super().__call__(img2)
+        return img1, img2
+
+
+class RandomCropPair(torchvision.transforms.RandomCrop):
+    # the crop will be intentionally different for the two images with this class
+    def forward(self, img1, img2):
+        img1 = super().forward(img1)
+        img2 = super().forward(img2)
+        return img1, img2
+
+
+class ColorJitterPair(torchvision.transforms.ColorJitter):
+    # can be symmetric (same for both images) or assymetric (different jitter params for each image) depending on assymetric_prob
+    def __init__(self, assymetric_prob, **kwargs):
+        super().__init__(**kwargs)
+        self.assymetric_prob = assymetric_prob
+
+    def jitter_one(
+        self,
+        img,
+        fn_idx,
+        brightness_factor,
+        contrast_factor,
+        saturation_factor,
+        hue_factor,
+    ):
+        for fn_id in fn_idx:
+            if fn_id == 0 and brightness_factor is not None:
+                img = F.adjust_brightness(img, brightness_factor)
+            elif fn_id == 1 and contrast_factor is not None:
+                img = F.adjust_contrast(img, contrast_factor)
+            elif fn_id == 2 and saturation_factor is not None:
+                img = F.adjust_saturation(img, saturation_factor)
+            elif fn_id == 3 and hue_factor is not None:
+                img = F.adjust_hue(img, hue_factor)
+        return img
+
+    def forward(self, img1, img2):
+
+        fn_idx, brightness_factor, contrast_factor, saturation_factor, hue_factor = (
+            self.get_params(self.brightness, self.contrast, self.saturation, self.hue)
+        )
+        img1 = self.jitter_one(
+            img1,
+            fn_idx,
+            brightness_factor,
+            contrast_factor,
+            saturation_factor,
+            hue_factor,
+        )
+        if torch.rand(1) < self.assymetric_prob:  # assymetric:
+            (
+                fn_idx,
+                brightness_factor,
+                contrast_factor,
+                saturation_factor,
+                hue_factor,
+            ) = self.get_params(
+                self.brightness, self.contrast, self.saturation, self.hue
+            )
+        img2 = self.jitter_one(
+            img2,
+            fn_idx,
+            brightness_factor,
+            contrast_factor,
+            saturation_factor,
+            hue_factor,
+        )
+        return img1, img2
+
+
+def get_pair_transforms(transform_str, totensor=True, normalize=True):
+    # transform_str is eg    crop224+color
+    trfs = []
+    for s in transform_str.split("+"):
+        if s.startswith("crop"):
+            size = int(s[len("crop") :])
+            trfs.append(RandomCropPair(size))
+        elif s == "acolor":
+            trfs.append(
+                ColorJitterPair(
+                    assymetric_prob=1.0,
+                    brightness=(0.6, 1.4),
+                    contrast=(0.6, 1.4),
+                    saturation=(0.6, 1.4),
+                    hue=0.0,
+                )
+            )
+        elif s == "":  # if transform_str was ""
+            pass
+        else:
+            raise NotImplementedError("Unknown augmentation: " + s)
+
+    if totensor:
+        trfs.append(ToTensorBoth())
+    if normalize:
+        trfs.append(
+            NormalizeBoth(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+        )
+
+    if len(trfs) == 0:
+        return None
+    elif len(trfs) == 1:
+        return trfs
+    else:
+        return ComposePair(trfs)
diff --git a/extern/CUT3R/src/croco/interactive_demo.ipynb b/extern/CUT3R/src/croco/interactive_demo.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..6cfc960af5baac9a69029c29a16eea4e24123a71
--- /dev/null
+++ b/extern/CUT3R/src/croco/interactive_demo.ipynb
@@ -0,0 +1,271 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Interactive demo of Cross-view Completion."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Copyright (C) 2022-present Naver Corporation. All rights reserved.\n",
+    "# Licensed under CC BY-NC-SA 4.0 (non-commercial use only)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import numpy as np\n",
+    "from models.croco import CroCoNet\n",
+    "from ipywidgets import interact, interactive, fixed, interact_manual\n",
+    "import ipywidgets as widgets\n",
+    "import matplotlib.pyplot as plt\n",
+    "import quaternion\n",
+    "import models.masking"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Load CroCo model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ckpt = torch.load('pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth', 'cpu')\n",
+    "model = CroCoNet( **ckpt.get('croco_kwargs',{}))\n",
+    "msg = model.load_state_dict(ckpt['model'], strict=True)\n",
+    "use_gpu = torch.cuda.is_available() and torch.cuda.device_count()>0\n",
+    "device = torch.device('cuda:0' if use_gpu else 'cpu')\n",
+    "model = model.eval()\n",
+    "model = model.to(device=device)\n",
+    "print(msg)\n",
+    "\n",
+    "def process_images(ref_image, target_image, masking_ratio, reconstruct_unmasked_patches=False):\n",
+    "    \"\"\"\n",
+    "    Perform Cross-View completion using two input images, specified using Numpy arrays.\n",
+    "    \"\"\"\n",
+    "    # Replace the mask generator\n",
+    "    model.mask_generator = models.masking.RandomMask(model.patch_embed.num_patches, masking_ratio)\n",
+    "\n",
+    "    # ImageNet-1k color normalization\n",
+    "    imagenet_mean = torch.as_tensor([0.485, 0.456, 0.406]).reshape(1,3,1,1).to(device)\n",
+    "    imagenet_std = torch.as_tensor([0.229, 0.224, 0.225]).reshape(1,3,1,1).to(device)\n",
+    "\n",
+    "    normalize_input_colors = True\n",
+    "    is_output_normalized = True\n",
+    "    with torch.no_grad():\n",
+    "        # Cast data to torch\n",
+    "        target_image = (torch.as_tensor(target_image, dtype=torch.float, device=device).permute(2,0,1) / 255)[None]\n",
+    "        ref_image = (torch.as_tensor(ref_image, dtype=torch.float, device=device).permute(2,0,1) / 255)[None]\n",
+    "\n",
+    "        if normalize_input_colors:\n",
+    "            ref_image = (ref_image - imagenet_mean) / imagenet_std\n",
+    "            target_image = (target_image - imagenet_mean) / imagenet_std\n",
+    "\n",
+    "        out, mask, _ = model(target_image, ref_image)\n",
+    "        # # get target\n",
+    "        if not is_output_normalized:\n",
+    "            predicted_image = model.unpatchify(out)\n",
+    "        else:\n",
+    "            # The output only contains higher order information,\n",
+    "            # we retrieve mean and standard deviation from the actual target image\n",
+    "            patchified = model.patchify(target_image)\n",
+    "            mean = patchified.mean(dim=-1, keepdim=True)\n",
+    "            var = patchified.var(dim=-1, keepdim=True)\n",
+    "            pred_renorm = out * (var + 1.e-6)**.5 + mean\n",
+    "            predicted_image = model.unpatchify(pred_renorm)\n",
+    "\n",
+    "        image_masks = model.unpatchify(model.patchify(torch.ones_like(ref_image)) * mask[:,:,None])\n",
+    "        masked_target_image = (1 - image_masks) * target_image\n",
+    "      \n",
+    "        if not reconstruct_unmasked_patches:\n",
+    "            # Replace unmasked patches by their actual values\n",
+    "            predicted_image = predicted_image * image_masks + masked_target_image\n",
+    "\n",
+    "        # Unapply color normalization\n",
+    "        if normalize_input_colors:\n",
+    "            predicted_image = predicted_image * imagenet_std + imagenet_mean\n",
+    "            masked_target_image = masked_target_image * imagenet_std + imagenet_mean\n",
+    "        \n",
+    "        # Cast to Numpy\n",
+    "        masked_target_image = np.asarray(torch.clamp(masked_target_image.squeeze(0).permute(1,2,0) * 255, 0, 255).cpu().numpy(), dtype=np.uint8)\n",
+    "        predicted_image = np.asarray(torch.clamp(predicted_image.squeeze(0).permute(1,2,0) * 255, 0, 255).cpu().numpy(), dtype=np.uint8)\n",
+    "        return masked_target_image, predicted_image"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Use the Habitat simulator to render images from arbitrary viewpoints (requires habitat_sim to be installed)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.environ[\"MAGNUM_LOG\"]=\"quiet\"\n",
+    "os.environ[\"HABITAT_SIM_LOG\"]=\"quiet\"\n",
+    "import habitat_sim\n",
+    "\n",
+    "scene = \"habitat-sim-data/scene_datasets/habitat-test-scenes/skokloster-castle.glb\"\n",
+    "navmesh = \"habitat-sim-data/scene_datasets/habitat-test-scenes/skokloster-castle.navmesh\"\n",
+    "\n",
+    "sim_cfg = habitat_sim.SimulatorConfiguration()\n",
+    "if use_gpu: sim_cfg.gpu_device_id = 0\n",
+    "sim_cfg.scene_id = scene\n",
+    "sim_cfg.load_semantic_mesh = False\n",
+    "rgb_sensor_spec = habitat_sim.CameraSensorSpec()\n",
+    "rgb_sensor_spec.uuid = \"color\"\n",
+    "rgb_sensor_spec.sensor_type = habitat_sim.SensorType.COLOR\n",
+    "rgb_sensor_spec.resolution = (224,224)\n",
+    "rgb_sensor_spec.hfov = 56.56\n",
+    "rgb_sensor_spec.position = [0.0, 0.0, 0.0]\n",
+    "rgb_sensor_spec.orientation = [0, 0, 0]\n",
+    "agent_cfg = habitat_sim.agent.AgentConfiguration(sensor_specifications=[rgb_sensor_spec])\n",
+    "\n",
+    "\n",
+    "cfg = habitat_sim.Configuration(sim_cfg, [agent_cfg])\n",
+    "sim = habitat_sim.Simulator(cfg)\n",
+    "if navmesh is not None:\n",
+    "    sim.pathfinder.load_nav_mesh(navmesh)\n",
+    "agent = sim.initialize_agent(agent_id=0)\n",
+    "\n",
+    "def sample_random_viewpoint():\n",
+    "    \"\"\" Sample a random viewpoint using the navmesh \"\"\"\n",
+    "    nav_point = sim.pathfinder.get_random_navigable_point()\n",
+    "    # Sample a random viewpoint height\n",
+    "    viewpoint_height = np.random.uniform(1.0, 1.6)\n",
+    "    viewpoint_position = nav_point + viewpoint_height * habitat_sim.geo.UP\n",
+    "    viewpoint_orientation = quaternion.from_rotation_vector(np.random.uniform(-np.pi, np.pi) * habitat_sim.geo.UP)\n",
+    "    return viewpoint_position, viewpoint_orientation\n",
+    "\n",
+    "def render_viewpoint(position, orientation):\n",
+    "    agent_state = habitat_sim.AgentState()\n",
+    "    agent_state.position = position\n",
+    "    agent_state.rotation = orientation\n",
+    "    agent.set_state(agent_state)\n",
+    "    viewpoint_observations = sim.get_sensor_observations(agent_ids=0)\n",
+    "    image = viewpoint_observations['color'][:,:,:3]\n",
+    "    image = np.asarray(np.clip(1.5 * np.asarray(image, dtype=float), 0, 255), dtype=np.uint8)\n",
+    "    return image"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Sample a random reference view"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ref_position, ref_orientation = sample_random_viewpoint()\n",
+    "ref_image = render_viewpoint(ref_position, ref_orientation)\n",
+    "plt.clf()\n",
+    "fig, axes = plt.subplots(1,1, squeeze=False, num=1)\n",
+    "axes[0,0].imshow(ref_image)\n",
+    "for ax in axes.flatten():\n",
+    "    ax.set_xticks([])\n",
+    "    ax.set_yticks([])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Interactive cross-view completion using CroCo"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "reconstruct_unmasked_patches = False\n",
+    "\n",
+    "def show_demo(masking_ratio, x, y, z, panorama, elevation):\n",
+    "    R = quaternion.as_rotation_matrix(ref_orientation)\n",
+    "    target_position = ref_position + x * R[:,0] + y * R[:,1] + z * R[:,2]\n",
+    "    target_orientation = (ref_orientation\n",
+    "         * quaternion.from_rotation_vector(-elevation * np.pi/180 * habitat_sim.geo.LEFT) \n",
+    "         * quaternion.from_rotation_vector(-panorama * np.pi/180 * habitat_sim.geo.UP))\n",
+    "    \n",
+    "    ref_image = render_viewpoint(ref_position, ref_orientation)\n",
+    "    target_image = render_viewpoint(target_position, target_orientation)\n",
+    "\n",
+    "    masked_target_image, predicted_image = process_images(ref_image, target_image, masking_ratio, reconstruct_unmasked_patches)\n",
+    "\n",
+    "    fig, axes = plt.subplots(1,4, squeeze=True, dpi=300)\n",
+    "    axes[0].imshow(ref_image)\n",
+    "    axes[0].set_xlabel(\"Reference\")\n",
+    "    axes[1].imshow(masked_target_image)\n",
+    "    axes[1].set_xlabel(\"Masked target\")\n",
+    "    axes[2].imshow(predicted_image)\n",
+    "    axes[2].set_xlabel(\"Reconstruction\")        \n",
+    "    axes[3].imshow(target_image)\n",
+    "    axes[3].set_xlabel(\"Target\")\n",
+    "    for ax in axes.flatten():\n",
+    "        ax.set_xticks([])\n",
+    "        ax.set_yticks([])\n",
+    "\n",
+    "interact(show_demo,\n",
+    "        masking_ratio=widgets.FloatSlider(description='masking', value=0.9, min=0.0, max=1.0),\n",
+    "        x=widgets.FloatSlider(value=0.0, min=-0.5, max=0.5, step=0.05),\n",
+    "        y=widgets.FloatSlider(value=0.0, min=-0.5, max=0.5, step=0.05),\n",
+    "        z=widgets.FloatSlider(value=0.0, min=-0.5, max=0.5, step=0.05),\n",
+    "        panorama=widgets.FloatSlider(value=0.0, min=-20, max=20, step=0.5),\n",
+    "        elevation=widgets.FloatSlider(value=0.0, min=-20, max=20, step=0.5));"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.13"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "f9237820cd248d7e07cb4fb9f0e4508a85d642f19d831560c0a4b61f3e907e67"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/extern/CUT3R/src/croco/models/blocks.py b/extern/CUT3R/src/croco/models/blocks.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa85a431b44d276e3bba9a33fdfd7097f02bc330
--- /dev/null
+++ b/extern/CUT3R/src/croco/models/blocks.py
@@ -0,0 +1,385 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+
+# --------------------------------------------------------
+# Main encoder/decoder blocks
+# --------------------------------------------------------
+# References:
+# timm
+# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
+# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/helpers.py
+# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py
+# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/mlp.py
+# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/patch_embed.py
+
+
+import torch
+import torch.nn as nn
+
+from itertools import repeat
+import collections.abc
+from torch.nn.functional import scaled_dot_product_attention
+
+
+def _ntuple(n):
+    def parse(x):
+        if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
+            return x
+        return tuple(repeat(x, n))
+
+    return parse
+
+
+to_2tuple = _ntuple(2)
+
+
+def drop_path(
+    x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
+):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+    if drop_prob == 0.0 or not training:
+        return x
+    keep_prob = 1 - drop_prob
+    shape = (x.shape[0],) + (1,) * (
+        x.ndim - 1
+    )  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+    if keep_prob > 0.0 and scale_by_keep:
+        random_tensor.div_(keep_prob)
+    return x * random_tensor
+
+
+class DropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks)."""
+
+    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+        self.scale_by_keep = scale_by_keep
+
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
+
+    def extra_repr(self):
+        return f"drop_prob={round(self.drop_prob,3):0.3f}"
+
+
+class Mlp(nn.Module):
+    """MLP as used in Vision Transformer, MLP-Mixer and related networks"""
+
+    def __init__(
+        self,
+        in_features,
+        hidden_features=None,
+        out_features=None,
+        act_layer=nn.GELU,
+        bias=True,
+        drop=0.0,
+    ):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        bias = to_2tuple(bias)
+        drop_probs = to_2tuple(drop)
+
+        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias[0])
+        self.act = act_layer()
+        self.drop1 = nn.Dropout(drop_probs[0])
+        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias[1])
+        self.drop2 = nn.Dropout(drop_probs[1])
+
+    def forward(self, x):
+        return self.drop2(self.fc2(self.drop1(self.act(self.fc1(x)))))
+
+
+class Attention(nn.Module):
+
+    def __init__(
+        self, dim, rope=None, num_heads=8, qkv_bias=False, attn_drop=0.0, proj_drop=0.0
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim**-0.5
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+        self.rope = rope.float() if rope is not None else None
+
+    def forward(self, x, xpos):
+        B, N, C = x.shape
+
+        qkv = (
+            self.qkv(x)
+            .reshape(B, N, 3, self.num_heads, C // self.num_heads)
+            .transpose(1, 3)
+        )
+        q, k, v = [qkv[:, :, i] for i in range(3)]
+        # q,k,v = qkv.unbind(2)  # make torchscript happy (cannot use tensor as tuple)
+
+        q_type = q.dtype
+        k_type = k.dtype
+        if self.rope is not None:
+            q = q.to(torch.float16)
+            k = k.to(torch.float16)
+            with torch.autocast(device_type="cuda", enabled=False):
+                q = self.rope(q, xpos)
+                k = self.rope(k, xpos)
+            q = q.to(q_type)
+            k = k.to(k_type)
+
+        # attn = (q @ k.transpose(-2, -1)) * self.scale
+        # attn = attn.softmax(dim=-1)
+        # attn = self.attn_drop(attn)
+
+        # x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+        # x = memory_efficient_attention(query=q.permute(0, 2, 1, 3), key=k.permute(0, 2, 1, 3), value=v.permute(0, 2, 1, 3), p=self.attn_drop.p, scale=self.scale).reshape(B, N, C)
+        x = (
+            scaled_dot_product_attention(
+                query=q, key=k, value=v, dropout_p=self.attn_drop.p, scale=self.scale
+            )
+            .transpose(1, 2)
+            .reshape(B, N, C)
+        )
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class Block(nn.Module):
+
+    def __init__(
+        self,
+        dim,
+        num_heads,
+        mlp_ratio=4.0,
+        qkv_bias=False,
+        drop=0.0,
+        attn_drop=0.0,
+        drop_path=0.0,
+        act_layer=nn.GELU,
+        norm_layer=nn.LayerNorm,
+        rope=None,
+    ):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(
+            dim,
+            rope=rope,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+        )
+        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=mlp_hidden_dim,
+            act_layer=act_layer,
+            drop=drop,
+        )
+
+    def forward(self, x, xpos):
+        x = x + self.drop_path(self.attn(self.norm1(x), xpos))
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+        return x
+
+
+class CrossAttention(nn.Module):
+
+    def __init__(
+        self, dim, rope=None, num_heads=8, qkv_bias=False, attn_drop=0.0, proj_drop=0.0
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim**-0.5
+
+        self.projq = nn.Linear(dim, dim, bias=qkv_bias)
+        self.projk = nn.Linear(dim, dim, bias=qkv_bias)
+        self.projv = nn.Linear(dim, dim, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+        self.rope = rope.float() if rope is not None else None
+
+    def forward(self, query, key, value, qpos, kpos):
+        B, Nq, C = query.shape
+        Nk = key.shape[1]
+        Nv = value.shape[1]
+
+        q = (
+            self.projq(query)
+            .reshape(B, Nq, self.num_heads, C // self.num_heads)
+            .permute(0, 2, 1, 3)
+        )
+        k = (
+            self.projk(key)
+            .reshape(B, Nk, self.num_heads, C // self.num_heads)
+            .permute(0, 2, 1, 3)
+        )
+        v = (
+            self.projv(value)
+            .reshape(B, Nv, self.num_heads, C // self.num_heads)
+            .permute(0, 2, 1, 3)
+        )
+
+        q_type = q.dtype
+        k_type = k.dtype
+        if self.rope is not None:
+            if qpos is not None:
+                q = q.to(torch.float16)
+                with torch.autocast(device_type="cuda", enabled=False):
+                    q = self.rope(q, qpos)
+                q = q.to(q_type)
+
+            if kpos is not None:
+                k = k.to(torch.float16)
+                with torch.autocast(device_type="cuda", enabled=False):
+                    k = self.rope(k, kpos)
+                k = k.to(k_type)
+
+        # attn = (q @ k.transpose(-2, -1)) * self.scale
+        # attn = attn.softmax(dim=-1)
+        # attn = self.attn_drop(attn)
+
+        # x = (attn @ v).transpose(1, 2).reshape(B, Nq, C)
+
+        # x = memory_efficient_attention(query=q.permute(0, 2, 1, 3), key=k.permute(0, 2, 1, 3), value=v.permute(0, 2, 1, 3), p=self.attn_drop.p, scale=self.scale).reshape(B, Nq, C)
+        x = (
+            scaled_dot_product_attention(
+                query=q, key=k, value=v, dropout_p=self.attn_drop.p, scale=self.scale
+            )
+            .transpose(1, 2)
+            .reshape(B, Nq, C)
+        )
+
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class DecoderBlock(nn.Module):
+
+    def __init__(
+        self,
+        dim,
+        num_heads,
+        mlp_ratio=4.0,
+        qkv_bias=False,
+        drop=0.0,
+        attn_drop=0.0,
+        drop_path=0.0,
+        act_layer=nn.GELU,
+        norm_layer=nn.LayerNorm,
+        norm_mem=True,
+        rope=None,
+    ):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(
+            dim,
+            rope=rope,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+        )
+        self.cross_attn = CrossAttention(
+            dim,
+            rope=rope,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+        )
+        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        self.norm3 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=mlp_hidden_dim,
+            act_layer=act_layer,
+            drop=drop,
+        )
+        self.norm_y = norm_layer(dim) if norm_mem else nn.Identity()
+
+    def forward(self, x, y, xpos, ypos):
+        x = x + self.drop_path(self.attn(self.norm1(x), xpos))
+        y_ = self.norm_y(y)
+        x = x + self.drop_path(self.cross_attn(self.norm2(x), y_, y_, xpos, ypos))
+        x = x + self.drop_path(self.mlp(self.norm3(x)))
+        return x, y
+
+
+# patch embedding
+class PositionGetter(object):
+    """return positions of patches"""
+
+    def __init__(self):
+        self.cache_positions = {}
+
+    def __call__(self, b, h, w, device):
+        if not (h, w) in self.cache_positions:
+            x = torch.arange(w, device=device)
+            y = torch.arange(h, device=device)
+            self.cache_positions[h, w] = torch.cartesian_prod(y, x)  # (h, w, 2)
+        pos = self.cache_positions[h, w].view(1, h * w, 2).expand(b, -1, 2).clone()
+        return pos
+
+
+class PatchEmbed(nn.Module):
+    """just adding _init_weights + position getter compared to timm.models.layers.patch_embed.PatchEmbed"""
+
+    def __init__(
+        self,
+        img_size=224,
+        patch_size=16,
+        in_chans=3,
+        embed_dim=768,
+        norm_layer=None,
+        flatten=True,
+    ):
+        super().__init__()
+        img_size = to_2tuple(img_size)
+        patch_size = to_2tuple(patch_size)
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
+        self.num_patches = self.grid_size[0] * self.grid_size[1]
+        self.flatten = flatten
+
+        self.proj = nn.Conv2d(
+            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size
+        )
+        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
+
+        self.position_getter = PositionGetter()
+
+    def forward(self, x):
+        B, C, H, W = x.shape
+        torch._assert(
+            H == self.img_size[0],
+            f"Input image height ({H}) doesn't match model ({self.img_size[0]}).",
+        )
+        torch._assert(
+            W == self.img_size[1],
+            f"Input image width ({W}) doesn't match model ({self.img_size[1]}).",
+        )
+        x = self.proj(x)
+        pos = self.position_getter(B, x.size(2), x.size(3), x.device)
+        if self.flatten:
+            x = x.flatten(2).transpose(1, 2)  # BCHW -> BNC
+        x = self.norm(x)
+        return x, pos
+
+    def _init_weights(self):
+        w = self.proj.weight.data
+        torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
diff --git a/extern/CUT3R/src/croco/models/criterion.py b/extern/CUT3R/src/croco/models/criterion.py
new file mode 100644
index 0000000000000000000000000000000000000000..af94f572499c976ad9cfd87d4728b8b517cdfd39
--- /dev/null
+++ b/extern/CUT3R/src/croco/models/criterion.py
@@ -0,0 +1,38 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# Criterion to train CroCo
+# --------------------------------------------------------
+# References:
+# MAE: https://github.com/facebookresearch/mae
+# --------------------------------------------------------
+
+import torch
+
+
+class MaskedMSE(torch.nn.Module):
+
+    def __init__(self, norm_pix_loss=False, masked=True):
+        """
+        norm_pix_loss: normalize each patch by their pixel mean and variance
+        masked: compute loss over the masked patches only
+        """
+        super().__init__()
+        self.norm_pix_loss = norm_pix_loss
+        self.masked = masked
+
+    def forward(self, pred, mask, target):
+
+        if self.norm_pix_loss:
+            mean = target.mean(dim=-1, keepdim=True)
+            var = target.var(dim=-1, keepdim=True)
+            target = (target - mean) / (var + 1.0e-6) ** 0.5
+
+        loss = (pred - target) ** 2
+        loss = loss.mean(dim=-1)  # [N, L], mean loss per patch
+        if self.masked:
+            loss = (loss * mask).sum() / mask.sum()  # mean loss on masked patches
+        else:
+            loss = loss.mean()  # mean loss
+        return loss
diff --git a/extern/CUT3R/src/croco/models/croco.py b/extern/CUT3R/src/croco/models/croco.py
new file mode 100644
index 0000000000000000000000000000000000000000..64b2410e9b52ab34bc66f1d7d768d0e91c8cf30b
--- /dev/null
+++ b/extern/CUT3R/src/croco/models/croco.py
@@ -0,0 +1,330 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+
+# --------------------------------------------------------
+# CroCo model during pretraining
+# --------------------------------------------------------
+
+
+import torch
+import torch.nn as nn
+
+torch.backends.cuda.matmul.allow_tf32 = True  # for gpu >= Ampere and pytorch >= 1.12
+from functools import partial
+
+from models.blocks import Block, DecoderBlock, PatchEmbed
+from models.pos_embed import get_2d_sincos_pos_embed, RoPE2D
+from models.masking import RandomMask
+
+from transformers import PretrainedConfig
+from transformers import PreTrainedModel
+
+
+class CrocoConfig(PretrainedConfig):
+    model_type = "croco"
+
+    def __init__(
+        self,
+        img_size=224,  # input image size
+        patch_size=16,  # patch_size
+        mask_ratio=0.9,  # ratios of masked tokens
+        enc_embed_dim=768,  # encoder feature dimension
+        enc_depth=12,  # encoder depth
+        enc_num_heads=12,  # encoder number of heads in the transformer block
+        dec_embed_dim=512,  # decoder feature dimension
+        dec_depth=8,  # decoder depth
+        dec_num_heads=16,  # decoder number of heads in the transformer block
+        mlp_ratio=4,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+        norm_im2_in_dec=True,  # whether to apply normalization of the 'memory' = (second image) in the decoder
+        pos_embed="cosine",  # positional embedding (either cosine or RoPE100)
+    ):
+        super().__init__()
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.mask_ratio = mask_ratio
+        self.enc_embed_dim = enc_embed_dim
+        self.enc_depth = enc_depth
+        self.enc_num_heads = enc_num_heads
+        self.dec_embed_dim = dec_embed_dim
+        self.dec_depth = dec_depth
+        self.dec_num_heads = dec_num_heads
+        self.mlp_ratio = mlp_ratio
+        self.norm_layer = norm_layer
+        self.norm_im2_in_dec = norm_im2_in_dec
+        self.pos_embed = pos_embed
+
+
+class CroCoNet(PreTrainedModel):
+
+    config_class = CrocoConfig
+    base_model_prefix = "croco"
+
+    def __init__(self, config: CrocoConfig):
+
+        super().__init__(config)
+
+        # patch embeddings  (with initialization done as in MAE)
+        self._set_patch_embed(config.img_size, config.patch_size, config.enc_embed_dim)
+
+        # mask generations
+        self._set_mask_generator(self.patch_embed.num_patches, config.mask_ratio)
+
+        self.pos_embed = config.pos_embed
+        if config.pos_embed == "cosine":
+            # positional embedding of the encoder
+            enc_pos_embed = get_2d_sincos_pos_embed(
+                config.enc_embed_dim,
+                int(self.patch_embed.num_patches**0.5),
+                n_cls_token=0,
+            )
+            self.register_buffer(
+                "enc_pos_embed", torch.from_numpy(enc_pos_embed).float()
+            )
+            # positional embedding of the decoder
+            dec_pos_embed = get_2d_sincos_pos_embed(
+                config.dec_embed_dim,
+                int(self.patch_embed.num_patches**0.5),
+                n_cls_token=0,
+            )
+            self.register_buffer(
+                "dec_pos_embed", torch.from_numpy(dec_pos_embed).float()
+            )
+            # pos embedding in each block
+            self.rope = None  # nothing for cosine
+        elif config.pos_embed.startswith("RoPE"):  # eg RoPE100
+            self.enc_pos_embed = None  # nothing to add in the encoder with RoPE
+            self.dec_pos_embed = None  # nothing to add in the decoder with RoPE
+            if RoPE2D is None:
+                raise ImportError(
+                    "Cannot find cuRoPE2D, please install it following the README instructions"
+                )
+            freq = float(config.pos_embed[len("RoPE") :])
+            self.rope = RoPE2D(freq=freq)
+        else:
+            raise NotImplementedError("Unknown pos_embed " + config.pos_embed)
+
+        # transformer for the encoder
+        self.enc_depth = config.enc_depth
+        self.enc_embed_dim = config.enc_embed_dim
+        self.enc_blocks = nn.ModuleList(
+            [
+                Block(
+                    config.enc_embed_dim,
+                    config.enc_num_heads,
+                    config.mlp_ratio,
+                    qkv_bias=True,
+                    norm_layer=config.norm_layer,
+                    rope=self.rope,
+                )
+                for i in range(config.enc_depth)
+            ]
+        )
+        self.enc_norm = config.norm_layer(config.enc_embed_dim)
+
+        # masked tokens
+        # self._set_mask_token(config.dec_embed_dim)
+        self.mask_token = None
+
+        # decoder
+        self._set_decoder(
+            config.enc_embed_dim,
+            config.dec_embed_dim,
+            config.dec_num_heads,
+            config.dec_depth,
+            config.mlp_ratio,
+            config.norm_layer,
+            config.norm_im2_in_dec,
+        )
+
+        # prediction head
+        self._set_prediction_head(config.dec_embed_dim, config.patch_size)
+
+        # initializer weights
+        self.initialize_weights()
+
+    def _set_patch_embed(self, img_size=224, patch_size=16, enc_embed_dim=768):
+        self.patch_embed = PatchEmbed(img_size, patch_size, 3, enc_embed_dim)
+
+    def _set_mask_generator(self, num_patches, mask_ratio):
+        self.mask_generator = RandomMask(num_patches, mask_ratio)
+
+    def _set_mask_token(self, dec_embed_dim):
+        self.mask_token = nn.Parameter(torch.zeros(1, 1, dec_embed_dim))
+
+    def _set_decoder(
+        self,
+        enc_embed_dim,
+        dec_embed_dim,
+        dec_num_heads,
+        dec_depth,
+        mlp_ratio,
+        norm_layer,
+        norm_im2_in_dec,
+    ):
+        self.dec_depth = dec_depth
+        self.dec_embed_dim = dec_embed_dim
+        # transfer from encoder to decoder
+        self.decoder_embed = nn.Linear(enc_embed_dim, dec_embed_dim, bias=True)
+        # transformer for the decoder
+        self.dec_blocks = nn.ModuleList(
+            [
+                DecoderBlock(
+                    dec_embed_dim,
+                    dec_num_heads,
+                    mlp_ratio=mlp_ratio,
+                    qkv_bias=True,
+                    norm_layer=norm_layer,
+                    norm_mem=norm_im2_in_dec,
+                    rope=self.rope,
+                )
+                for i in range(dec_depth)
+            ]
+        )
+        # final norm layer
+        self.dec_norm = norm_layer(dec_embed_dim)
+
+    def _set_prediction_head(self, dec_embed_dim, patch_size):
+        self.prediction_head = nn.Linear(dec_embed_dim, patch_size**2 * 3, bias=True)
+
+    def initialize_weights(self):
+        # patch embed
+        self.patch_embed._init_weights()
+        # mask tokens
+        if self.mask_token is not None:
+            torch.nn.init.normal_(self.mask_token, std=0.02)
+        # linears and layer norms
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            # we use xavier_uniform following official JAX ViT:
+            torch.nn.init.xavier_uniform_(m.weight)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+
+    def _encode_image(self, image, do_mask=False, return_all_blocks=False):
+        """
+        image has B x 3 x img_size x img_size
+        do_mask: whether to perform masking or not
+        return_all_blocks: if True, return the features at the end of every block
+                           instead of just the features from the last block (eg for some prediction heads)
+        """
+        # embed the image into patches  (x has size B x Npatches x C)
+        # and get position if each return patch (pos has size B x Npatches x 2)
+        x, pos = self.patch_embed(image)
+        # add positional embedding without cls token
+        if self.enc_pos_embed is not None:
+            x = x + self.enc_pos_embed[None, ...]
+        # apply masking
+        B, N, C = x.size()
+        if do_mask:
+            masks = self.mask_generator(x)
+            x = x[~masks].view(B, -1, C)
+            posvis = pos[~masks].view(B, -1, 2)
+        else:
+            B, N, C = x.size()
+            masks = torch.zeros((B, N), dtype=bool)
+            posvis = pos
+        # now apply the transformer encoder and normalization
+        if return_all_blocks:
+            out = []
+            for blk in self.enc_blocks:
+                x = blk(x, posvis)
+                out.append(x)
+            out[-1] = self.enc_norm(out[-1])
+            return out, pos, masks
+        else:
+            for blk in self.enc_blocks:
+                x = blk(x, posvis)
+            x = self.enc_norm(x)
+            return x, pos, masks
+
+    def _decoder(self, feat1, pos1, masks1, feat2, pos2, return_all_blocks=False):
+        """
+        return_all_blocks: if True, return the features at the end of every block
+                           instead of just the features from the last block (eg for some prediction heads)
+
+        masks1 can be None => assume image1 fully visible
+        """
+        # encoder to decoder layer
+        visf1 = self.decoder_embed(feat1)
+        f2 = self.decoder_embed(feat2)
+        # append masked tokens to the sequence
+        B, Nenc, C = visf1.size()
+        if masks1 is None:  # downstreams
+            f1_ = visf1
+        else:  # pretraining
+            Ntotal = masks1.size(1)
+            f1_ = self.mask_token.repeat(B, Ntotal, 1).to(dtype=visf1.dtype)
+            f1_[~masks1] = visf1.view(B * Nenc, C)
+        # add positional embedding
+        if self.dec_pos_embed is not None:
+            f1_ = f1_ + self.dec_pos_embed
+            f2 = f2 + self.dec_pos_embed
+        # apply Transformer blocks
+        out = f1_
+        out2 = f2
+        if return_all_blocks:
+            _out, out = out, []
+            for blk in self.dec_blocks:
+                _out, out2 = blk(_out, out2, pos1, pos2)
+                out.append(_out)
+            out[-1] = self.dec_norm(out[-1])
+        else:
+            for blk in self.dec_blocks:
+                out, out2 = blk(out, out2, pos1, pos2)
+            out = self.dec_norm(out)
+        return out
+
+    def patchify(self, imgs):
+        """
+        imgs: (B, 3, H, W)
+        x: (B, L, patch_size**2 *3)
+        """
+        p = self.patch_embed.patch_size[0]
+        assert imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % p == 0
+
+        h = w = imgs.shape[2] // p
+        x = imgs.reshape(shape=(imgs.shape[0], 3, h, p, w, p))
+        x = torch.einsum("nchpwq->nhwpqc", x)
+        x = x.reshape(shape=(imgs.shape[0], h * w, p**2 * 3))
+
+        return x
+
+    def unpatchify(self, x, channels=3):
+        """
+        x: (N, L, patch_size**2 *channels)
+        imgs: (N, 3, H, W)
+        """
+        patch_size = self.patch_embed.patch_size[0]
+        h = w = int(x.shape[1] ** 0.5)
+        assert h * w == x.shape[1]
+        x = x.reshape(shape=(x.shape[0], h, w, patch_size, patch_size, channels))
+        x = torch.einsum("nhwpqc->nchpwq", x)
+        imgs = x.reshape(shape=(x.shape[0], channels, h * patch_size, h * patch_size))
+        return imgs
+
+    # def forward(self, img1, img2):
+    # """
+    # img1: tensor of size B x 3 x img_size x img_size
+    # img2: tensor of size B x 3 x img_size x img_size
+
+    # out will be    B x N x (3*patch_size*patch_size)
+    # masks are also returned as B x N just in case
+    # """
+    # # encoder of the masked first image
+    # feat1, pos1, mask1 = self._encode_image(img1, do_mask=True)
+    # # encoder of the second image
+    # feat2, pos2, _ = self._encode_image(img2, do_mask=False)
+    # # decoder
+    # decfeat = self._decoder(feat1, pos1, mask1, feat2, pos2)
+    # # prediction head
+    # out = self.prediction_head(decfeat)
+    # # get target
+    # target = self.patchify(img1)
+    # return out, mask1, target
diff --git a/extern/CUT3R/src/croco/models/croco_downstream.py b/extern/CUT3R/src/croco/models/croco_downstream.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd59dca45d403c16d60610640b4156b151f46c9b
--- /dev/null
+++ b/extern/CUT3R/src/croco/models/croco_downstream.py
@@ -0,0 +1,141 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+# --------------------------------------------------------
+# CroCo model for downstream tasks
+# --------------------------------------------------------
+
+import torch
+
+from .croco import CroCoNet
+
+
+def croco_args_from_ckpt(ckpt):
+    if "croco_kwargs" in ckpt:  # CroCo v2 released models
+        return ckpt["croco_kwargs"]
+    elif "args" in ckpt and hasattr(
+        ckpt["args"], "model"
+    ):  # pretrained using the official code release
+        s = ckpt[
+            "args"
+        ].model  # eg "CroCoNet(enc_embed_dim=1024, enc_num_heads=16, enc_depth=24)"
+        assert s.startswith("CroCoNet(")
+        return eval(
+            "dict" + s[len("CroCoNet") :]
+        )  # transform it into the string of a dictionary and evaluate it
+    else:  # CroCo v1 released models
+        return dict()
+
+
+class CroCoDownstreamMonocularEncoder(CroCoNet):
+
+    def __init__(self, head, **kwargs):
+        """Build network for monocular downstream task, only using the encoder.
+        It takes an extra argument head, that is called with the features
+          and a dictionary img_info containing 'width' and 'height' keys
+        The head is setup with the croconet arguments in this init function
+        NOTE: It works by *calling super().__init__() but with redefined setters
+
+        """
+        super(CroCoDownstreamMonocularEncoder, self).__init__(**kwargs)
+        head.setup(self)
+        self.head = head
+
+    def _set_mask_generator(self, *args, **kwargs):
+        """No mask generator"""
+        return
+
+    def _set_mask_token(self, *args, **kwargs):
+        """No mask token"""
+        self.mask_token = None
+        return
+
+    def _set_decoder(self, *args, **kwargs):
+        """No decoder"""
+        return
+
+    def _set_prediction_head(self, *args, **kwargs):
+        """No 'prediction head' for downstream tasks."""
+        return
+
+    def forward(self, img):
+        """
+        img if of size batch_size x 3 x h x w
+        """
+        B, C, H, W = img.size()
+        img_info = {"height": H, "width": W}
+        need_all_layers = (
+            hasattr(self.head, "return_all_blocks") and self.head.return_all_blocks
+        )
+        out, _, _ = self._encode_image(
+            img, do_mask=False, return_all_blocks=need_all_layers
+        )
+        return self.head(out, img_info)
+
+
+class CroCoDownstreamBinocular(CroCoNet):
+
+    def __init__(self, head, **kwargs):
+        """Build network for binocular downstream task
+        It takes an extra argument head, that is called with the features
+          and a dictionary img_info containing 'width' and 'height' keys
+        The head is setup with the croconet arguments in this init function
+        """
+        super(CroCoDownstreamBinocular, self).__init__(**kwargs)
+        head.setup(self)
+        self.head = head
+
+    def _set_mask_generator(self, *args, **kwargs):
+        """No mask generator"""
+        return
+
+    def _set_mask_token(self, *args, **kwargs):
+        """No mask token"""
+        self.mask_token = None
+        return
+
+    def _set_prediction_head(self, *args, **kwargs):
+        """No prediction head for downstream tasks, define your own head"""
+        return
+
+    def encode_image_pairs(self, img1, img2, return_all_blocks=False):
+        """run encoder for a pair of images
+        it is actually ~5% faster to concatenate the images along the batch dimension
+         than to encode them separately
+        """
+        ## the two commented lines below is the naive version with separate encoding
+        # out, pos, _ = self._encode_image(img1, do_mask=False, return_all_blocks=return_all_blocks)
+        # out2, pos2, _ = self._encode_image(img2, do_mask=False, return_all_blocks=False)
+        ## and now the faster version
+        out, pos, _ = self._encode_image(
+            torch.cat((img1, img2), dim=0),
+            do_mask=False,
+            return_all_blocks=return_all_blocks,
+        )
+        if return_all_blocks:
+            out, out2 = list(map(list, zip(*[o.chunk(2, dim=0) for o in out])))
+            out2 = out2[-1]
+        else:
+            out, out2 = out.chunk(2, dim=0)
+        pos, pos2 = pos.chunk(2, dim=0)
+        return out, out2, pos, pos2
+
+    def forward(self, img1, img2):
+        B, C, H, W = img1.size()
+        img_info = {"height": H, "width": W}
+        return_all_blocks = (
+            hasattr(self.head, "return_all_blocks") and self.head.return_all_blocks
+        )
+        out, out2, pos, pos2 = self.encode_image_pairs(
+            img1, img2, return_all_blocks=return_all_blocks
+        )
+        if return_all_blocks:
+            decout = self._decoder(
+                out[-1], pos, None, out2, pos2, return_all_blocks=return_all_blocks
+            )
+            decout = out + decout
+        else:
+            decout = self._decoder(
+                out, pos, None, out2, pos2, return_all_blocks=return_all_blocks
+            )
+        return self.head(decout, img_info)
diff --git a/extern/CUT3R/src/croco/models/curope/__init__.py b/extern/CUT3R/src/croco/models/curope/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..25e3d48a162760260826080f6366838e83e26878
--- /dev/null
+++ b/extern/CUT3R/src/croco/models/curope/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+from .curope2d import cuRoPE2D
diff --git a/extern/CUT3R/src/croco/models/curope/curope.cpp b/extern/CUT3R/src/croco/models/curope/curope.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8fe9058e05aa1bf3f37b0d970edc7312bc68455b
--- /dev/null
+++ b/extern/CUT3R/src/croco/models/curope/curope.cpp
@@ -0,0 +1,69 @@
+/* 
+  Copyright (C) 2022-present Naver Corporation. All rights reserved.
+  Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+*/
+
+#include <torch/extension.h>
+
+// forward declaration
+void rope_2d_cuda( torch::Tensor tokens, const torch::Tensor pos, const float base, const float fwd );
+
+// CPU implementation of the 2D rotary embedding, applied in place on `tokens`.
+// tokens is indexed as (B, N, H, 4*Q) float, positions as (B, N, 2) int64.
+// For each of the two position axes, the matching half of the last dimension
+// is split into two runs of Q values (u, v) that are rotated pairwise by an
+// angle fwd * position / base^(d/Q).
+void rope_2d_cpu( torch::Tensor tokens, const torch::Tensor positions, const float base, const float fwd )
+{
+    const int batch    = tokens.size(0);
+    const int seq_len  = tokens.size(1);
+    const int heads    = tokens.size(2);
+    const int quarter  = tokens.size(3) / 4;
+
+    auto tok = tokens.accessor<float, 4>();
+    auto pos = positions.accessor<int64_t, 3>();
+
+    for (int b = 0; b < batch; b++) {
+      for (int axis = 0; axis < 2; axis++) { // y and then x (2d)
+        // first index of the half of the token owned by this axis
+        const int start = axis * 2 * quarter;
+
+        for (int n = 0; n < seq_len; n++) {
+            // position of this token along the current axis
+            const int p = pos[b][n][axis];
+
+            for (int h = 0; h < heads; h++) {
+                for (int d = 0; d < quarter; d++) {
+                    const int iu = start + d;            // index of u
+                    const int iv = start + d + quarter;  // index of v
+
+                    const float u = tok[b][n][h][iu];
+                    const float v = tok[b][n][h][iv];
+
+                    // rotation angle and its cos,sin
+                    const float angle = fwd * p / powf(base, d/float(quarter));
+                    const float c = cosf(angle);
+                    const float s = sinf(angle);
+
+                    // rotate the (u, v) pair
+                    tok[b][n][h][iu] = u*c - v*s;
+                    tok[b][n][h][iv] = v*c + u*s;
+                }
+            }
+        }
+      }
+    }
+}
+
+// Entry point exposed to Python: validates the arguments, then dispatches to
+// the CUDA or the CPU implementation depending on where `tokens` lives.
+// The tokens are modified in place.
+void rope_2d( torch::Tensor tokens,     // B,N,H,D
+        const torch::Tensor positions,  // B,N,2
+        const float base, 
+        const float fwd )
+{
+    // shape checks
+    TORCH_CHECK(tokens.dim() == 4, "tokens must have 4 dimensions");
+    TORCH_CHECK(positions.dim() == 3, "positions must have 3 dimensions");
+    TORCH_CHECK(tokens.size(0) == positions.size(0), "batch size differs between tokens & positions");
+    TORCH_CHECK(tokens.size(1) == positions.size(1), "seq_length differs between tokens & positions");
+    TORCH_CHECK(positions.size(2) == 2, "positions.shape[2] must be equal to 2");
+
+    // device check
+    const bool on_gpu = tokens.is_cuda();
+    TORCH_CHECK(on_gpu == positions.is_cuda(), "tokens and positions are not on the same device" );
+
+    if (!on_gpu) {
+        rope_2d_cpu( tokens, positions, base, fwd );
+        return;
+    }
+    rope_2d_cuda( tokens, positions, base, fwd );
+}
+
+// Python bindings: registers the rope_2d dispatcher on the extension module
+// under the name "rope_2d".
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("rope_2d", &rope_2d, "RoPE 2d forward/backward");
+}
diff --git a/extern/CUT3R/src/croco/models/curope/curope2d.py b/extern/CUT3R/src/croco/models/curope/curope2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e0345c31bd3925be91dde5b9cfc64432f7bf516
--- /dev/null
+++ b/extern/CUT3R/src/croco/models/curope/curope2d.py
@@ -0,0 +1,40 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+import torch
+
+try:
+    import curope as _kernels  # run `python setup.py install`
+except ModuleNotFoundError:
+    from . import curope as _kernels  # run `python setup.py build_ext --inplace`
+
+
+class cuRoPE2D_func(torch.autograd.Function):
+    """Autograd function wrapping the compiled ``rope_2d`` kernel.
+
+    Both passes call the kernel directly on the tensor they receive and
+    return that same tensor, i.e. the operation is performed in place.
+    """
+
+    @staticmethod
+    def forward(ctx, tokens, positions, base, F0=1):
+        """Apply ``rope_2d`` to ``tokens`` and return ``tokens``.
+
+        ``positions`` is saved for the backward pass, together with ``base``
+        and ``F0`` which are stored as plain attributes on ``ctx``.
+        """
+        ctx.save_for_backward(positions)
+        ctx.saved_base = base
+        ctx.saved_F0 = F0
+        # tokens = tokens.clone() # uncomment this if inplace doesn't work
+        _kernels.rope_2d(tokens, positions, base, F0)
+        # tell autograd that the input tensor was modified in place
+        ctx.mark_dirty(tokens)
+        return tokens
+
+    @staticmethod
+    def backward(ctx, grad_res):
+        """Run the same kernel on the incoming gradient with ``-F0``.
+
+        Only ``tokens`` receives a gradient; the other three inputs get None.
+        """
+        positions, base, F0 = ctx.saved_tensors[0], ctx.saved_base, ctx.saved_F0
+        # the sign of F0 is flipped with respect to the forward call
+        _kernels.rope_2d(grad_res, positions, base, -F0)
+        # NOTE(review): grad_res is overwritten in place here; confirm that no
+        # other consumer relies on the incoming gradient staying unchanged.
+        ctx.mark_dirty(grad_res)
+        return grad_res, None, None, None
+
+
+class cuRoPE2D(torch.nn.Module):
+    def __init__(self, freq=100.0, F0=1.0):
+        super().__init__()
+        self.base = freq
+        self.F0 = F0
+
+    def forward(self, tokens, positions):
+        cuRoPE2D_func.apply(tokens.transpose(1, 2), positions, self.base, self.F0)
+        return tokens
diff --git a/extern/CUT3R/src/croco/models/curope/kernels.cu b/extern/CUT3R/src/croco/models/curope/kernels.cu
new file mode 100644
index 0000000000000000000000000000000000000000..bf777c25d7dd9fd3c70c25e0a7623799bc1434f5
--- /dev/null
+++ b/extern/CUT3R/src/croco/models/curope/kernels.cu
@@ -0,0 +1,108 @@
+/* 
+  Copyright (C) 2022-present Naver Corporation. All rights reserved.
+  Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+*/
+
+#include <torch/extension.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <vector>
+
+// CHECK_CUDA(tensor): raises through TORCH_CHECK unless `tensor` is a CUDA
+// tensor and is contiguous.
+// CHECK_KERNEL(): raises through TORCH_CHECK, with the CUDA error string as
+// the message, if cudaGetLastError() reports anything other than cudaSuccess.
+#define CHECK_CUDA(tensor) {\
+    TORCH_CHECK((tensor).is_cuda(), #tensor " is not in cuda memory"); \
+    TORCH_CHECK((tensor).is_contiguous(), #tensor " is not contiguous"); }
+void CHECK_KERNEL() {auto error = cudaGetLastError(); TORCH_CHECK( error == cudaSuccess, cudaGetErrorString(error));}
+
+
+// In-place 2D rotary embedding kernel.
+//
+// Launch layout (see rope_2d_cuda): one block per (batch, token) pair and one
+// thread per value of the head dimension D, with (D + D/4) floats of dynamic
+// shared memory. The first D floats stage the values of the head being
+// processed, the following D/4 floats hold the per-frequency factors.
+//
+// tokens : (B, N, H, D) accessor, updated in place
+// pos    : flat int64 buffer read as pos[(b*N + n)*2 + axis]
+// base   : base of the frequencies
+// fwd    : multiplier applied to every frequency
+template < typename scalar_t  >
+__global__ void rope_2d_cuda_kernel( 
+        //scalar_t* __restrict__ tokens, 
+        torch::PackedTensorAccessor32<scalar_t,4,torch::RestrictPtrTraits> tokens,
+        const int64_t* __restrict__ pos, 
+        const float base, 
+        const float fwd )
+        // const int N, const int H, const int D )
+{
+    // tokens shape = (B, N, H, D)
+    const int N = tokens.size(1);
+    const int H = tokens.size(2);
+    const int D = tokens.size(3);
+    
+    // each block update a single token, for all heads
+    // each thread takes care of a single output
+    extern __shared__ float shared[];
+    float* shared_inv_freq = shared + D;
+
+    const int b = blockIdx.x / N;
+    const int n = blockIdx.x % N;
+
+    const int Q = D / 4; 
+    // one token = [0..Q : Q..2Q : 2Q..3Q : 3Q..D]
+    //              u_Y     v_Y     u_X      v_X
+
+    // shared memory: first, compute inv_freq
+    if (threadIdx.x < Q)
+        shared_inv_freq[threadIdx.x] = fwd / powf(base, threadIdx.x/float(Q));
+    __syncthreads();
+
+    // start of X or Y part
+    const int X = threadIdx.x < D/2 ? 0 : 1; 
+    const int m = (X*D/2) + (threadIdx.x % Q);   // index of u_Y or u_X
+
+    // grab the cos,sin appropriate for me
+    const float freq = pos[blockIdx.x*2+X] * shared_inv_freq[threadIdx.x % Q];
+    const float cos = cosf(freq);
+    const float sin = sinf(freq);
+    /*
+    float* shared_cos_sin = shared + D + D/4;
+    if ((threadIdx.x % (D/2)) < Q)
+        shared_cos_sin[m+0] = cosf(freq);
+    else
+        shared_cos_sin[m+Q] = sinf(freq);
+    __syncthreads();
+    const float cos = shared_cos_sin[m+0];
+    const float sin = shared_cos_sin[m+Q];
+    */
+
+    for (int h = 0; h < H; h++)
+    {
+        // then, load all the token for this head in shared memory
+        shared[threadIdx.x] = tokens[b][n][h][threadIdx.x];
+        __syncthreads();
+
+        // u and v are slots written by this thread and by its partner thread
+        const float u = shared[m];
+        const float v = shared[m+Q];
+        
+        // write output
+        if ((threadIdx.x % (D/2)) < Q)
+            tokens[b][n][h][threadIdx.x] = u*cos - v*sin;
+        else
+            tokens[b][n][h][threadIdx.x] = v*cos + u*sin;
+
+        // Every thread must be done reading this head's staged values before
+        // any thread overwrites the shared buffer with the next head;
+        // without this barrier a fast thread could clobber the slot its
+        // partner has not read yet.
+        __syncthreads();
+    }
+}
+
+// Host-side launcher of rope_2d_cuda_kernel.
+//
+// tokens : (B, N, H, D) tensor, modified in place; its last two dimensions
+//          must be densely packed and D must be a multiple of 4
+// pos    : contiguous (B, N, 2) int64 tensor
+// base   : base of the frequencies
+// fwd    : multiplier applied to every frequency
+//
+// Raises (through TORCH_CHECK) when a layout requirement is not met or when
+// CUDA reports an error for the launch.
+void rope_2d_cuda( torch::Tensor tokens, const torch::Tensor pos, const float base, const float fwd ) 
+{
+    const int B = tokens.size(0); // batch size
+    const int N = tokens.size(1); // sequence length
+    const int H = tokens.size(2); // number of heads
+    const int D = tokens.size(3); // dimension per head
+
+    TORCH_CHECK(tokens.stride(3) == 1 && tokens.stride(2) == D, "tokens are not contiguous");
+    TORCH_CHECK(pos.is_contiguous(), "positions are not contiguous");
+    TORCH_CHECK(pos.size(0) == B && pos.size(1) == N && pos.size(2) == 2, "bad pos.shape");
+    TORCH_CHECK(D % 4 == 0, "token dim must be multiple of 4");
+
+    // one block for each (batch, token) pair, one thread per value of the head dimension
+    const int THREADS_PER_BLOCK = D;
+    const int N_BLOCKS = B * N; // each block takes care of H*D values
+    // D floats to stage one head + D/4 floats for the frequency factors
+    const int SHARED_MEM = sizeof(float) * (D + D/4);
+
+    AT_DISPATCH_FLOATING_TYPES_AND_HALF(tokens.scalar_type(), "rope_2d_cuda", ([&] {
+        rope_2d_cuda_kernel<scalar_t> <<<N_BLOCKS, THREADS_PER_BLOCK, SHARED_MEM>>> (
+            //tokens.data_ptr<scalar_t>(), 
+            tokens.packed_accessor32<scalar_t,4,torch::RestrictPtrTraits>(),
+            pos.data_ptr<int64_t>(), 
+            base, fwd); //, N, H, D );
+    }));
+
+    // Surface launch failures (eg a head dimension larger than the maximum
+    // number of threads per block) instead of silently leaving tokens untouched.
+    CHECK_KERNEL();
+}
diff --git a/extern/CUT3R/src/croco/models/curope/setup.py b/extern/CUT3R/src/croco/models/curope/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..02ddb0912370a67a49fd2bb91164cf2f1da8648e
--- /dev/null
+++ b/extern/CUT3R/src/croco/models/curope/setup.py
@@ -0,0 +1,34 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+from setuptools import setup
+from torch import cuda
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+# compile for all possible CUDA architectures
+all_cuda_archs = cuda.get_gencode_flags().replace("compute=", "arch=").split()
+# alternatively, you can list cuda archs that you want, eg:
+# all_cuda_archs = [
+# '-gencode', 'arch=compute_70,code=sm_70',
+# '-gencode', 'arch=compute_75,code=sm_75',
+# '-gencode', 'arch=compute_80,code=sm_80',
+# '-gencode', 'arch=compute_86,code=sm_86'
+# ]
+
+setup(
+    name="curope",
+    ext_modules=[
+        CUDAExtension(
+            name="curope",
+            sources=[
+                "curope.cpp",
+                "kernels.cu",
+            ],
+            extra_compile_args=dict(
+                nvcc=["-O3", "--ptxas-options=-v", "--use_fast_math"] + all_cuda_archs,
+                cxx=["-O3"],
+            ),
+        )
+    ],
+    cmdclass={"build_ext": BuildExtension},
+)
diff --git a/extern/CUT3R/src/croco/models/dpt_block.py b/extern/CUT3R/src/croco/models/dpt_block.py
new file mode 100644
index 0000000000000000000000000000000000000000..b470d91c9c86af8f3b3947e3abcf96d49ab3e06d
--- /dev/null
+++ b/extern/CUT3R/src/croco/models/dpt_block.py
@@ -0,0 +1,513 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+# --------------------------------------------------------
+# DPT head for ViTs
+# --------------------------------------------------------
+# References:
+# https://github.com/isl-org/DPT
+# https://github.com/EPFL-VILAB/MultiMAE/blob/main/multimae/output_adapters.py
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from typing import Union, Tuple, Iterable, List, Optional, Dict
+
+
+def pair(t):  # a tuple is returned unchanged; any other value becomes (t, t)
+    return t if isinstance(t, tuple) else (t, t)
+
+
+def make_scratch(in_shape, out_shape, groups=1, expand=False):  # builds four 3x3 bias-free projection convs
+    scratch = nn.Module()  # bare container; the convs below are attached as attributes
+
+    out_shape1 = out_shape
+    out_shape2 = out_shape
+    out_shape3 = out_shape
+    out_shape4 = out_shape
+    if expand == True:  # widen the output channels per level: x1, x2, x4, x8
+        out_shape1 = out_shape
+        out_shape2 = out_shape * 2
+        out_shape3 = out_shape * 4
+        out_shape4 = out_shape * 8
+
+    scratch.layer1_rn = nn.Conv2d(  # in_shape[i] gives the input channels of level i
+        in_shape[0],
+        out_shape1,
+        kernel_size=3,
+        stride=1,
+        padding=1,
+        bias=False,
+        groups=groups,
+    )
+    scratch.layer2_rn = nn.Conv2d(
+        in_shape[1],
+        out_shape2,
+        kernel_size=3,
+        stride=1,
+        padding=1,
+        bias=False,
+        groups=groups,
+    )
+    scratch.layer3_rn = nn.Conv2d(
+        in_shape[2],
+        out_shape3,
+        kernel_size=3,
+        stride=1,
+        padding=1,
+        bias=False,
+        groups=groups,
+    )
+    scratch.layer4_rn = nn.Conv2d(
+        in_shape[3],
+        out_shape4,
+        kernel_size=3,
+        stride=1,
+        padding=1,
+        bias=False,
+        groups=groups,
+    )
+
+    scratch.layer_rn = nn.ModuleList(  # the same four convs again, exposed for access by index
+        [
+            scratch.layer1_rn,
+            scratch.layer2_rn,
+            scratch.layer3_rn,
+            scratch.layer4_rn,
+        ]
+    )
+
+    return scratch
+
+
+class ResidualConvUnit_custom(nn.Module):
+    """Residual unit: two (activation, 3x3 conv, optional batch norm) stages added back to the input."""
+
+    def __init__(self, features, activation, bn):
+        """Init.
+        Args:
+            features (int): number of features (input and output channels of both convs)
+        """
+        super().__init__()
+
+        self.bn = bn
+
+        self.groups = 1
+
+        self.conv1 = nn.Conv2d(
+            features,
+            features,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            bias=not self.bn,  # the conv carries no bias when batch norm is enabled
+            groups=self.groups,
+        )
+
+        self.conv2 = nn.Conv2d(
+            features,
+            features,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            bias=not self.bn,
+            groups=self.groups,
+        )
+
+        if self.bn == True:
+            self.bn1 = nn.BatchNorm2d(features)
+            self.bn2 = nn.BatchNorm2d(features)
+
+        self.activation = activation  # applied before each conv in forward
+
+        self.skip_add = nn.quantized.FloatFunctional()  # used for the residual addition
+
+    def forward(self, x):
+        """Forward pass.
+        Args:
+            x (tensor): input
+        Returns:
+            tensor: the transformed branch added to x via skip_add
+        """
+
+        out = self.activation(x)
+        out = self.conv1(out)
+        if self.bn == True:
+            out = self.bn1(out)
+
+        out = self.activation(out)
+        out = self.conv2(out)
+        if self.bn == True:
+            out = self.bn2(out)
+
+        if self.groups > 1:  # never taken here: groups is fixed to 1 in __init__
+            out = self.conv_merge(out)  # NOTE(review): conv_merge is not defined in this class
+
+        return self.skip_add.add(out, x)
+
+
+class FeatureFusionBlock_custom(nn.Module):
+    """Feature fusion block: optional skip fusion, residual refinement, upsampling, 1x1 projection."""
+
+    def __init__(
+        self,
+        features,
+        activation,
+        deconv=False,
+        bn=False,
+        expand=False,
+        align_corners=True,
+        width_ratio=1,
+    ):
+        """Init.
+        Args:
+            features (int): number of input channels (output is features // 2 when expand is True)
+        """
+        super(FeatureFusionBlock_custom, self).__init__()
+        self.width_ratio = width_ratio
+
+        self.deconv = deconv  # stored only; not read anywhere else in this class
+        self.align_corners = align_corners
+
+        self.groups = 1
+
+        self.expand = expand
+        out_features = features
+        if self.expand == True:
+            out_features = features // 2
+
+        self.out_conv = nn.Conv2d(
+            features,
+            out_features,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=True,
+            groups=1,
+        )
+
+        self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)  # applied to xs[1]
+        self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)  # applied to the fused map
+
+        self.skip_add = nn.quantized.FloatFunctional()
+
+    def forward(self, *xs):
+        """Fuse xs[0] with an optional second input xs[1], refine, upsample and project.
+        Returns:
+            tensor: output of out_conv, applied after the upsampling step
+        """
+        output = xs[0]
+
+        if len(xs) == 2:
+            res = self.resConfUnit1(xs[1])
+            if self.width_ratio != 1:  # resize res to output's spatial size before adding
+                res = F.interpolate(
+                    res, size=(output.shape[2], output.shape[3]), mode="bilinear"
+                )
+
+            output = self.skip_add.add(output, res)
+            # output += res
+
+        output = self.resConfUnit2(output)
+
+        if self.width_ratio != 1:
+            # height is always doubled; the new width is 3x the current width while the
+            # width/height ratio is below (2/3) * width_ratio, else width_ratio * (2 * height)
+            if (output.shape[3] / output.shape[2]) < (2 / 3) * self.width_ratio:
+                shape = 3 * output.shape[3]
+            else:
+                shape = int(self.width_ratio * 2 * output.shape[2])
+            output = F.interpolate(  # NOTE: align_corners is not passed on this path
+                output, size=(2 * output.shape[2], shape), mode="bilinear"
+            )
+        else:
+            output = nn.functional.interpolate(
+                output,
+                scale_factor=2,
+                mode="bilinear",
+                align_corners=self.align_corners,
+            )
+        output = self.out_conv(output)
+        return output
+
+
+def make_fusion_block(features, use_bn, width_ratio=1):  # fusion block with fixed deconv/expand/align_corners
+    return FeatureFusionBlock_custom(
+        features,
+        nn.ReLU(False),  # positional False is ReLU's inplace flag
+        deconv=False,
+        bn=use_bn,
+        expand=False,
+        align_corners=True,
+        width_ratio=width_ratio,
+    )
+
+
+class Interpolate(nn.Module):
+    """Module wrapper around nn.functional.interpolate with fixed scale_factor, mode and align_corners."""
+
+    def __init__(self, scale_factor, mode, align_corners=False):
+        """Init.
+        Args:
+            scale_factor (float): scaling
+            mode (str): interpolation mode
+        """
+        super(Interpolate, self).__init__()
+
+        self.interp = nn.functional.interpolate  # the function itself is stored and called in forward
+        self.scale_factor = scale_factor
+        self.mode = mode
+        self.align_corners = align_corners  # forwarded unchanged to interpolate
+
+    def forward(self, x):
+        """Forward pass.
+        Args:
+            x (tensor): input
+        Returns:
+            tensor: interpolated data
+        """
+
+        x = self.interp(
+            x,
+            scale_factor=self.scale_factor,
+            mode=self.mode,
+            align_corners=self.align_corners,
+        )
+
+        return x
+
+
+class DPTOutputAdapter(nn.Module):
+    """DPT output adapter.
+
+    :param num_channels: Number of output channels
+    :param stride_level: Stride level compared to the full-sized image.
+        E.g. 4 for 1/4th the size of the image.
+    :param patch_size: Int or tuple of the patch size over the full image size.
+        Patch size for smaller inputs will be computed accordingly.
+    :param hooks: Index of intermediate layers
+    :param layer_dims: Dimension of intermediate layers
+    :param feature_dim: Feature dimension
+    :param last_dim: out_channels/in_channels for the last two Conv2d when head_type == regression
+    :param use_bn: If set to True, activates batch norm
+    :param dim_tokens_enc:  Dimension of tokens coming from encoder
+    """
+
+    def __init__(
+        self,
+        num_channels: int = 1,
+        stride_level: int = 1,
+        patch_size: Union[int, Tuple[int, int]] = 16,
+        main_tasks: Iterable[str] = ("rgb",),
+        hooks: List[int] = [2, 5, 8, 11],
+        layer_dims: List[int] = [96, 192, 384, 768],
+        feature_dim: int = 256,
+        last_dim: int = 32,
+        use_bn: bool = False,
+        dim_tokens_enc: Optional[int] = None,
+        head_type: str = "regression",
+        output_width_ratio=1,
+        **kwargs
+    ):
+        super().__init__()
+        self.num_channels = num_channels
+        self.stride_level = stride_level
+        self.patch_size = pair(patch_size)
+        self.main_tasks = main_tasks
+        self.hooks = hooks  # NOTE: the default list objects are stored as-is, not copied
+        self.layer_dims = layer_dims
+        self.feature_dim = feature_dim
+        self.dim_tokens_enc = (
+            dim_tokens_enc * len(self.main_tasks)
+            if dim_tokens_enc is not None
+            else None
+        )
+        self.head_type = head_type
+
+        # Actual patch height and width, taking into account stride of input
+        self.P_H = max(1, self.patch_size[0] // stride_level)
+        self.P_W = max(1, self.patch_size[1] // stride_level)
+
+        self.scratch = make_scratch(layer_dims, feature_dim, groups=1, expand=False)
+
+        self.scratch.refinenet1 = make_fusion_block(
+            feature_dim, use_bn, output_width_ratio
+        )
+        self.scratch.refinenet2 = make_fusion_block(
+            feature_dim, use_bn, output_width_ratio
+        )
+        self.scratch.refinenet3 = make_fusion_block(
+            feature_dim, use_bn, output_width_ratio
+        )
+        self.scratch.refinenet4 = make_fusion_block(
+            feature_dim, use_bn, output_width_ratio
+        )
+
+        if self.head_type == "regression":
+            # The "DPTDepthModel" head
+            self.head = nn.Sequential(
+                nn.Conv2d(
+                    feature_dim, feature_dim // 2, kernel_size=3, stride=1, padding=1
+                ),
+                Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
+                nn.Conv2d(
+                    feature_dim // 2, last_dim, kernel_size=3, stride=1, padding=1
+                ),
+                nn.ReLU(True),
+                nn.Conv2d(
+                    last_dim, self.num_channels, kernel_size=1, stride=1, padding=0
+                ),
+            )
+        elif self.head_type == "semseg":
+            # The "DPTSegmentationModel" head
+            self.head = nn.Sequential(
+                nn.Conv2d(
+                    feature_dim, feature_dim, kernel_size=3, padding=1, bias=False
+                ),
+                nn.BatchNorm2d(feature_dim) if use_bn else nn.Identity(),
+                nn.ReLU(True),
+                nn.Dropout(0.1, False),
+                nn.Conv2d(feature_dim, self.num_channels, kernel_size=1),
+                Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
+            )
+        else:
+            raise ValueError('DPT head_type must be "regression" or "semseg".')
+
+        if self.dim_tokens_enc is not None:
+            self.init(dim_tokens_enc=dim_tokens_enc)  # raw value: init() applies the main_tasks factor itself
+
+    def init(self, dim_tokens_enc=768):
+        """
+        Initialize parts of decoder that are dependent on dimension of encoder tokens.
+        Should be called when setting up MultiMAE.
+
+        :param dim_tokens_enc: Dimension of tokens coming from encoder
+        """
+        # print(dim_tokens_enc)
+
+        # Set up activation postprocessing layers
+        if isinstance(dim_tokens_enc, int):  # a single int is repeated for all four hooked layers
+            dim_tokens_enc = 4 * [dim_tokens_enc]
+
+        self.dim_tokens_enc = [dt * len(self.main_tasks) for dt in dim_tokens_enc]
+
+        self.act_1_postprocess = nn.Sequential(  # 1x1 conv, then transposed conv with kernel 4 / stride 4
+            nn.Conv2d(
+                in_channels=self.dim_tokens_enc[0],
+                out_channels=self.layer_dims[0],
+                kernel_size=1,
+                stride=1,
+                padding=0,
+            ),
+            nn.ConvTranspose2d(
+                in_channels=self.layer_dims[0],
+                out_channels=self.layer_dims[0],
+                kernel_size=4,
+                stride=4,
+                padding=0,
+                bias=True,
+                dilation=1,
+                groups=1,
+            ),
+        )
+
+        self.act_2_postprocess = nn.Sequential(  # 1x1 conv, then transposed conv with kernel 2 / stride 2
+            nn.Conv2d(
+                in_channels=self.dim_tokens_enc[1],
+                out_channels=self.layer_dims[1],
+                kernel_size=1,
+                stride=1,
+                padding=0,
+            ),
+            nn.ConvTranspose2d(
+                in_channels=self.layer_dims[1],
+                out_channels=self.layer_dims[1],
+                kernel_size=2,
+                stride=2,
+                padding=0,
+                bias=True,
+                dilation=1,
+                groups=1,
+            ),
+        )
+
+        self.act_3_postprocess = nn.Sequential(  # 1x1 conv only
+            nn.Conv2d(
+                in_channels=self.dim_tokens_enc[2],
+                out_channels=self.layer_dims[2],
+                kernel_size=1,
+                stride=1,
+                padding=0,
+            )
+        )
+
+        self.act_4_postprocess = nn.Sequential(  # 1x1 conv, then a 3x3 conv with stride 2
+            nn.Conv2d(
+                in_channels=self.dim_tokens_enc[3],
+                out_channels=self.layer_dims[3],
+                kernel_size=1,
+                stride=1,
+                padding=0,
+            ),
+            nn.Conv2d(
+                in_channels=self.layer_dims[3],
+                out_channels=self.layer_dims[3],
+                kernel_size=3,
+                stride=2,
+                padding=1,
+            ),
+        )
+
+        self.act_postprocess = nn.ModuleList(
+            [
+                self.act_1_postprocess,
+                self.act_2_postprocess,
+                self.act_3_postprocess,
+                self.act_4_postprocess,
+            ]
+        )
+
+    def adapt_tokens(self, encoder_tokens):
+        # Adapt tokens: the full slice keeps every token, and the cat has a single element
+        x = []
+        x.append(encoder_tokens[:, :])
+        x = torch.cat(x, dim=-1)
+        return x
+
+    def forward(self, encoder_tokens: List[torch.Tensor], image_size):
+        # encoder_tokens is indexed by the entries of self.hooks; image_size unpacks to (H, W)
+        assert (
+            self.dim_tokens_enc is not None
+        ), "Need to call init(dim_tokens_enc) function first"
+        H, W = image_size
+
+        # Number of patches in height and width
+        N_H = H // (self.stride_level * self.P_H)
+        N_W = W // (self.stride_level * self.P_W)
+
+        # Hook decoder onto 4 layers from specified ViT layers
+        layers = [encoder_tokens[hook] for hook in self.hooks]
+
+        # Pass tokens through adapt_tokens (which currently keeps all of them)
+        layers = [self.adapt_tokens(l) for l in layers]
+
+        # Reshape tokens to spatial representation
+        layers = [
+            rearrange(l, "b (nh nw) c -> b c nh nw", nh=N_H, nw=N_W) for l in layers
+        ]
+
+        layers = [self.act_postprocess[idx](l) for idx, l in enumerate(layers)]
+        # Project layers to chosen feature dim
+        layers = [self.scratch.layer_rn[idx](l) for idx, l in enumerate(layers)]
+
+        # Fuse layers using refinement stages
+        path_4 = self.scratch.refinenet4(layers[3])
+        path_3 = self.scratch.refinenet3(path_4, layers[2])
+        path_2 = self.scratch.refinenet2(path_3, layers[1])
+        path_1 = self.scratch.refinenet1(path_2, layers[0])
+
+        # Output head
+        out = self.head(path_1)
+
+        return out
diff --git a/extern/CUT3R/src/croco/models/head_downstream.py b/extern/CUT3R/src/croco/models/head_downstream.py
new file mode 100644
index 0000000000000000000000000000000000000000..384afcbd6ac9d4b5729c0219dd8534b5123d2b17
--- /dev/null
+++ b/extern/CUT3R/src/croco/models/head_downstream.py
@@ -0,0 +1,83 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+# --------------------------------------------------------
+# Heads for downstream tasks
+# --------------------------------------------------------
+
+"""
+A head is a module where the __init__ defines only the head hyperparameters.
+A method setup(croconet) takes a CroCoNet and sets all layers according to the head and croconet attributes.
+The forward takes the features as well as a dictionary img_info containing the keys 'width' and 'height'
+"""
+
+import torch
+import torch.nn as nn
+from .dpt_block import DPTOutputAdapter
+
+
+class PixelwiseTaskWithDPT(nn.Module):
+    """DPT module for CroCo.
+    by default, hooks_idx will be equal to:
+    * for encoder-only: 4 equally spread layers
+    * for encoder+decoder: last encoder + 3 equally spread layers of the decoder
+    """
+
+    def __init__(
+        self,
+        *,
+        hooks_idx=None,
+        layer_dims=[96, 192, 384, 768],
+        output_width_ratio=1,
+        num_channels=1,
+        postprocess=None,
+        **kwargs,
+    ):
+        super(PixelwiseTaskWithDPT, self).__init__()
+        self.return_all_blocks = True  # backbone needs to return all layers
+        self.postprocess = postprocess  # optional callable applied to the DPT output in forward
+        self.output_width_ratio = output_width_ratio
+        self.num_channels = num_channels
+        self.hooks_idx = hooks_idx  # None means "derive from the network depths in setup()"
+        self.layer_dims = layer_dims
+
+    def setup(self, croconet):
+        dpt_args = {
+            "output_width_ratio": self.output_width_ratio,
+            "num_channels": self.num_channels,
+        }
+        if self.hooks_idx is None:
+            if hasattr(croconet, "dec_blocks"):  # encoder + decoder
+                step = {8: 3, 12: 4, 24: 8}[croconet.dec_depth]  # KeyError for any other dec_depth
+                hooks_idx = [
+                    croconet.dec_depth + croconet.enc_depth - 1 - i * step
+                    for i in range(3, -1, -1)
+                ]
+            else:  # encoder only
+                step = croconet.enc_depth // 4
+                hooks_idx = [
+                    croconet.enc_depth - 1 - i * step for i in range(3, -1, -1)
+                ]
+            self.hooks_idx = hooks_idx
+            print(
+                f"  PixelwiseTaskWithDPT: automatically setting hook_idxs={self.hooks_idx}"
+            )
+        dpt_args["hooks"] = self.hooks_idx
+        dpt_args["layer_dims"] = self.layer_dims
+        self.dpt = DPTOutputAdapter(**dpt_args)
+        dim_tokens = [  # hooks below enc_depth use the encoder width, the others the decoder width
+            (
+                croconet.enc_embed_dim
+                if hook < croconet.enc_depth
+                else croconet.dec_embed_dim
+            )
+            for hook in self.hooks_idx
+        ]
+        dpt_init_args = {"dim_tokens_enc": dim_tokens}
+        self.dpt.init(**dpt_init_args)
+
+    def forward(self, x, img_info):
+        out = self.dpt(x, image_size=(img_info["height"], img_info["width"]))  # (H, W) order
+        if self.postprocess:
+            out = self.postprocess(out)
+        return out
diff --git a/extern/CUT3R/src/croco/models/masking.py b/extern/CUT3R/src/croco/models/masking.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae18f927ae82e4075c2246ce722007c69a4da344
--- /dev/null
+++ b/extern/CUT3R/src/croco/models/masking.py
@@ -0,0 +1,26 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+
+# --------------------------------------------------------
+# Masking utils
+# --------------------------------------------------------
+
+import torch
+import torch.nn as nn
+
+
+class RandomMask(nn.Module):
+    """
+    random masking: a boolean mask with exactly num_mask True entries in each row
+    """
+
+    def __init__(self, num_patches, mask_ratio):
+        super().__init__()
+        self.num_patches = num_patches
+        self.num_mask = int(mask_ratio * self.num_patches)  # int() truncates the product
+
+    def __call__(self, x):  # NOTE: overrides __call__ directly; only x.size(0) and x.device are read
+        noise = torch.rand(x.size(0), self.num_patches, device=x.device)
+        argsort = torch.argsort(noise, dim=1)  # each row is a permutation of 0..num_patches-1
+        return argsort < self.num_mask
diff --git a/extern/CUT3R/src/croco/models/pos_embed.py b/extern/CUT3R/src/croco/models/pos_embed.py
new file mode 100644
index 0000000000000000000000000000000000000000..97bf1323bc9113dd22cd7e32ae4f47e8899460b4
--- /dev/null
+++ b/extern/CUT3R/src/croco/models/pos_embed.py
@@ -0,0 +1,181 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+
+# --------------------------------------------------------
+# Position embedding utils
+# --------------------------------------------------------
+
+
+import numpy as np
+
+import torch
+
+
+# --------------------------------------------------------
+# 2D sine-cosine position embedding
+# References:
+# MAE: https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
+# Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py
+# MoCo v3: https://github.com/facebookresearch/moco-v3
+# --------------------------------------------------------
+def get_2d_sincos_pos_embed(embed_dim, grid_size, n_cls_token=0):
+    """
+    grid_size: int of the grid height and width; n_cls_token: number of zero rows prepended
+    return:
+    pos_embed: [grid_size*grid_size, embed_dim] or [n_cls_token+grid_size*grid_size, embed_dim] (w/o or w/ cls tokens)
+    """
+    grid_h = np.arange(grid_size, dtype=np.float32)
+    grid_w = np.arange(grid_size, dtype=np.float32)
+    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+    grid = np.stack(grid, axis=0)
+
+    grid = grid.reshape([2, 1, grid_size, grid_size])
+    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
+    if n_cls_token > 0:  # extra tokens get an all-zero embedding, placed before the grid rows
+        pos_embed = np.concatenate(
+            [np.zeros([n_cls_token, embed_dim]), pos_embed], axis=0
+        )
+    return pos_embed
+
+
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+    assert embed_dim % 2 == 0
+
+    # half of the dimensions encode grid[0], the other half grid[1] (each half must itself be even)
+    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
+    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)
+
+    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
+    return emb
+
+
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """
+    embed_dim: output dimension for each position (must be even)
+    pos: array of positions to be encoded; it is flattened to size (M,)
+    out: (M, D), the sine half followed by the cosine half
+    """
+    assert embed_dim % 2 == 0
+    omega = np.arange(embed_dim // 2, dtype=float)
+    omega /= embed_dim / 2.0  # exponents in [0, 1)
+    omega = 1.0 / 10000**omega  # (D/2,)
+
+    pos = pos.reshape(-1)  # (M,)
+    out = np.einsum("m,d->md", pos, omega)  # (M, D/2), outer product
+
+    emb_sin = np.sin(out)  # (M, D/2)
+    emb_cos = np.cos(out)  # (M, D/2)
+
+    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
+    return emb
+
+
+# --------------------------------------------------------
+# Interpolate position embeddings for high-resolution
+# References:
+# MAE: https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
+# DeiT: https://github.com/facebookresearch/deit
+# --------------------------------------------------------
+def interpolate_pos_embed(model, checkpoint_model):  # may replace checkpoint_model["pos_embed"]; returns None
+    if "pos_embed" in checkpoint_model:
+        pos_embed_checkpoint = checkpoint_model["pos_embed"]
+        embedding_size = pos_embed_checkpoint.shape[-1]
+        num_patches = model.patch_embed.num_patches
+        num_extra_tokens = model.pos_embed.shape[-2] - num_patches  # rows that are not patch positions
+        # height (== width) for the checkpoint position embedding
+        orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
+        # height (== width) for the new position embedding
+        new_size = int(num_patches**0.5)
+        # class_token and dist_token are kept unchanged
+        if orig_size != new_size:
+            print(
+                "Position interpolate from %dx%d to %dx%d"
+                % (orig_size, orig_size, new_size, new_size)
+            )
+            extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
+            # only the position tokens are interpolated
+            pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
+            pos_tokens = pos_tokens.reshape(  # to (N, C, orig_size, orig_size) for interpolate
+                -1, orig_size, orig_size, embedding_size
+            ).permute(0, 3, 1, 2)
+            pos_tokens = torch.nn.functional.interpolate(
+                pos_tokens,
+                size=(new_size, new_size),
+                mode="bicubic",
+                align_corners=False,
+            )
+            pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)  # back to (N, new_size**2, C)
+            new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
+            checkpoint_model["pos_embed"] = new_pos_embed
+
+
+# ----------------------------------------------------------
+# RoPE2D: RoPE implementation in 2D
+# ----------------------------------------------------------
+
+# Use the CUDA-compiled RoPE2D when available, else the PyTorch fallback defined below.
+try:
+    from models.curope import cuRoPE2D
+    RoPE2D = cuRoPE2D
+    print("Using CUDA-compiled version of RoPE2D")
+except ImportError:
+    print(
+        "Warning, cannot find cuda-compiled version of RoPE2D, using a slow pytorch version instead"
+    )
+
+    class RoPE2D(torch.nn.Module):
+        """Pure-PyTorch 2D rotary position embedding, used when cuRoPE2D cannot be imported."""
+
+        def __init__(self, freq=100.0, F0=1.0):
+            super().__init__()
+            self.base = freq
+            self.F0 = F0
+            self.cache = {}  # (D, seq_len, device, dtype) -> (cos, sin)
+
+        def get_cos_sin(self, D, seq_len, device, dtype):
+            """Return (cos, sin) lookup tables with `seq_len` rows, memoized in `self.cache`."""
+            key = (D, seq_len, device, dtype)
+            if key not in self.cache:
+                exponent = torch.arange(0, D, 2).float().to(device) / D
+                inv_freq = 1.0 / (self.base**exponent)
+                t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype)
+                freqs = torch.einsum("i,j->ij", t, inv_freq).to(dtype)
+                freqs = torch.cat((freqs, freqs), dim=-1)
+                self.cache[key] = (freqs.cos(), freqs.sin())  # each (Seq, Dim)
+            return self.cache[key]
+
+        @staticmethod
+        def rotate_half(x):
+            """Map the two halves (x1, x2) of the last dimension to (-x2, x1)."""
+            x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
+            return torch.cat((-x2, x1), dim=-1)
+
+        def apply_rope1d(self, tokens, pos1d, cos, sin):
+            """Rotate `tokens` using the table rows selected by the 2-D index tensor `pos1d`."""
+            assert pos1d.ndim == 2
+            # look up one table row per token, then insert an axis that broadcasts over heads
+            cos = torch.nn.functional.embedding(pos1d, cos)[:, None, :, :]
+            sin = torch.nn.functional.embedding(pos1d, sin)[:, None, :, :]
+            return (tokens * cos) + (self.rotate_half(tokens) * sin)
+
+        def forward(self, tokens, positions):
+            """
+            input:
+                * tokens: batch_size x nheads x ntokens x dim
+                * positions: batch_size x ntokens x 2 (y and x position of each token)
+            output:
+                * tokens after applying RoPE2D (batch_size x nheads x ntokens x dim)
+            """
+            assert (
+                tokens.size(3) % 2 == 0
+            ), "number of dimensions should be a multiple of two"
+            D = tokens.size(3) // 2
+            assert positions.ndim == 3 and positions.shape[-1] == 2  # Batch, Seq, 2
+            n_pos = int(positions.max()) + 1  # the tables must cover the largest index looked up
+            cos, sin = self.get_cos_sin(D, n_pos, tokens.device, tokens.dtype)
+            # split features into two along the feature dimension, and apply rope1d on each half
+            y, x = tokens.chunk(2, dim=-1)
+            y = self.apply_rope1d(y, positions[:, :, 0], cos, sin)
+            x = self.apply_rope1d(x, positions[:, :, 1], cos, sin)
+            return torch.cat((y, x), dim=-1)
diff --git a/extern/CUT3R/src/croco/pretrain.py b/extern/CUT3R/src/croco/pretrain.py
new file mode 100644
index 0000000000000000000000000000000000000000..fef4ff2a0b7cb865a68741ac0e76d43d50ee4659
--- /dev/null
+++ b/extern/CUT3R/src/croco/pretrain.py
@@ -0,0 +1,391 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# Pre-training CroCo
+# --------------------------------------------------------
+# References:
+# MAE: https://github.com/facebookresearch/mae
+# DeiT: https://github.com/facebookresearch/deit
+# BEiT: https://github.com/microsoft/unilm/tree/master/beit
+# --------------------------------------------------------
+import argparse
+import datetime
+import json
+import numpy as np
+import os
+import sys
+import time
+import math
+from pathlib import Path
+from typing import Iterable
+
+import torch
+import torch.distributed as dist
+import torch.backends.cudnn as cudnn
+from torch.utils.tensorboard import SummaryWriter
+import torchvision.transforms as transforms
+import torchvision.datasets as datasets
+
+import utils.misc as misc
+from utils.misc import NativeScalerWithGradNormCount as NativeScaler
+from models.croco import CroCoNet
+from models.criterion import MaskedMSE
+from datasets.pairs_dataset import PairsDataset
+
+
+def get_args_parser():
+    parser = argparse.ArgumentParser("CroCo pre-training", add_help=False)
+    # model and criterion
+    parser.add_argument(
+        "--model",
+        default="CroCoNet()",
+        type=str,
+        help="string containing the model to build",
+    )
+    parser.add_argument(
+        "--norm_pix_loss",
+        default=1,
+        choices=[0, 1],
+        help="apply per-patch mean/std normalization before applying the loss",
+    )
+    # dataset
+    parser.add_argument(
+        "--dataset", default="habitat_release", type=str, help="training set"
+    )
+    parser.add_argument(
+        "--transforms", default="crop224+acolor", type=str, help="transforms to apply"
+    )  # in the paper, we also use some homography and rotation, but find later that they were not useful or even harmful
+    # training
+    parser.add_argument("--seed", default=0, type=int, help="Random seed")
+    parser.add_argument(
+        "--batch_size",
+        default=64,
+        type=int,
+        help="Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus",
+    )
+    parser.add_argument(
+        "--epochs",
+        default=800,
+        type=int,
+        help="Maximum number of epochs for the scheduler",
+    )
+    parser.add_argument(
+        "--max_epoch", default=400, type=int, help="Stop training at this epoch"
+    )
+    parser.add_argument(
+        "--accum_iter",
+        default=1,
+        type=int,
+        help="Accumulate gradient iterations (for increasing the effective batch size under memory constraints)",
+    )
+    parser.add_argument(
+        "--weight_decay", type=float, default=0.05, help="weight decay (default: 0.05)"
+    )
+    parser.add_argument(
+        "--lr",
+        type=float,
+        default=None,
+        metavar="LR",
+        help="learning rate (absolute lr)",
+    )
+    parser.add_argument(
+        "--blr",
+        type=float,
+        default=1.5e-4,
+        metavar="LR",
+        help="base learning rate: absolute_lr = base_lr * total_batch_size / 256",
+    )
+    parser.add_argument(
+        "--min_lr",
+        type=float,
+        default=0.0,
+        metavar="LR",
+        help="lower lr bound for cyclic schedulers that hit 0",
+    )
+    parser.add_argument(
+        "--warmup_epochs", type=int, default=40, metavar="N", help="epochs to warmup LR"
+    )
+    parser.add_argument(
+        "--amp",
+        type=int,
+        default=1,
+        choices=[0, 1],
+        help="Use Automatic Mixed Precision for pretraining",
+    )
+    # others
+    parser.add_argument("--num_workers", default=8, type=int)
+    parser.add_argument(
+        "--world_size", default=1, type=int, help="number of distributed processes"
+    )
+    parser.add_argument("--local_rank", default=-1, type=int)
+    parser.add_argument(
+        "--dist_url", default="env://", help="url used to set up distributed training"
+    )
+    parser.add_argument(
+        "--save_freq",
+        default=1,
+        type=int,
+        help="frequence (number of epochs) to save checkpoint in checkpoint-last.pth",
+    )
+    parser.add_argument(
+        "--keep_freq",
+        default=20,
+        type=int,
+        help="frequence (number of epochs) to save checkpoint in checkpoint-%d.pth",
+    )
+    parser.add_argument(
+        "--print_freq",
+        default=20,
+        type=int,
+        help="frequence (number of iterations) to print infos while training",
+    )
+    # paths
+    parser.add_argument(
+        "--output_dir",
+        default="./output/",
+        type=str,
+        help="path where to save the output",
+    )
+    parser.add_argument(
+        "--data_dir", default="./data/", type=str, help="path where data are stored"
+    )
+    return parser
+
+
+def main(args):
+    misc.init_distributed_mode(args)
+    global_rank = misc.get_rank()
+    world_size = misc.get_world_size()
+
+    print("output_dir: " + args.output_dir)
+    if args.output_dir:
+        Path(args.output_dir).mkdir(parents=True, exist_ok=True)
+
+    # auto resume
+    last_ckpt_fname = os.path.join(args.output_dir, f"checkpoint-last.pth")
+    args.resume = last_ckpt_fname if os.path.isfile(last_ckpt_fname) else None
+
+    print("job dir: {}".format(os.path.dirname(os.path.realpath(__file__))))
+    print("{}".format(args).replace(", ", ",\n"))
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    device = torch.device(device)
+
+    # fix the seed
+    seed = args.seed + misc.get_rank()
+    torch.manual_seed(seed)
+    np.random.seed(seed)
+
+    cudnn.benchmark = True
+
+    ## training dataset and loader
+    print(
+        "Building dataset for {:s} with transforms {:s}".format(
+            args.dataset, args.transforms
+        )
+    )
+    dataset = PairsDataset(args.dataset, trfs=args.transforms, data_dir=args.data_dir)
+    if world_size > 1:
+        sampler_train = torch.utils.data.DistributedSampler(
+            dataset, num_replicas=world_size, rank=global_rank, shuffle=True
+        )
+        print("Sampler_train = %s" % str(sampler_train))
+    else:
+        sampler_train = torch.utils.data.RandomSampler(dataset)
+    data_loader_train = torch.utils.data.DataLoader(
+        dataset,
+        sampler=sampler_train,
+        batch_size=args.batch_size,
+        num_workers=args.num_workers,
+        pin_memory=True,
+        drop_last=True,
+    )
+
+    ## model
+    print("Loading model: {:s}".format(args.model))
+    model = eval(args.model)
+    print(
+        "Loading criterion: MaskedMSE(norm_pix_loss={:s})".format(
+            str(bool(args.norm_pix_loss))
+        )
+    )
+    criterion = MaskedMSE(norm_pix_loss=bool(args.norm_pix_loss))
+
+    model.to(device)
+    model_without_ddp = model
+    print("Model = %s" % str(model_without_ddp))
+
+    eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size()
+    if args.lr is None:  # only base_lr is specified
+        args.lr = args.blr * eff_batch_size / 256
+    print("base lr: %.2e" % (args.lr * 256 / eff_batch_size))
+    print("actual lr: %.2e" % args.lr)
+    print("accumulate grad iterations: %d" % args.accum_iter)
+    print("effective batch size: %d" % eff_batch_size)
+
+    if args.distributed:
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.gpu], find_unused_parameters=True, static_graph=True
+        )
+        model_without_ddp = model.module
+
+    param_groups = misc.get_parameter_groups(
+        model_without_ddp, args.weight_decay
+    )  # following timm: set wd as 0 for bias and norm layers
+    optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95))
+    print(optimizer)
+    loss_scaler = NativeScaler()
+
+    misc.load_model(
+        args=args,
+        model_without_ddp=model_without_ddp,
+        optimizer=optimizer,
+        loss_scaler=loss_scaler,
+    )
+
+    if global_rank == 0 and args.output_dir is not None:
+        log_writer = SummaryWriter(log_dir=args.output_dir)
+    else:
+        log_writer = None
+
+    print(f"Start training until {args.max_epoch} epochs")
+    start_time = time.time()
+    for epoch in range(args.start_epoch, args.max_epoch):
+        if world_size > 1:
+            data_loader_train.sampler.set_epoch(epoch)
+
+        train_stats = train_one_epoch(
+            model,
+            criterion,
+            data_loader_train,
+            optimizer,
+            device,
+            epoch,
+            loss_scaler,
+            log_writer=log_writer,
+            args=args,
+        )
+
+        if args.output_dir and epoch % args.save_freq == 0:
+            misc.save_model(
+                args=args,
+                model_without_ddp=model_without_ddp,
+                optimizer=optimizer,
+                loss_scaler=loss_scaler,
+                epoch=epoch,
+                fname="last",
+            )
+
+        if (
+            args.output_dir
+            and (epoch % args.keep_freq == 0 or epoch + 1 == args.max_epoch)
+            and (epoch > 0 or args.max_epoch == 1)
+        ):
+            misc.save_model(
+                args=args,
+                model_without_ddp=model_without_ddp,
+                optimizer=optimizer,
+                loss_scaler=loss_scaler,
+                epoch=epoch,
+            )
+
+        log_stats = {
+            **{f"train_{k}": v for k, v in train_stats.items()},
+            "epoch": epoch,
+        }
+
+        if args.output_dir and misc.is_main_process():
+            if log_writer is not None:
+                log_writer.flush()
+            with open(
+                os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8"
+            ) as f:
+                f.write(json.dumps(log_stats) + "\n")
+
+    total_time = time.time() - start_time
+    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+    print("Training time {}".format(total_time_str))
+
+
+def train_one_epoch(
+    model: torch.nn.Module,
+    criterion: torch.nn.Module,
+    data_loader: Iterable,
+    optimizer: torch.optim.Optimizer,
+    device: torch.device,
+    epoch: int,
+    loss_scaler,
+    log_writer=None,
+    args=None,
+):
+    model.train(True)
+    metric_logger = misc.MetricLogger(delimiter="  ")
+    metric_logger.add_meter("lr", misc.SmoothedValue(window_size=1, fmt="{value:.6f}"))
+    header = "Epoch: [{}]".format(epoch)
+    accum_iter = args.accum_iter
+
+    optimizer.zero_grad()
+
+    if log_writer is not None:
+        print("log_dir: {}".format(log_writer.log_dir))
+
+    for data_iter_step, (image1, image2) in enumerate(
+        metric_logger.log_every(data_loader, args.print_freq, header)
+    ):
+
+        # we use a per iteration  lr scheduler
+        if data_iter_step % accum_iter == 0:
+            misc.adjust_learning_rate(
+                optimizer, data_iter_step / len(data_loader) + epoch, args
+            )
+
+        image1 = image1.to(device, non_blocking=True)
+        image2 = image2.to(device, non_blocking=True)
+        with torch.cuda.amp.autocast(enabled=bool(args.amp)):
+            out, mask, target = model(image1, image2)
+            loss = criterion(out, mask, target)
+
+        loss_value = loss.item()
+
+        if not math.isfinite(loss_value):
+            print("Loss is {}, stopping training".format(loss_value))
+            sys.exit(1)
+
+        loss /= accum_iter
+        loss_scaler(
+            loss,
+            optimizer,
+            parameters=model.parameters(),
+            update_grad=(data_iter_step + 1) % accum_iter == 0,
+        )
+        if (data_iter_step + 1) % accum_iter == 0:
+            optimizer.zero_grad()
+
+        torch.cuda.synchronize()
+
+        metric_logger.update(loss=loss_value)
+
+        lr = optimizer.param_groups[0]["lr"]
+        metric_logger.update(lr=lr)
+
+        loss_value_reduce = misc.all_reduce_mean(loss_value)
+        if (
+            log_writer is not None
+            and ((data_iter_step + 1) % (accum_iter * args.print_freq)) == 0
+        ):
+            # x-axis is based on epoch_1000x in the tensorboard, calibrating differences curves when batch size changes
+            epoch_1000x = int((data_iter_step / len(data_loader) + epoch) * 1000)
+            log_writer.add_scalar("train_loss", loss_value_reduce, epoch_1000x)
+            log_writer.add_scalar("lr", lr, epoch_1000x)
+
+    # gather the stats from all processes
+    metric_logger.synchronize_between_processes()
+    print("Averaged stats:", metric_logger)
+    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
+
+
+if __name__ == "__main__":
+    args = get_args_parser()
+    args = args.parse_args()
+    main(args)
diff --git a/extern/CUT3R/src/croco/stereoflow/README.MD b/extern/CUT3R/src/croco/stereoflow/README.MD
new file mode 100644
index 0000000000000000000000000000000000000000..81595380fadd274b523e0cf77921b1b65cbedb34
--- /dev/null
+++ b/extern/CUT3R/src/croco/stereoflow/README.MD
@@ -0,0 +1,318 @@
+## CroCo-Stereo and CroCo-Flow
+
+This README explains how to use CroCo-Stereo and CroCo-Flow as well as how they were trained.
+All commands should be launched from the root directory.
+
+### Simple inference example
+
+We provide a simple inference example for CroCo-Stereo and CroCo-Flow in the notebook `croco-stereo-flow-demo.ipynb`.
+Before running it, please download the trained models with:
+```
+bash stereoflow/download_model.sh crocostereo.pth
+bash stereoflow/download_model.sh crocoflow.pth
+```
+
+### Prepare data for training or evaluation
+
+Put the datasets used for training/evaluation in `./data/stereoflow` (or update the paths at the top of `stereoflow/datasets_stereo.py` and `stereoflow/datasets_flow.py`).
+Please find below how the file structure should look for each dataset:
+<details>
+<summary>FlyingChairs</summary>
+
+```
+./data/stereoflow/FlyingChairs/
+└───chairs_split.txt
+└───data/
+    └─── ...
+```
+</details>
+
+<details>
+<summary>MPI-Sintel</summary>
+
+```
+./data/stereoflow/MPI-Sintel/
+└───training/
+│   └───clean/
+│   └───final/
+│   └───flow/
+└───test/
+    └───clean/
+    └───final/
+```
+</details>
+
+<details>
+<summary>SceneFlow (including FlyingThings)</summary>
+
+```
+./data/stereoflow/SceneFlow/
+└───Driving/
+│   └───disparity/
+│   └───frames_cleanpass/
+│   └───frames_finalpass/
+└───FlyingThings/
+│   └───disparity/
+│   └───frames_cleanpass/
+│   └───frames_finalpass/
+│   └───optical_flow/
+└───Monkaa/
+    └───disparity/
+    └───frames_cleanpass/
+    └───frames_finalpass/
+```
+</details>
+
+<details>
+<summary>TartanAir</summary>
+
+```
+./data/stereoflow/TartanAir/
+└───abandonedfactory/
+│   └───.../
+└───abandonedfactory_night/
+│   └───.../
+└───.../
+```
+</details>
+
+<details>
+<summary>Booster</summary>
+
+```
+./data/stereoflow/booster_gt/
+└───train/
+    └───balanced/
+        └───Bathroom/
+        └───Bedroom/
+        └───...
+```
+</details>
+
+<details>
+<summary>CREStereo</summary>
+
+```
+./data/stereoflow/crenet_stereo_trainset/
+└───stereo_trainset/
+    └───crestereo/
+        └───hole/
+        └───reflective/
+        └───shapenet/
+        └───tree/
+```
+</details>
+
+<details>
+<summary>ETH3D Two-view Low-res</summary>
+
+```
+./data/stereoflow/eth3d_lowres/
+└───test/
+│   └───lakeside_1l/
+│   └───...
+└───train/
+│   └───delivery_area_1l/
+│   └───...
+└───train_gt/
+    └───delivery_area_1l/
+    └───...
+```
+</details>
+
+<details>
+<summary>KITTI 2012</summary>
+
+```
+./data/stereoflow/kitti-stereo-2012/
+└───testing/
+│   └───colored_0/
+│   └───colored_1/
+└───training/
+    └───colored_0/
+    └───colored_1/
+    └───disp_occ/
+    └───flow_occ/
+```
+</details>
+
+<details>
+<summary>KITTI 2015</summary>
+
+```
+./data/stereoflow/kitti-stereo-2015/
+└───testing/
+│   └───image_2/
+│   └───image_3/
+└───training/
+    └───image_2/
+    └───image_3/
+    └───disp_occ_0/
+    └───flow_occ/
+```
+</details>
+
+<details>
+<summary>Middlebury</summary>
+
+```
+./data/stereoflow/middlebury
+└───2005/
+│   └───train/
+│       └───Art/
+│       └───...
+└───2006/
+│   └───Aloe/
+│   └───Baby1/
+│   └───...
+└───2014/
+│   └───Adirondack-imperfect/
+│   └───Adirondack-perfect/
+│   └───...
+└───2021/
+│   └───data/
+│       └───artroom1/
+│       └───artroom2/
+│       └───...
+└───MiddEval3_F/
+    └───test/
+    │   └───Australia/
+    │   └───...
+    └───train/
+        └───Adirondack/
+        └───...
+```
+</details>
+
+<details>
+<summary>Spring</summary>
+
+```
+./data/stereoflow/spring/
+└───test/
+│   └───0003/
+│   └───...
+└───train/
+    └───0001/
+    └───...
+```
+</details>
+
+
+### CroCo-Stereo
+
+##### Main model 
+
+The main training of CroCo-Stereo was performed on a series of datasets, and the resulting model was used as is for the Middlebury v3 benchmark.
+
+```
+# Download the model
+bash stereoflow/download_model.sh crocostereo.pth
+# Middlebury v3 submission
+python stereoflow/test.py --model stereoflow_models/crocostereo.pth --dataset "MdEval3('all_full')" --save submission --tile_overlap 0.9
+# Training command that was used, using checkpoint-last.pth
+python -u stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('train')+50*Md05('train')+50*Md06('train')+50*Md14('train')+50*Md21('train')+50*MdEval3('train_full')+Booster('train_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 6 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main/
+# or it can be launched on multiple gpus (while maintaining the effective batch size), e.g. on 3 gpus:
+torchrun --nproc_per_node 3 stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('train')+50*Md05('train')+50*Md06('train')+50*Md14('train')+50*Md21('train')+50*MdEval3('train_full')+Booster('train_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 2 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main/
+```
+
+For evaluation on the validation sets, we also provide the model trained on the `subtrain` subset of the training sets.
+
+```
+# Download the model
+bash stereoflow/download_model.sh crocostereo_subtrain.pth
+# Evaluation on validation sets 
+python stereoflow/test.py --model stereoflow_models/crocostereo_subtrain.pth --dataset "MdEval3('subval_full')+ETH3DLowRes('subval')+SceneFlow('test_finalpass')+SceneFlow('test_cleanpass')" --save metrics --tile_overlap 0.9
+# Training command that was used (same as above but on subtrain, using checkpoint-best.pth), can also be launched on multiple gpus
+python -u stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('subtrain')+50*Md05('subtrain')+50*Md06('subtrain')+50*Md14('subtrain')+50*Md21('subtrain')+50*MdEval3('subtrain_full')+Booster('subtrain_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 6 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main_subtrain/
+```
+
+##### Other models 
+
+<details>
+	<summary>Model for ETH3D</summary> 
+	The model used for the submission on ETH3D is trained with the same command but using an unbounded Laplacian loss.
+	
+	# Download the model
+	bash stereoflow/download_model.sh crocostereo_eth3d.pth
+	# ETH3D submission
+	python stereoflow/test.py --model stereoflow_models/crocostereo_eth3d.pth --dataset "ETH3DLowRes('all')" --save submission --tile_overlap 0.9
+	# Training command that was used
+	python -u stereoflow/train.py stereo --criterion "LaplacianLoss()" --tile_conf_mode conf_expbeta3 --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('train')+50*Md05('train')+50*Md06('train')+50*Md14('train')+50*Md21('train')+50*MdEval3('train_full')+Booster('train_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 6 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main_eth3d/
+	
+</details>
+
+<details>
+	<summary>Main model finetuned on Kitti</summary>
+	
+	# Download the model
+	bash stereoflow/download_model.sh crocostereo_finetune_kitti.pth
+	# Kitti submission 
+	python stereoflow/test.py --model stereoflow_models/crocostereo_finetune_kitti.pth --dataset "Kitti15('test')" --save submission --tile_overlap 0.9
+	# Training that was used
+	python -u stereoflow/train.py stereo --crop 352 1216 --criterion "LaplacianLossBounded2()" --dataset "Kitti12('train')+Kitti15('train')" --lr 3e-5 --batch_size 1 --accum_iter 6 --epochs 20 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocostereo.pth --output_dir xps/crocostereo/finetune_kitti/ --save_every 5
+</details>
+
+<details>
+	<summary>Main model finetuned on Spring</summary>
+	
+	# Download the model
+	bash stereoflow/download_model.sh crocostereo_finetune_spring.pth
+	# Spring submission 
+	python stereoflow/test.py --model stereoflow_models/crocostereo_finetune_spring.pth --dataset "Spring('test')" --save submission --tile_overlap 0.9
+	# Training command that was used
+	python -u stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "Spring('train')" --lr 3e-5 --batch_size 6 --epochs 8 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocostereo.pth --output_dir xps/crocostereo/finetune_spring/
+</details>
+
+<details>
+	<summary>Smaller models</summary>
+	To train CroCo-Stereo with smaller CroCo pretrained models, simply replace the <code>--pretrained</code> argument. To download the smaller CroCo-Stereo models based on CroCo v2 pretraining with a ViT-Base encoder and a Small decoder, use <code>bash stereoflow/download_model.sh crocostereo_subtrain_vitb_smalldecoder.pth</code>, and for the model with a ViT-Base encoder and a Base decoder, use <code>bash stereoflow/download_model.sh crocostereo_subtrain_vitb_basedecoder.pth</code>.
+</details>
+	
+
+### CroCo-Flow
+
+##### Main model
+
+The main training of CroCo-Flow was performed on the FlyingThings, FlyingChairs, MPI-Sintel and TartanAir datasets.
+It was used for our submission to the MPI-Sintel benchmark.
+
+```
+# Download the model 
+bash stereoflow/download_model.sh crocoflow.pth
+# Evaluation 
+python stereoflow/test.py --model stereoflow_models/crocoflow.pth --dataset "MPISintel('subval_cleanpass')+MPISintel('subval_finalpass')" --save metrics --tile_overlap 0.9
+# Sintel submission
+python stereoflow/test.py --model stereoflow_models/crocoflow.pth --dataset "MPISintel('test_allpass')" --save submission --tile_overlap 0.9
+# Training command that was used, with checkpoint-best.pth
+python -u stereoflow/train.py flow --criterion "LaplacianLossBounded()" --dataset "40*MPISintel('subtrain_cleanpass')+40*MPISintel('subtrain_finalpass')+4*FlyingThings('train_allpass')+4*FlyingChairs('train')+TartanAir('train')" --val_dataset "MPISintel('subval_cleanpass')+MPISintel('subval_finalpass')" --lr 2e-5 --batch_size 8 --epochs 240 --img_per_epoch 30000 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocoflow/main/
+```
+
+##### Other models 
+
+<details>
+	<summary>Main model finetuned on Kitti</summary>
+	
+	# Download the model
+	bash stereoflow/download_model.sh crocoflow_finetune_kitti.pth
+	# Kitti submission 
+	python stereoflow/test.py --model stereoflow_models/crocoflow_finetune_kitti.pth --dataset "Kitti15('test')" --save submission --tile_overlap 0.99
+	# Training that was used, with checkpoint-last.pth
+	python -u stereoflow/train.py flow --crop 352 1216 --criterion "LaplacianLossBounded()" --dataset "Kitti15('train')+Kitti12('train')" --lr 2e-5 --batch_size 1 --accum_iter 8 --epochs 150 --save_every 5 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocoflow.pth --output_dir xps/crocoflow/finetune_kitti/
+</details>
+
+<details>
+	<summary>Main model finetuned on Spring</summary>
+	
+	# Download the model
+	bash stereoflow/download_model.sh crocoflow_finetune_spring.pth
+	# Spring submission 
+	python stereoflow/test.py --model stereoflow_models/crocoflow_finetune_spring.pth --dataset "Spring('test')" --save submission --tile_overlap 0.9
+	# Training command that was used, with checkpoint-last.pth
+	python -u stereoflow/train.py flow --criterion "LaplacianLossBounded()" --dataset "Spring('train')" --lr 2e-5 --batch_size 8 --epochs 12 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocoflow.pth --output_dir xps/crocoflow/finetune_spring/
+</details>
+
+<details>
+	<summary>Smaller models</summary>
+	To train CroCo-Flow with smaller CroCo pretrained models, simply replace the <code>--pretrained</code> argument. To download the smaller CroCo-Flow models based on CroCo v2 pretraining with a ViT-Base encoder and a Small decoder, use <code>bash stereoflow/download_model.sh crocoflow_vitb_smalldecoder.pth</code>, and for the model with a ViT-Base encoder and a Base decoder, use <code>bash stereoflow/download_model.sh crocoflow_vitb_basedecoder.pth</code>.
+</details>
diff --git a/extern/CUT3R/src/croco/stereoflow/augmentor.py b/extern/CUT3R/src/croco/stereoflow/augmentor.py
new file mode 100644
index 0000000000000000000000000000000000000000..aac818df45d927ac383a41978ff92dc5f2899890
--- /dev/null
+++ b/extern/CUT3R/src/croco/stereoflow/augmentor.py
@@ -0,0 +1,396 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+# --------------------------------------------------------
+# Data augmentation for training stereo and flow
+# --------------------------------------------------------
+
+# References
+# https://github.com/autonomousvision/unimatch/blob/master/dataloader/stereo/transforms.py
+# https://github.com/autonomousvision/unimatch/blob/master/dataloader/flow/transforms.py
+
+
+import numpy as np
+import random
+from PIL import Image
+
+import cv2
+
+cv2.setNumThreads(0)
+cv2.ocl.setUseOpenCL(False)
+
+import torch
+from torchvision.transforms import ColorJitter
+import torchvision.transforms.functional as FF
+
+
+class StereoAugmentor(object):
+
+    def __init__(
+        self,
+        crop_size,  # (height, width) of the crop produced by _random_crop
+        scale_prob=0.5,  # probability of drawing a random rescale in _random_scale
+        scale_xonly=True,  # if True, the random rescale only changes the x axis
+        lhth=800.0,  # images with min(h, w) below this use the "l" scale range, others the "h" range
+        lminscale=0.0,  # log2 bounds of the random x scale (scale = 2 ** uniform(min, max))
+        lmaxscale=1.0,
+        hminscale=-0.2,
+        hmaxscale=0.4,
+        scale_interp_nearest=True,  # resize the disparity with INTER_NEAREST instead of INTER_LINEAR
+        rightjitterprob=0.5,  # probability of the small rotation + shift applied to img2
+        v_flip_prob=0.5,  # probability of flipping img1, img2 and disp upside down
+        color_aug_asym=True,  # allow img2 to receive color parameters redrawn independently of img1
+        color_choice_prob=0.5,  # probability of applying one random color transform instead of all five
+    ):
+        self.crop_size = crop_size
+        self.scale_prob = scale_prob
+        self.scale_xonly = scale_xonly
+        self.lhth = lhth
+        self.lminscale = lminscale
+        self.lmaxscale = lmaxscale
+        self.hminscale = hminscale
+        self.hmaxscale = hmaxscale
+        self.scale_interp_nearest = scale_interp_nearest
+        self.rightjitterprob = rightjitterprob
+        self.v_flip_prob = v_flip_prob
+        self.color_aug_asym = color_aug_asym
+        self.color_choice_prob = color_choice_prob
+
+    def _random_scale(self, img1, img2, disp):  # rescale the pair and its disparity map together
+        ch, cw = self.crop_size  # target crop (height, width)
+        h, w = img1.shape[:2]
+        if self.scale_prob > 0.0 and np.random.rand() < self.scale_prob:
+            min_scale, max_scale = (
+                (self.lminscale, self.lmaxscale)
+                if min(h, w) < self.lhth
+                else (self.hminscale, self.hmaxscale)
+            )
+            scale_x = 2.0 ** np.random.uniform(min_scale, max_scale)  # bounds are log2 of the factor
+            scale_x = np.clip(scale_x, (cw + 8) / float(w), None)  # resized width stays >= cw + 8
+            scale_y = 1.0
+            if not self.scale_xonly:
+                scale_y = scale_x
+                scale_y = np.clip(scale_y, (ch + 8) / float(h), None)  # resized height stays >= ch + 8
+            img1 = cv2.resize(
+                img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR
+            )
+            img2 = cv2.resize(
+                img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR
+            )
+            disp = (
+                cv2.resize(
+                    disp,
+                    None,
+                    fx=scale_x,
+                    fy=scale_y,
+                    interpolation=(
+                        cv2.INTER_LINEAR
+                        if not self.scale_interp_nearest
+                        else cv2.INTER_NEAREST
+                    ),
+                )
+                * scale_x  # disparity values are multiplied by the x scale factor
+            )
+        else:  # check if we need to resize to be able to crop
+            h, w = img1.shape[:2]
+            clip_scale = (cw + 8) / float(w)
+            if clip_scale > 1.0:  # NOTE(review): only the width is checked here, the height is not
+                scale_x = clip_scale
+                scale_y = scale_x if not self.scale_xonly else 1.0
+                img1 = cv2.resize(
+                    img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR
+                )
+                img2 = cv2.resize(
+                    img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR
+                )
+                disp = (
+                    cv2.resize(
+                        disp,
+                        None,
+                        fx=scale_x,
+                        fy=scale_y,
+                        interpolation=(
+                            cv2.INTER_LINEAR
+                            if not self.scale_interp_nearest
+                            else cv2.INTER_NEAREST
+                        ),
+                    )
+                    * scale_x  # same rule as above: values follow the x scale factor
+                )
+        return img1, img2, disp
+
+    def _random_crop(self, img1, img2, disp):
+        # Cut the same randomly placed crop_size window out of all three arrays.
+        crop_h, crop_w = self.crop_size
+        h, w = img1.shape[:2]
+        assert crop_h <= h and crop_w <= w, (img1.shape, h, w, crop_h, crop_w)
+        x0 = np.random.randint(w - crop_w + 1)  # the x offset is drawn first, then y
+        y0 = np.random.randint(h - crop_h + 1)
+        window = (slice(y0, y0 + crop_h), slice(x0, x0 + crop_w))
+        img1, img2, disp = img1[window], img2[window], disp[window]
+        return img1, img2, disp
+
+    def _random_vflip(self, img1, img2, disp):
+        # With probability v_flip_prob, flip all three arrays upside down.
+        do_flip = self.v_flip_prob > 0 and np.random.rand() < self.v_flip_prob
+        if not do_flip:
+            return img1, img2, disp
+        # np.copy turns each reversed view into a fresh array
+        return tuple(np.copy(np.flipud(arr)) for arr in (img1, img2, disp))
+
+    def _random_rotate_shift_right(self, img2):  # small random rotation + vertical shift of img2 only
+        if self.rightjitterprob > 0.0 and np.random.rand() < self.rightjitterprob:
+            angle, pixel = 0.1, 2  # bounds of the rotation angle and of the shift in pixels
+            px = np.random.uniform(-pixel, pixel)
+            ag = np.random.uniform(-angle, angle)
+            image_center = (  # random pivot; OpenCV expects (x, y), i.e. (column, row)
+                np.random.uniform(0, img2.shape[1]),
+                np.random.uniform(0, img2.shape[0]),
+            )
+            rot_mat = cv2.getRotationMatrix2D(image_center, ag, 1.0)
+            img2 = cv2.warpAffine(
+                img2, rot_mat, img2.shape[1::-1], flags=cv2.INTER_LINEAR
+            )
+            trans_mat = np.float32([[1, 0, 0], [0, 1, px]])  # translation by px along y
+            img2 = cv2.warpAffine(
+                img2, trans_mat, img2.shape[1::-1], flags=cv2.INTER_LINEAR
+            )
+        return img2
+
+    def _random_color_contrast(self, img1, img2):
+        if np.random.random() >= 0.5:  # half of the time both images are left untouched
+            return img1, img2
+        factor = np.random.uniform(0.8, 1.2)
+        img1 = FF.adjust_contrast(img1, factor)
+        if self.color_aug_asym and np.random.random() < 0.5:
+            factor = np.random.uniform(0.8, 1.2)  # independent draw for the second image
+        return img1, FF.adjust_contrast(img2, factor)
+
+    def _random_color_gamma(self, img1, img2):
+        if np.random.random() >= 0.5:  # half of the time both images are left untouched
+            return img1, img2
+        exponent = np.random.uniform(0.7, 1.5)
+        img1 = FF.adjust_gamma(img1, exponent)
+        if self.color_aug_asym and np.random.random() < 0.5:
+            exponent = np.random.uniform(0.7, 1.5)  # independent draw for the second image
+        return img1, FF.adjust_gamma(img2, exponent)
+
+    def _random_color_brightness(self, img1, img2):
+        if np.random.random() >= 0.5:  # half of the time both images are left untouched
+            return img1, img2
+        factor = np.random.uniform(0.5, 2.0)
+        img1 = FF.adjust_brightness(img1, factor)
+        if self.color_aug_asym and np.random.random() < 0.5:
+            factor = np.random.uniform(0.5, 2.0)  # independent draw for the second image
+        return img1, FF.adjust_brightness(img2, factor)
+
+    def _random_color_hue(self, img1, img2):
+        if np.random.random() >= 0.5:  # half of the time both images are left untouched
+            return img1, img2
+        shift = np.random.uniform(-0.1, 0.1)
+        img1 = FF.adjust_hue(img1, shift)
+        if self.color_aug_asym and np.random.random() < 0.5:
+            shift = np.random.uniform(-0.1, 0.1)  # independent draw for the second image
+        return img1, FF.adjust_hue(img2, shift)
+
+    def _random_color_saturation(self, img1, img2):  # applied to the pair half of the time
+        if np.random.random() < 0.5:
+            saturation = np.random.uniform(0.8, 1.2)
+            img1 = FF.adjust_saturation(img1, saturation)
+            if self.color_aug_asym and np.random.random() < 0.5:
+                # independent redraw for img2, from the same non-negative range as img1
+                saturation = np.random.uniform(0.8, 1.2)
+            img2 = FF.adjust_saturation(img2, saturation)
+        return img1, img2
+
+    def _random_color(self, img1, img2):
+        # Photometric augmentation of the pair: apply either one randomly chosen
+        # color transform or all five in a shuffled order. Arrays are cast to
+        # uint8 PIL images for the transforms and come back as float32 arrays.
+        candidates = [
+            self._random_color_contrast,
+            self._random_color_gamma,
+            self._random_color_brightness,
+            self._random_color_hue,
+            self._random_color_saturation,
+        ]
+        pil1 = Image.fromarray(img1.astype("uint8"))
+        pil2 = Image.fromarray(img2.astype("uint8"))
+        single = np.random.random() < self.color_choice_prob
+        if single:
+            pipeline = [random.choice(candidates)]
+        else:
+            random.shuffle(candidates)  # in place: random order
+            pipeline = candidates
+        for transform in pipeline:
+            pil1, pil2 = transform(pil1, pil2)
+        out1, out2 = np.array(pil1), np.array(pil2)
+        return out1.astype(np.float32), out2.astype(np.float32)
+
+    def __call__(self, img1, img2, disp, dataset_name):
+        # Geometric steps act on the full triplet; dataset_name is accepted but unused.
+        triplet = (img1, img2, disp)
+        for step in (self._random_scale, self._random_crop, self._random_vflip):
+            triplet = step(*triplet)
+        left, right = self._random_color(triplet[0], self._random_rotate_shift_right(triplet[1]))
+        return left, right, triplet[2]
+
+
+class FlowAugmentor:
+
+    def __init__(
+        self,
+        crop_size,  # (height, width) of the crop taken in spatial_transform
+        min_scale=-0.2,  # log2 bounds of the random scale (scale = 2 ** uniform(min, max))
+        max_scale=0.5,
+        spatial_aug_prob=0.8,  # probability of applying the random rescale
+        stretch_prob=0.8,  # probability of an extra, independent x / y stretch
+        max_stretch=0.2,  # log2 bound of that stretch
+        h_flip_prob=0.5,  # probability of a horizontal flip
+        v_flip_prob=0.1,  # probability of a vertical flip
+        asymmetric_color_aug_prob=0.2,  # probability of jittering the two images separately
+    ):
+
+        # spatial augmentation params
+        self.crop_size = crop_size
+        self.min_scale = min_scale
+        self.max_scale = max_scale
+        self.spatial_aug_prob = spatial_aug_prob
+        self.stretch_prob = stretch_prob
+        self.max_stretch = max_stretch
+
+        # flip augmentation params
+        self.h_flip_prob = h_flip_prob
+        self.v_flip_prob = v_flip_prob
+
+        # photometric augmentation params
+        self.photo_aug = ColorJitter(
+            brightness=0.4, contrast=0.4, saturation=0.4, hue=0.5 / 3.14
+        )
+
+        self.asymmetric_color_aug_prob = asymmetric_color_aug_prob
+
+    def color_transform(self, img1, img2):
+        """Color-jitter both images, either one by one or as a single stacked image."""
+
+        def jitter(arr):
+            return np.array(self.photo_aug(Image.fromarray(arr)), dtype=np.uint8)
+
+        independent = np.random.rand() < self.asymmetric_color_aug_prob
+        if independent:
+            # asymmetric: one jitter call per image, img1 first
+            first = jitter(img1)
+            return first, jitter(img2)
+
+        # symmetric: the images are stacked along axis 0, jittered in one call,
+        # and split back into two halves
+        stacked = jitter(np.concatenate([img1, img2], axis=0))
+        top, bottom = np.split(stacked, 2, axis=0)
+        return top, bottom
+
+    def _resize_flow(self, flow, scale_x, scale_y, factor=1.0):
+        # Resize a HxWx2 flow map by (scale_x, scale_y) and scale its vectors alike.
+        # The map size is additionally divided by `factor`; the vectors are not.
+        if np.all(np.isfinite(flow)):
+            flow = cv2.resize(
+                flow,
+                None,
+                fx=scale_x / factor,
+                fy=scale_y / factor,
+                interpolation=cv2.INTER_LINEAR,
+            )
+            flow = flow * [scale_x, scale_y]
+        else:  # sparse version: non-finite entries mark invalid pixels, valid ones are scattered
+            ht, wd = flow.shape[:2]
+            coords = np.meshgrid(np.arange(wd), np.arange(ht))
+            coords = np.stack(coords, axis=-1)
+            coords = coords.reshape(-1, 2).astype(np.float32)
+            flow = flow.reshape(-1, 2).astype(np.float32)
+            valid = np.isfinite(flow[:, 0])
+
+            coords0 = coords[valid]
+            flow0 = flow[valid]
+
+            ht1 = int(round(ht * scale_y / factor))
+            wd1 = int(round(wd * scale_x / factor))
+
+            rescale = np.expand_dims(np.array([scale_x, scale_y]), axis=0)
+            coords1 = coords0 * rescale / factor
+            flow1 = flow0 * rescale
+
+            xx = np.round(coords1[:, 0]).astype(np.int32)
+            yy = np.round(coords1[:, 1]).astype(np.int32)
+            # keep targets inside the output map; row 0 and column 0 are valid targets
+            v = (xx >= 0) & (xx < wd1) & (yy >= 0) & (yy < ht1)
+            xx = xx[v]
+            yy = yy[v]
+            flow1 = flow1[v]
+
+            flow = np.inf * np.ones(
+                [ht1, wd1, 2], dtype=np.float32
+            )  # invalid value everywhere, before we fill it with the correct ones
+            flow[yy, xx] = flow1
+        return flow
+
+    def spatial_transform(self, img1, img2, flow, dname):
+        # Random rescale/stretch, flips and crop, applied consistently to the pair and its flow.
+        if np.random.rand() < self.spatial_aug_prob:
+            # randomly sample scale
+            ht, wd = img1.shape[:2]
+            clip_min_scale = np.maximum(
+                (self.crop_size[0] + 8) / float(ht), (self.crop_size[1] + 8) / float(wd)
+            )
+            # min_scale / max_scale are log2 bounds of the isotropic scale factor
+            scale = 2 ** np.random.uniform(self.min_scale, self.max_scale)
+            scale_x = scale
+            scale_y = scale
+            if np.random.rand() < self.stretch_prob:
+                scale_x *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch)
+                scale_y *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch)
+            scale_x = np.clip(scale_x, clip_min_scale, None)
+            scale_y = np.clip(scale_y, clip_min_scale, None)
+            # rescale the images
+            img1 = cv2.resize(
+                img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR
+            )
+            img2 = cv2.resize(
+                img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR
+            )
+            flow = self._resize_flow(
+                flow, scale_x, scale_y, factor=2.0 if dname == "Spring" else 1.0
+            )
+        elif dname == "Spring":  # the factor-2 reduction is applied even without rescaling
+            flow = self._resize_flow(flow, 1.0, 1.0, factor=2.0)
+
+        if self.h_flip_prob > 0.0 and np.random.rand() < self.h_flip_prob:  # h-flip
+            img1 = img1[:, ::-1]
+            img2 = img2[:, ::-1]
+            flow = flow[:, ::-1] * [-1.0, 1.0]
+
+        if self.v_flip_prob > 0.0 and np.random.rand() < self.v_flip_prob:  # v-flip
+            img1 = img1[::-1, :]
+            img2 = img2[::-1, :]
+            flow = flow[::-1, :] * [1.0, -1.0]
+
+        # Crop offsets: randint excludes its upper bound, hence + 1 to reach the last offset
+        if img1.shape[0] - self.crop_size[0] > 0:
+            y0 = np.random.randint(0, img1.shape[0] - self.crop_size[0] + 1)
+        else:  # no room to crop along y
+            y0 = 0
+        if img1.shape[1] - self.crop_size[1] > 0:
+            x0 = np.random.randint(0, img1.shape[1] - self.crop_size[1] + 1)
+        else:  # no room to crop along x
+            x0 = 0
+
+        img1 = img1[y0 : y0 + self.crop_size[0], x0 : x0 + self.crop_size[1]]
+        img2 = img2[y0 : y0 + self.crop_size[0], x0 : x0 + self.crop_size[1]]
+        flow = flow[y0 : y0 + self.crop_size[0], x0 : x0 + self.crop_size[1]]
+
+        return img1, img2, flow
+
+    def __call__(self, img1, img2, flow, dname):
+        # Geometry first, then color; all three outputs are finally passed through
+        # np.ascontiguousarray.
+        img1, img2, flow = self.spatial_transform(img1, img2, flow, dname)
+        jittered = self.color_transform(img1, img2)
+        img1, img2, flow = map(np.ascontiguousarray, (*jittered, flow))
+        return img1, img2, flow
diff --git a/extern/CUT3R/src/croco/stereoflow/criterion.py b/extern/CUT3R/src/croco/stereoflow/criterion.py
new file mode 100644
index 0000000000000000000000000000000000000000..f041240edb549e32f2eaa1123b07871deb322fd5
--- /dev/null
+++ b/extern/CUT3R/src/croco/stereoflow/criterion.py
@@ -0,0 +1,351 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+# --------------------------------------------------------
+# Losses, metrics per batch, metrics per dataset
+# --------------------------------------------------------
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+
+def _get_gtnorm(gt):
+    # Single-channel (stereo) input is returned unchanged; for flow the result is
+    # the per-pixel Euclidean norm over the channel axis, shape Bx1xHxW.
+    is_stereo = gt.size(1) == 1
+    return gt if is_stereo else torch.sum(gt**2, dim=1, keepdims=True).sqrt()
+
+
+############ losses without confidence
+
+
+class L1Loss(nn.Module):
+    # Plain L1 loss (no confidence), averaged over the finite ground-truth entries.
+    def __init__(self, max_gtnorm=None):
+        super().__init__()
+        self.with_conf = False
+        self.max_gtnorm = max_gtnorm  # optional upper bound on the ground-truth norm
+
+    def _error(self, gt, predictions):
+        return (gt - predictions).abs()
+
+    def forward(self, predictions, gt, inspect=False):
+        if inspect:  # per-element error map, returned without any masking
+            return self._error(gt, predictions)
+        keep = torch.isfinite(gt)
+        if self.max_gtnorm is not None:
+            keep = keep & (_get_gtnorm(gt).expand(-1, gt.size(1), -1, -1) < self.max_gtnorm)
+        return self._error(gt[keep], predictions[keep]).mean()
+
+
+############## losses with confidence
+## there are several parametrizations
+
+
+class LaplacianLoss(nn.Module):  # used for CroCo-Stereo on ETH3D, d'=exp(d)
+    # Laplacian negative log-likelihood in which `conf` acts as the log of the scale.
+    def __init__(self, max_gtnorm=None):
+        super().__init__()
+        self.with_conf = True
+        self.max_gtnorm = max_gtnorm
+
+    def forward(self, predictions, gt, conf):
+        valid = torch.isfinite(gt)[:, 0, :, :]  # validity is read from channel 0 only
+        if self.max_gtnorm is not None:
+            valid = valid & (_get_gtnorm(gt)[:, 0, :, :] < self.max_gtnorm)
+        log_scale = conf.squeeze(1)[valid]
+        # L1 error summed over channels, kept at valid pixels only
+        abs_err = (gt - predictions).abs().sum(dim=1)[valid]
+        # the additive log(2) term is constant and therefore left out
+        nll = abs_err / log_scale.exp() + log_scale
+        return nll.mean()
+
+
+class LaplacianLossBounded(nn.Module):
+    # used for CroCo-Flow ; in the equation of the paper, we have a=1/b
+    # The raw confidence goes through a sigmoid and is mapped to a scale between a and b.
+    def __init__(self, max_gtnorm=10000.0, a=0.25, b=4.0):
+        super().__init__()
+        self.a, self.b = a, b
+        self.with_conf = True
+        self.max_gtnorm = max_gtnorm
+
+    def forward(self, predictions, gt, conf):
+        valid = torch.isfinite(gt)[:, 0, :, :]  # validity is read from channel 0 only
+        if self.max_gtnorm is not None:
+            valid = valid & (_get_gtnorm(gt)[:, 0, :, :] < self.max_gtnorm)
+        scale = (self.b - self.a) * torch.sigmoid(conf.squeeze(1)) + self.a
+        abs_err = (gt - predictions).abs().sum(dim=1)
+        # the log is taken on the full map first, then restricted to valid pixels
+        log_scale = torch.log(scale)[valid]
+        # the additive log(2) term is constant and therefore left out
+        nll = abs_err[valid] / scale[valid] + log_scale
+        return nll.mean()
+
+
+class LaplacianLossBounded2(nn.Module):
+    # used for CroCo-Stereo (except for ETH3D) ; in the equation of the paper, we have a=b
+    # The raw confidence is squashed into a log-scale lying between -a and a.
+    def __init__(self, max_gtnorm=None, a=3.0, b=3.0):
+        super().__init__()
+        self.a, self.b = a, b
+        self.with_conf = True
+        self.max_gtnorm = max_gtnorm
+
+    def forward(self, predictions, gt, conf):
+        valid = torch.isfinite(gt)[:, 0, :, :]  # validity is read from channel 0 only
+        if self.max_gtnorm is not None:
+            valid = valid & (_get_gtnorm(gt)[:, 0, :, :] < self.max_gtnorm)
+        squashed = torch.sigmoid(conf.squeeze(1) / self.b) - 0.5
+        log_scale = (2 * self.a * squashed)[valid]
+        # L1 error summed over channels, kept at valid pixels only
+        abs_err = (gt - predictions).abs().sum(dim=1)[valid]
+        # the additive log(2) term is constant and therefore left out
+        nll = abs_err / log_scale.exp() + log_scale
+        return nll.mean()
+
+
+############## metrics per batch
+
+
+class StereoMetrics(nn.Module):
+    # Per-batch disparity metrics; every value is first computed per sample over
+    # its finite ground-truth pixels and then averaged over the batch.
+    def __init__(self, do_quantile=False):
+        super().__init__()
+        self.do_quantile = do_quantile  # stored only; not read inside this class
+        self.bad_ths = [0.5, 1, 2, 3]
+
+    def forward(self, predictions, gt):
+        batch = predictions.size(0)
+        valid = torch.isfinite(gt)
+        # Work on a copy whose invalid entries hold a finite placeholder, so that
+        # multiplying by the zero mask gives 0 instead of nan.
+        gt_filled = gt.clone()
+        gt_filled[~valid] = 999999.0
+        flat_valid = valid.view(batch, -1)
+        n_valid = flat_valid.sum(dim=1)
+        diff = gt_filled - predictions
+        abs_err = (diff.abs() * valid).view(batch, -1)
+        sq_err = (diff.square() * valid).view(batch, -1)
+        metrics = {
+            "avgerr": torch.mean(abs_err.sum(dim=1) / n_valid),
+            "rmse": torch.sqrt(sq_err.sum(dim=1) / n_valid).mean(dim=0),
+        }
+        for th in self.bad_ths:  # percentage of valid pixels with an error above th
+            bad = ((abs_err > th) * flat_valid).sum(dim=1)
+            metrics["bad@{:.1f}".format(th)] = (bad / n_valid).mean(dim=0) * 100
+        return metrics
+
+
+class FlowMetrics(nn.Module):
+    # Per-batch flow metrics, computed per sample over valid pixels, then averaged.
+    def __init__(self):
+        super().__init__()
+        self.bad_ths = [1, 3, 5]
+
+    def forward(self, predictions, gt):
+        batch = predictions.size(0)
+        valid = torch.isfinite(gt[:, 0, :, :])  # both x and y would be infinite
+        flat_valid = valid.view(batch, -1)
+        n_valid = flat_valid.sum(dim=1)
+        # Finite placeholder at invalid pixels (both channels); masked out of the errors below.
+        gt_filled = gt.clone()
+        for channel in (0, 1):
+            gt_filled[:, channel, :, :][~valid] = 999999.0
+        diff = gt_filled - predictions
+        abs_err = (diff.abs().sum(dim=1) * valid).view(batch, -1)
+        epe = (diff.square().sum(dim=1).sqrt() * valid).view(batch, -1)
+        metrics = {
+            "L1err": torch.mean(abs_err.sum(dim=1) / n_valid),
+            "EPE": torch.mean(epe.sum(dim=1) / n_valid),
+        }
+        for th in self.bad_ths:  # percentage of valid pixels with an EPE above th
+            bad = ((epe > th) * flat_valid).sum(dim=1)
+            metrics["bad@{:.1f}".format(th)] = (bad / n_valid).mean(dim=0) * 100
+        return metrics
+
+
+############## metrics per dataset
+## we update the average and maintain the number of pixels while adding data batch per batch
+## at the beginning, call reset()
+## after each batch, call add_batch(...)
+## at the end: call get_results()
+
+
+class StereoDatasetMetrics(nn.Module):
+    # Running L1 error and bad-pixel rates over a dataset, weighted by valid pixel count.
+    def __init__(self):
+        super().__init__()
+        self.bad_ths = [0.5, 1, 2, 3]
+
+    def reset(self):
+        self.agg_N = 0  # number of pixels so far
+        self.agg_L1err = torch.tensor(0.0)  # L1 error so far
+        self.agg_Nbad = [0 for _ in self.bad_ths]  # counter of bad pixels
+        self._metrics = None
+
+    def add_batch(self, predictions, gt):
+        assert predictions.size(1) == 1, predictions.size()
+        assert gt.size(1) == 1, gt.size()
+        # special case for Spring: gt has twice the resolution of the predictions
+        if gt.size(2) == predictions.size(2) * 2 and gt.size(3) == predictions.size(3) * 2:
+            L1err = torch.minimum(
+                torch.minimum(
+                    torch.minimum(
+                        torch.sum(torch.abs(gt[:, :, 0::2, 0::2] - predictions), dim=1),
+                        torch.sum(torch.abs(gt[:, :, 1::2, 0::2] - predictions), dim=1),
+                    ),
+                    torch.sum(torch.abs(gt[:, :, 0::2, 1::2] - predictions), dim=1),
+                ),
+                torch.sum(torch.abs(gt[:, :, 1::2, 1::2] - predictions), dim=1),
+            )
+            valid = torch.isfinite(L1err)
+        else:
+            valid = torch.isfinite(gt[:, 0, :, :])  # gt has a single channel (asserted above)
+            L1err = torch.sum(torch.abs(gt - predictions), dim=1)
+        N = valid.sum()
+        if N == 0:  # no valid pixel: the mean of an empty selection is nan and would poison the average
+            return
+        Nnew = self.agg_N + N
+        self.agg_L1err = (
+            float(self.agg_N) / Nnew * self.agg_L1err
+            + L1err[valid].mean().cpu() * float(N) / Nnew
+        )
+        self.agg_N = Nnew
+        for i, th in enumerate(self.bad_ths):
+            self.agg_Nbad[i] += (L1err[valid] > th).sum().cpu()
+
+    def _compute_metrics(self):
+        if self._metrics is not None:
+            return
+        out = {}
+        out["L1err"] = self.agg_L1err.item()
+        for i, th in enumerate(self.bad_ths):
+            out["bad@{:.1f}".format(th)] = (
+                float(self.agg_Nbad[i]) / self.agg_N
+            ).item() * 100.0
+        self._metrics = out
+
+    def get_results(self):
+        self._compute_metrics()  # to avoid recompute them multiple times
+        return self._metrics
+
+
+class FlowDatasetMetrics(nn.Module):
+    # Running L1 / EPE, bad-pixel rates and EPE per gt-speed bin, weighted by valid pixel count.
+    def __init__(self):
+        super().__init__()
+        self.bad_ths = [0.5, 1, 3, 5]
+        self.speed_ths = [(0, 10), (10, 40), (40, torch.inf)]
+
+    def reset(self):
+        self.agg_N = 0  # number of pixels so far
+        self.agg_L1err = torch.tensor(0.0)  # L1 error so far
+        self.agg_L2err = torch.tensor(0.0)  # L2 (=EPE) error so far
+        self.agg_Nbad = [0 for _ in self.bad_ths]  # counter of bad pixels
+        self.agg_EPEspeed = [
+            torch.tensor(0.0) for _ in self.speed_ths
+        ]  # EPE per speed bin so far
+        self.agg_Nspeed = [0 for _ in self.speed_ths]  # N pixels per speed bin so far
+        self._metrics = None
+        self.pairname_results = {}
+
+    def add_batch(self, predictions, gt):
+        assert predictions.size(1) == 2, predictions.size()
+        assert gt.size(1) == 2, gt.size()
+        # special case for Spring: gt has twice the resolution of the predictions
+        if gt.size(2) == predictions.size(2) * 2 and gt.size(3) == predictions.size(3) * 2:
+            L1err = torch.minimum(
+                torch.minimum(
+                    torch.minimum(
+                        torch.sum(torch.abs(gt[:, :, 0::2, 0::2] - predictions), dim=1),
+                        torch.sum(torch.abs(gt[:, :, 1::2, 0::2] - predictions), dim=1),
+                    ),
+                    torch.sum(torch.abs(gt[:, :, 0::2, 1::2] - predictions), dim=1),
+                ),
+                torch.sum(torch.abs(gt[:, :, 1::2, 1::2] - predictions), dim=1),
+            )
+            L2err = torch.minimum(
+                torch.minimum(
+                    torch.minimum(
+                        torch.sqrt(
+                            torch.sum(
+                                torch.square(gt[:, :, 0::2, 0::2] - predictions), dim=1
+                            )
+                        ),
+                        torch.sqrt(
+                            torch.sum(
+                                torch.square(gt[:, :, 1::2, 0::2] - predictions), dim=1
+                            )
+                        ),
+                    ),
+                    torch.sqrt(
+                        torch.sum(
+                            torch.square(gt[:, :, 0::2, 1::2] - predictions), dim=1
+                        )
+                    ),
+                ),
+                torch.sqrt(
+                    torch.sum(torch.square(gt[:, :, 1::2, 1::2] - predictions), dim=1)
+                ),
+            )
+            valid = torch.isfinite(L1err)
+            gtspeed = (
+                torch.sqrt(torch.sum(torch.square(gt[:, :, 0::2, 0::2]), dim=1))
+                + torch.sqrt(torch.sum(torch.square(gt[:, :, 0::2, 1::2]), dim=1))
+                + torch.sqrt(torch.sum(torch.square(gt[:, :, 1::2, 0::2]), dim=1))
+                + torch.sqrt(torch.sum(torch.square(gt[:, :, 1::2, 1::2]), dim=1))
+            ) / 4.0  # let's just average them
+        else:
+            valid = torch.isfinite(gt[:, 0, :, :])  # both x and y would be infinite
+            L1err = torch.sum(torch.abs(gt - predictions), dim=1)
+            L2err = torch.sqrt(torch.sum(torch.square(gt - predictions), dim=1))
+            gtspeed = torch.sqrt(torch.sum(torch.square(gt), dim=1))
+        N = valid.sum()
+        if N == 0:  # no valid pixel: the mean of an empty selection is nan and would poison the averages
+            return
+        Nnew = self.agg_N + N
+        self.agg_L1err = (
+            float(self.agg_N) / Nnew * self.agg_L1err
+            + L1err[valid].mean().cpu() * float(N) / Nnew
+        )
+        self.agg_L2err = (
+            float(self.agg_N) / Nnew * self.agg_L2err
+            + L2err[valid].mean().cpu() * float(N) / Nnew
+        )
+        self.agg_N = Nnew
+        for i, th in enumerate(self.bad_ths):
+            self.agg_Nbad[i] += (L2err[valid] > th).sum().cpu()
+        for i, (th1, th2) in enumerate(self.speed_ths):
+            vv = (gtspeed[valid] >= th1) * (gtspeed[valid] < th2)
+            iNspeed = vv.sum()
+            if iNspeed == 0:
+                continue
+            iNnew = self.agg_Nspeed[i] + iNspeed
+            self.agg_EPEspeed[i] = (
+                float(self.agg_Nspeed[i]) / iNnew * self.agg_EPEspeed[i]
+                + float(iNspeed) / iNnew * L2err[valid][vv].mean().cpu()
+            )
+            self.agg_Nspeed[i] = iNnew
+
+    def _compute_metrics(self):
+        if self._metrics is not None:
+            return
+        out = {}
+        out["L1err"] = self.agg_L1err.item()
+        out["EPE"] = self.agg_L2err.item()
+        for i, th in enumerate(self.bad_ths):
+            out["bad@{:.1f}".format(th)] = (
+                float(self.agg_Nbad[i]) / self.agg_N
+            ).item() * 100.0
+        for i, (th1, th2) in enumerate(self.speed_ths):
+            out["s{:d}{:s}".format(th1, "-" + str(th2) if th2 < torch.inf else "+")] = (
+                self.agg_EPEspeed[i].item()
+            )
+        self._metrics = out
+
+    def get_results(self):
+        self._compute_metrics()  # to avoid recompute them multiple times
+        return self._metrics
diff --git a/extern/CUT3R/src/croco/stereoflow/datasets_flow.py b/extern/CUT3R/src/croco/stereoflow/datasets_flow.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5b1bc603b97a18e1245ec1756b74a9424d53ead
--- /dev/null
+++ b/extern/CUT3R/src/croco/stereoflow/datasets_flow.py
@@ -0,0 +1,936 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+# --------------------------------------------------------
+# Dataset structure for flow
+# --------------------------------------------------------
+
+import os
+import os.path as osp
+import pickle
+import numpy as np
+import struct
+from PIL import Image
+import json
+import h5py
+import torch
+from torch.utils import data
+
+from .augmentor import FlowAugmentor
+from .datasets_stereo import _read_img, img_to_tensor, dataset_to_root, _read_pfm
+from copy import deepcopy
+
+dataset_to_root = deepcopy(dataset_to_root)
+
+dataset_to_root.update(
+    **{
+        "TartanAir": "./data/stereoflow/TartanAir",
+        "FlyingChairs": "./data/stereoflow/FlyingChairs/",
+        "FlyingThings": osp.join(dataset_to_root["SceneFlow"], "FlyingThings") + "/",
+        "MPISintel": "./data/stereoflow//MPI-Sintel/" + "/",
+    }
+)
+cache_dir = "./data/stereoflow/datasets_flow_cache/"
+
+
+def flow_to_tensor(disp):
+    return torch.from_numpy(disp).float().permute(2, 0, 1)
+
+
+class FlowDataset(data.Dataset):
+
+    def __init__(self, split, augmentor=False, crop_size=None, totensor=True):
+        self.split = split
+        if not augmentor:
+            assert crop_size is None
+        if crop_size is not None:
+            assert augmentor
+        self.crop_size = crop_size
+        self.augmentor_str = augmentor
+        self.augmentor = FlowAugmentor(crop_size) if augmentor else None
+        self.totensor = totensor
+        self.rmul = 1  # keep track of rmul
+        self.has_constant_resolution = True  # whether the dataset has constant resolution or not (=> don't use batch_size>1 at test time)
+        self._prepare_data()
+        self._load_or_build_cache()
+
+    def prepare_data(self):
+        """
+        to be defined for each dataset
+        """
+        raise NotImplementedError
+
+    def __len__(self):
+        return len(
+            self.pairnames
+        )  # each pairname is typically of the form (str, int1, int2)
+
+    def __getitem__(self, index):
+        pairname = self.pairnames[index]
+
+        # get filenames
+        img1name = self.pairname_to_img1name(pairname)
+        img2name = self.pairname_to_img2name(pairname)
+        flowname = (
+            self.pairname_to_flowname(pairname)
+            if self.pairname_to_flowname is not None
+            else None
+        )
+
+        # load images and disparities
+        img1 = _read_img(img1name)
+        img2 = _read_img(img2name)
+        flow = self.load_flow(flowname) if flowname is not None else None
+
+        # apply augmentations
+        if self.augmentor is not None:
+            img1, img2, flow = self.augmentor(img1, img2, flow, self.name)
+
+        if self.totensor:
+            img1 = img_to_tensor(img1)
+            img2 = img_to_tensor(img2)
+            if flow is not None:
+                flow = flow_to_tensor(flow)
+            else:
+                flow = torch.tensor(
+                    []
+                )  # to allow dataloader batching with default collate_gn
+            pairname = str(
+                pairname
+            )  # transform potential tuple to str to be able to batch it
+
+        return img1, img2, flow, pairname
+
+    def __rmul__(self, v):
+        self.rmul *= v
+        self.pairnames = v * self.pairnames
+        return self
+
+    def __str__(self):
+        return f"{self.__class__.__name__}_{self.split}"
+
+    def __repr__(self):
+        s = f"{self.__class__.__name__}(split={self.split}, augmentor={self.augmentor_str}, crop_size={str(self.crop_size)}, totensor={self.totensor})"
+        if self.rmul == 1:
+            s += f"\n\tnum pairs: {len(self.pairnames)}"
+        else:
+            s += f"\n\tnum pairs: {len(self.pairnames)} ({len(self.pairnames)//self.rmul}x{self.rmul})"
+        return s
+
+    def _set_root(self):
+        self.root = dataset_to_root[self.name]
+        assert os.path.isdir(
+            self.root
+        ), f"could not find root directory for dataset {self.name}: {self.root}"
+
+    def _load_or_build_cache(self):
+        cache_file = osp.join(cache_dir, self.name + ".pkl")
+        if osp.isfile(cache_file):
+            with open(cache_file, "rb") as fid:
+                self.pairnames = pickle.load(fid)[self.split]
+        else:
+            tosave = self._build_cache()
+            os.makedirs(cache_dir, exist_ok=True)
+            with open(cache_file, "wb") as fid:
+                pickle.dump(tosave, fid)
+            self.pairnames = tosave[self.split]
+
+
+class TartanAirDataset(FlowDataset):
+
+    def _prepare_data(self):
+        self.name = "TartanAir"
+        self._set_root()
+        assert self.split in ["train"]
+        self.pairname_to_img1name = lambda pairname: osp.join(
+            self.root, pairname[0], "image_left/{:06d}_left.png".format(pairname[1])
+        )
+        self.pairname_to_img2name = lambda pairname: osp.join(
+            self.root, pairname[0], "image_left/{:06d}_left.png".format(pairname[2])
+        )
+        self.pairname_to_flowname = lambda pairname: osp.join(
+            self.root,
+            pairname[0],
+            "flow/{:06d}_{:06d}_flow.npy".format(pairname[1], pairname[2]),
+        )
+        self.pairname_to_str = lambda pairname: os.path.join(
+            pairname[0][pairname[0].find("/") + 1 :],
+            "{:06d}_{:06d}".format(pairname[1], pairname[2]),
+        )
+        self.load_flow = _read_numpy_flow
+
+    def _build_cache(self):
+        seqs = sorted(os.listdir(self.root))
+        pairs = [
+            (osp.join(s, s, difficulty, Pxxx), int(a[:6]), int(a[:6]) + 1)
+            for s in seqs
+            for difficulty in ["Easy", "Hard"]
+            for Pxxx in sorted(os.listdir(osp.join(self.root, s, s, difficulty)))
+            for a in sorted(
+                os.listdir(osp.join(self.root, s, s, difficulty, Pxxx, "image_left/"))
+            )[:-1]
+        ]
+        assert len(pairs) == 306268, "incorrect parsing of pairs in TartanAir"
+        tosave = {"train": pairs}
+        return tosave
+
+
+class FlyingChairsDataset(FlowDataset):
+
+    def _prepare_data(self):
+        self.name = "FlyingChairs"
+        self._set_root()
+        assert self.split in ["train", "val"]
+        self.pairname_to_img1name = lambda pairname: osp.join(
+            self.root, "data", pairname + "_img1.ppm"
+        )
+        self.pairname_to_img2name = lambda pairname: osp.join(
+            self.root, "data", pairname + "_img2.ppm"
+        )
+        self.pairname_to_flowname = lambda pairname: osp.join(
+            self.root, "data", pairname + "_flow.flo"
+        )
+        self.pairname_to_str = lambda pairname: pairname
+        self.load_flow = _read_flo_file
+
+    def _build_cache(self):
+        split_file = osp.join(self.root, "chairs_split.txt")
+        split_list = np.loadtxt(split_file, dtype=np.int32)
+        trainpairs = ["{:05d}".format(i) for i in np.where(split_list == 1)[0] + 1]
+        valpairs = ["{:05d}".format(i) for i in np.where(split_list == 2)[0] + 1]
+        assert (
+            len(trainpairs) == 22232 and len(valpairs) == 640
+        ), "incorrect parsing of pairs in MPI-Sintel"
+        tosave = {"train": trainpairs, "val": valpairs}
+        return tosave
+
+
+class FlyingThingsDataset(FlowDataset):
+
+    def _prepare_data(self):
+        self.name = "FlyingThings"
+        self._set_root()
+        assert self.split in [
+            f"{set_}_{pass_}pass{camstr}"
+            for set_ in ["train", "test", "test1024"]
+            for camstr in ["", "_rightcam"]
+            for pass_ in ["clean", "final", "all"]
+        ]
+        self.pairname_to_img1name = lambda pairname: osp.join(
+            self.root,
+            f"frames_{pairname[3]}pass",
+            pairname[0].replace("into_future", "").replace("into_past", ""),
+            "{:04d}.png".format(pairname[1]),
+        )
+        self.pairname_to_img2name = lambda pairname: osp.join(
+            self.root,
+            f"frames_{pairname[3]}pass",
+            pairname[0].replace("into_future", "").replace("into_past", ""),
+            "{:04d}.png".format(pairname[2]),
+        )
+        self.pairname_to_flowname = lambda pairname: osp.join(
+            self.root,
+            "optical_flow",
+            pairname[0],
+            "OpticalFlowInto{f:s}_{i:04d}_{c:s}.pfm".format(
+                f="Future" if "future" in pairname[0] else "Past",
+                i=pairname[1],
+                c="L" if "left" in pairname[0] else "R",
+            ),
+        )
+        self.pairname_to_str = lambda pairname: os.path.join(
+            pairname[3] + "pass",
+            pairname[0],
+            "Into{f:s}_{i:04d}_{c:s}".format(
+                f="Future" if "future" in pairname[0] else "Past",
+                i=pairname[1],
+                c="L" if "left" in pairname[0] else "R",
+            ),
+        )
+        self.load_flow = _read_pfm_flow
+
+    def _build_cache(self):
+        tosave = {}
+        # train and test splits for the different passes
+        for set_ in ["train", "test"]:
+            sroot = osp.join(self.root, "optical_flow", set_.upper())
+            fname_to_i = lambda f: int(
+                f[len("OpticalFlowIntoFuture_") : -len("_L.pfm")]
+            )
+            pp = [
+                (osp.join(set_.upper(), d, s, "into_future/left"), fname_to_i(fname))
+                for d in sorted(os.listdir(sroot))
+                for s in sorted(os.listdir(osp.join(sroot, d)))
+                for fname in sorted(
+                    os.listdir(osp.join(sroot, d, s, "into_future/left"))
+                )[:-1]
+            ]
+            pairs = [(a, i, i + 1) for a, i in pp]
+            pairs += [(a.replace("into_future", "into_past"), i + 1, i) for a, i in pp]
+            assert (
+                len(pairs) == {"train": 40302, "test": 7866}[set_]
+            ), "incorrect parsing of pairs Flying Things"
+            for cam in ["left", "right"]:
+                camstr = "" if cam == "left" else f"_{cam}cam"
+                for pass_ in ["final", "clean"]:
+                    tosave[f"{set_}_{pass_}pass{camstr}"] = [
+                        (a.replace("left", cam), i, j, pass_) for a, i, j in pairs
+                    ]
+                tosave[f"{set_}_allpass{camstr}"] = (
+                    tosave[f"{set_}_cleanpass{camstr}"]
+                    + tosave[f"{set_}_finalpass{camstr}"]
+                )
+        # test1024: this is the same split as unimatch 'validation' split
+        # see https://github.com/autonomousvision/unimatch/blob/master/dataloader/flow/datasets.py#L229
+        test1024_nsamples = 1024
+        alltest_nsamples = len(tosave["test_cleanpass"])  # 7866
+        stride = alltest_nsamples // test1024_nsamples
+        remove = alltest_nsamples % test1024_nsamples
+        for cam in ["left", "right"]:
+            camstr = "" if cam == "left" else f"_{cam}cam"
+            for pass_ in ["final", "clean"]:
+                tosave[f"test1024_{pass_}pass{camstr}"] = sorted(
+                    tosave[f"test_{pass_}pass{camstr}"]
+                )[:-remove][
+                    ::stride
+                ]  # warning, it was not sorted before
+            assert (
+                len(tosave["test1024_cleanpass"]) == 1024
+            ), "incorrect parsing of pairs in Flying Things"
+            tosave[f"test1024_allpass{camstr}"] = (
+                tosave[f"test1024_cleanpass{camstr}"]
+                + tosave[f"test1024_finalpass{camstr}"]
+            )
+        return tosave
+
+
+class MPISintelDataset(FlowDataset):
+
+    def _prepare_data(self):
+        self.name = "MPISintel"
+        self._set_root()
+        assert self.split in [
+            s + "_" + p
+            for s in ["train", "test", "subval", "subtrain"]
+            for p in ["cleanpass", "finalpass", "allpass"]
+        ]
+        self.pairname_to_img1name = lambda pairname: osp.join(
+            self.root, pairname[0], "frame_{:04d}.png".format(pairname[1])
+        )
+        self.pairname_to_img2name = lambda pairname: osp.join(
+            self.root, pairname[0], "frame_{:04d}.png".format(pairname[1] + 1)
+        )
+        self.pairname_to_flowname = lambda pairname: (
+            None
+            if pairname[0].startswith("test/")
+            else osp.join(
+                self.root,
+                pairname[0].replace("/clean/", "/flow/").replace("/final/", "/flow/"),
+                "frame_{:04d}.flo".format(pairname[1]),
+            )
+        )
+        self.pairname_to_str = lambda pairname: osp.join(
+            pairname[0], "frame_{:04d}".format(pairname[1])
+        )
+        self.load_flow = _read_flo_file
+
+    def _build_cache(self):
+        trainseqs = sorted(os.listdir(self.root + "training/clean"))
+        trainpairs = [
+            (osp.join("training/clean", s), i)
+            for s in trainseqs
+            for i in range(1, len(os.listdir(self.root + "training/clean/" + s)))
+        ]
+        subvalseqs = ["temple_2", "temple_3"]
+        subtrainseqs = [s for s in trainseqs if s not in subvalseqs]
+        subvalpairs = [(p, i) for p, i in trainpairs if any(s in p for s in subvalseqs)]
+        subtrainpairs = [
+            (p, i) for p, i in trainpairs if any(s in p for s in subtrainseqs)
+        ]
+        testseqs = sorted(os.listdir(self.root + "test/clean"))
+        testpairs = [
+            (osp.join("test/clean", s), i)
+            for s in testseqs
+            for i in range(1, len(os.listdir(self.root + "test/clean/" + s)))
+        ]
+        assert (
+            len(trainpairs) == 1041
+            and len(testpairs) == 552
+            and len(subvalpairs) == 98
+            and len(subtrainpairs) == 943
+        ), "incorrect parsing of pairs in MPI-Sintel"
+        tosave = {}
+        tosave["train_cleanpass"] = trainpairs
+        tosave["test_cleanpass"] = testpairs
+        tosave["subval_cleanpass"] = subvalpairs
+        tosave["subtrain_cleanpass"] = subtrainpairs
+        for t in ["train", "test", "subval", "subtrain"]:
+            tosave[t + "_finalpass"] = [
+                (p.replace("/clean/", "/final/"), i)
+                for p, i in tosave[t + "_cleanpass"]
+            ]
+            tosave[t + "_allpass"] = tosave[t + "_cleanpass"] + tosave[t + "_finalpass"]
+        return tosave
+
+    def submission_save_pairname(self, pairname, prediction, outdir, _time):
+        assert prediction.shape[2] == 2
+        outfile = os.path.join(
+            outdir, "submission", self.pairname_to_str(pairname) + ".flo"
+        )
+        os.makedirs(os.path.dirname(outfile), exist_ok=True)
+        writeFlowFile(prediction, outfile)
+
+    def finalize_submission(self, outdir):
+        assert self.split == "test_allpass"
+        bundle_exe = "/nfs/data/ffs-3d/datasets/StereoFlow/MPI-Sintel/bundler/linux-x64/bundler"  # eg <bundle_exe> <path_to_results_for_clean> <path_to_results_for_final> <output/bundled.lzma>
+        if os.path.isfile(bundle_exe):
+            cmd = f'{bundle_exe} "{outdir}/submission/test/clean/" "{outdir}/submission/test/final" "{outdir}/submission/bundled.lzma"'
+            print(cmd)
+            os.system(cmd)
+            print(f'Done. Submission file at: "{outdir}/submission/bundled.lzma"')
+        else:
+            print("Could not find bundler executable for submission.")
+            print("Please download it and run:")
+            print(
+                f'<bundle_exe> "{outdir}/submission/test/clean/" "{outdir}/submission/test/final" "{outdir}/submission/bundled.lzma"'
+            )
+
+
+class SpringDataset(FlowDataset):
+
+    def _prepare_data(self):
+        self.name = "Spring"
+        self._set_root()
+        assert self.split in ["train", "test", "subtrain", "subval"]
+        self.pairname_to_img1name = lambda pairname: osp.join(
+            self.root,
+            pairname[0],
+            pairname[1],
+            "frame_" + pairname[3],
+            "frame_{:s}_{:04d}.png".format(pairname[3], pairname[4]),
+        )
+        self.pairname_to_img2name = lambda pairname: osp.join(
+            self.root,
+            pairname[0],
+            pairname[1],
+            "frame_" + pairname[3],
+            "frame_{:s}_{:04d}.png".format(
+                pairname[3], pairname[4] + (1 if pairname[2] == "FW" else -1)
+            ),
+        )
+        self.pairname_to_flowname = lambda pairname: (
+            None
+            if pairname[0] == "test"
+            else osp.join(
+                self.root,
+                pairname[0],
+                pairname[1],
+                f"flow_{pairname[2]}_{pairname[3]}",
+                f"flow_{pairname[2]}_{pairname[3]}_{pairname[4]:04d}.flo5",
+            )
+        )
+        self.pairname_to_str = lambda pairname: osp.join(
+            pairname[0],
+            pairname[1],
+            f"flow_{pairname[2]}_{pairname[3]}",
+            f"flow_{pairname[2]}_{pairname[3]}_{pairname[4]:04d}",
+        )
+        self.load_flow = _read_hdf5_flow
+
+    def _build_cache(self):
+        # train
+        trainseqs = sorted(os.listdir(osp.join(self.root, "train")))
+        trainpairs = []
+        for leftright in ["left", "right"]:
+            for fwbw in ["FW", "BW"]:
+                trainpairs += [
+                    (
+                        "train",
+                        s,
+                        fwbw,
+                        leftright,
+                        int(f[len(f"flow_{fwbw}_{leftright}_") : -len(".flo5")]),
+                    )
+                    for s in trainseqs
+                    for f in sorted(
+                        os.listdir(
+                            osp.join(self.root, "train", s, f"flow_{fwbw}_{leftright}")
+                        )
+                    )
+                ]
+        # test
+        testseqs = sorted(os.listdir(osp.join(self.root, "test")))
+        testpairs = []
+        for leftright in ["left", "right"]:
+            testpairs += [
+                (
+                    "test",
+                    s,
+                    "FW",
+                    leftright,
+                    int(f[len(f"frame_{leftright}_") : -len(".png")]),
+                )
+                for s in testseqs
+                for f in sorted(
+                    os.listdir(osp.join(self.root, "test", s, f"frame_{leftright}"))
+                )[:-1]
+            ]
+            testpairs += [
+                (
+                    "test",
+                    s,
+                    "BW",
+                    leftright,
+                    int(f[len(f"frame_{leftright}_") : -len(".png")]) + 1,
+                )
+                for s in testseqs
+                for f in sorted(
+                    os.listdir(osp.join(self.root, "test", s, f"frame_{leftright}"))
+                )[:-1]
+            ]
+        # subtrain / subval
+        subtrainpairs = [p for p in trainpairs if p[1] != "0041"]
+        subvalpairs = [p for p in trainpairs if p[1] == "0041"]
+        assert (
+            len(trainpairs) == 19852
+            and len(testpairs) == 3960
+            and len(subtrainpairs) == 19472
+            and len(subvalpairs) == 380
+        ), "incorrect parsing of pairs in Spring"
+        tosave = {
+            "train": trainpairs,
+            "test": testpairs,
+            "subtrain": subtrainpairs,
+            "subval": subvalpairs,
+        }
+        return tosave
+
+    def submission_save_pairname(self, pairname, prediction, outdir, time):
+        assert prediction.ndim == 3
+        assert prediction.shape[2] == 2
+        assert prediction.dtype == np.float32
+        outfile = osp.join(
+            outdir,
+            pairname[0],
+            pairname[1],
+            f"flow_{pairname[2]}_{pairname[3]}",
+            f"flow_{pairname[2]}_{pairname[3]}_{pairname[4]:04d}.flo5",
+        )
+        os.makedirs(os.path.dirname(outfile), exist_ok=True)
+        writeFlo5File(prediction, outfile)
+
+    def finalize_submission(self, outdir):
+        assert self.split == "test"
+        exe = "{self.root}/flow_subsampling"
+        if os.path.isfile(exe):
+            cmd = f'cd "{outdir}/test"; {exe} .'
+            print(cmd)
+            os.system(cmd)
+            print(f"Done. Submission file at {outdir}/test/flow_submission.hdf5")
+        else:
+            print("Could not find flow_subsampling executable for submission.")
+            print("Please download it and run:")
+            print(f'cd "{outdir}/test"; <flow_subsampling_exe> .')
+
+
+class Kitti12Dataset(FlowDataset):
+
+    def _prepare_data(self):
+        self.name = "Kitti12"
+        self._set_root()
+        assert self.split in ["train", "test"]
+        self.pairname_to_img1name = lambda pairname: osp.join(
+            self.root, pairname + "_10.png"
+        )
+        self.pairname_to_img2name = lambda pairname: osp.join(
+            self.root, pairname + "_11.png"
+        )
+        self.pairname_to_flowname = (
+            None
+            if self.split == "test"
+            else lambda pairname: osp.join(
+                self.root, pairname.replace("/colored_0/", "/flow_occ/") + "_10.png"
+            )
+        )
+        self.pairname_to_str = lambda pairname: pairname.replace("/colored_0/", "/")
+        self.load_flow = _read_kitti_flow
+
+    def _build_cache(self):
+        trainseqs = ["training/colored_0/%06d" % (i) for i in range(194)]
+        testseqs = ["testing/colored_0/%06d" % (i) for i in range(195)]
+        assert (
+            len(trainseqs) == 194 and len(testseqs) == 195
+        ), "incorrect parsing of pairs in Kitti12"
+        tosave = {"train": trainseqs, "test": testseqs}
+        return tosave
+
+    def submission_save_pairname(self, pairname, prediction, outdir, time):
+        assert prediction.ndim == 3
+        assert prediction.shape[2] == 2
+        outfile = os.path.join(outdir, pairname.split("/")[-1] + "_10.png")
+        os.makedirs(os.path.dirname(outfile), exist_ok=True)
+        writeFlowKitti(outfile, prediction)
+
+    def finalize_submission(self, outdir):
+        assert self.split == "test"
+        cmd = f'cd {outdir}/; zip -r "kitti12_flow_results.zip" .'
+        print(cmd)
+        os.system(cmd)
+        print(f"Done. Submission file at {outdir}/kitti12_flow_results.zip")
+
+
+class Kitti15Dataset(FlowDataset):
+
+    def _prepare_data(self):
+        self.name = "Kitti15"
+        self._set_root()
+        assert self.split in ["train", "subtrain", "subval", "test"]
+        self.pairname_to_img1name = lambda pairname: osp.join(
+            self.root, pairname + "_10.png"
+        )
+        self.pairname_to_img2name = lambda pairname: osp.join(
+            self.root, pairname + "_11.png"
+        )
+        self.pairname_to_flowname = (
+            None
+            if self.split == "test"
+            else lambda pairname: osp.join(
+                self.root, pairname.replace("/image_2/", "/flow_occ/") + "_10.png"
+            )
+        )
+        self.pairname_to_str = lambda pairname: pairname.replace("/image_2/", "/")
+        self.load_flow = _read_kitti_flow
+
+    def _build_cache(self):
+        trainseqs = ["training/image_2/%06d" % (i) for i in range(200)]
+        subtrainseqs = trainseqs[:-10]
+        subvalseqs = trainseqs[-10:]
+        testseqs = ["testing/image_2/%06d" % (i) for i in range(200)]
+        assert (
+            len(trainseqs) == 200
+            and len(subtrainseqs) == 190
+            and len(subvalseqs) == 10
+            and len(testseqs) == 200
+        ), "incorrect parsing of pairs in Kitti15"
+        tosave = {
+            "train": trainseqs,
+            "subtrain": subtrainseqs,
+            "subval": subvalseqs,
+            "test": testseqs,
+        }
+        return tosave
+
+    def submission_save_pairname(self, pairname, prediction, outdir, time):
+        assert prediction.ndim == 3
+        assert prediction.shape[2] == 2
+        outfile = os.path.join(outdir, "flow", pairname.split("/")[-1] + "_10.png")
+        os.makedirs(os.path.dirname(outfile), exist_ok=True)
+        writeFlowKitti(outfile, prediction)
+
+    def finalize_submission(self, outdir):
+        assert self.split == "test"
+        cmd = f'cd {outdir}/; zip -r "kitti15_flow_results.zip" flow'
+        print(cmd)
+        os.system(cmd)
+        print(f"Done. Submission file at {outdir}/kitti15_flow_results.zip")
+
+
+import cv2
+
+
+def _read_numpy_flow(filename):
+    return np.load(filename)
+
+
+def _read_pfm_flow(filename):
+    f, _ = _read_pfm(filename)
+    assert np.all(f[:, :, 2] == 0.0)
+    return np.ascontiguousarray(f[:, :, :2])
+
+
+TAG_FLOAT = 202021.25  # tag to check the sanity of the file
+TAG_STRING = "PIEH"  # string containing the tag
+MIN_WIDTH = 1
+MAX_WIDTH = 99999
+MIN_HEIGHT = 1
+MAX_HEIGHT = 99999
+
+
+def readFlowFile(filename):
+    """
+    readFlowFile(<FILENAME>) reads a flow file <FILENAME> into a 2-band np.array.
+    if <FILENAME> does not exist, an IOError is raised.
+    if <FILENAME> does not finish by '.flo' or the tag, the width, the height or the file's size is illegal, an Expcetion is raised.
+    ---- PARAMETERS ----
+        filename: string containg the name of the file to read a flow
+    ---- OUTPUTS ----
+        a np.array of dimension (height x width x 2) containing the flow of type 'float32'
+    """
+
+    # check filename
+    if not filename.endswith(".flo"):
+        raise Exception(
+            "readFlowFile({:s}): filename must finish with '.flo'".format(filename)
+        )
+
+    # open the file and read it
+    with open(filename, "rb") as f:
+        # check tag
+        tag = struct.unpack("f", f.read(4))[0]
+        if tag != TAG_FLOAT:
+            raise Exception("flow_utils.readFlowFile({:s}): wrong tag".format(filename))
+        # read dimension
+        w, h = struct.unpack("ii", f.read(8))
+        if w < MIN_WIDTH or w > MAX_WIDTH:
+            raise Exception(
+                "flow_utils.readFlowFile({:s}: illegal width {:d}".format(filename, w)
+            )
+        if h < MIN_HEIGHT or h > MAX_HEIGHT:
+            raise Exception(
+                "flow_utils.readFlowFile({:s}: illegal height {:d}".format(filename, h)
+            )
+        flow = np.fromfile(f, "float32")
+        if not flow.shape == (h * w * 2,):
+            raise Exception(
+                "flow_utils.readFlowFile({:s}: illegal size of the file".format(
+                    filename
+                )
+            )
+        flow.shape = (h, w, 2)
+        return flow
+
+
+def writeFlowFile(flow, filename):
+    """
+    writeFlowFile(flow,<FILENAME>) write flow to the file <FILENAME>.
+    if <FILENAME> does not exist, an IOError is raised.
+    if <FILENAME> does not finish with '.flo' or the flow has not 2 bands, an Exception is raised.
+    ---- PARAMETERS ----
+        flow: np.array of dimension (height x width x 2) containing the flow to write
+        filename: string containg the name of the file to write a flow
+    """
+
+    # check filename
+    if not filename.endswith(".flo"):
+        raise Exception(
+            "flow_utils.writeFlowFile(<flow>,{:s}): filename must finish with '.flo'".format(
+                filename
+            )
+        )
+
+    if not flow.shape[2:] == (2,):
+        raise Exception(
+            "flow_utils.writeFlowFile(<flow>,{:s}): <flow> must have 2 bands".format(
+                filename
+            )
+        )
+
+    # open the file and write it
+    with open(filename, "wb") as f:
+        # write TAG
+        f.write(TAG_STRING.encode("utf-8"))
+        # write dimension
+        f.write(struct.pack("ii", flow.shape[1], flow.shape[0]))
+        # write the flow
+
+        flow.astype(np.float32).tofile(f)
+
+
+_read_flo_file = readFlowFile
+
+
+def _read_kitti_flow(filename):
+    flow = cv2.imread(filename, cv2.IMREAD_ANYDEPTH | cv2.IMREAD_COLOR)
+    flow = flow[:, :, ::-1].astype(np.float32)
+    valid = flow[:, :, 2] > 0
+    flow = flow[:, :, :2]
+    flow = (flow - 2**15) / 64.0
+    flow[~valid, 0] = np.inf
+    flow[~valid, 1] = np.inf
+    return flow
+
+
+_read_hd1k_flow = _read_kitti_flow
+
+
+def writeFlowKitti(filename, uv):
+    uv = 64.0 * uv + 2**15
+    valid = np.ones([uv.shape[0], uv.shape[1], 1])
+    uv = np.concatenate([uv, valid], axis=-1).astype(np.uint16)
+    cv2.imwrite(filename, uv[..., ::-1])
+
+
+def writeFlo5File(flow, filename):
+    with h5py.File(filename, "w") as f:
+        f.create_dataset("flow", data=flow, compression="gzip", compression_opts=5)
+
+
+def _read_hdf5_flow(filename):
+    flow = np.asarray(h5py.File(filename)["flow"])
+    flow[np.isnan(flow)] = np.inf  # make invalid values as +inf
+    return flow.astype(np.float32)
+
+
+# flow visualization
+# Number of colorwheel entries in each of the six segments built by
+# _computeColor (red-yellow, yellow-green, green-cyan, cyan-blue,
+# blue-magenta, magenta-red); their sum is the size of the colorwheel.
+RY = 15
+YG = 6
+GC = 4
+CB = 11
+BM = 13
+MR = 6
+# flowToColor treats a pixel as unknown when its largest absolute flow
+# component exceeds this value
+UNKNOWN_THRESH = 1e9
+
+
+def colorTest():
+    """
+    flow_utils.colorTest(): display an example of image showing the color encoding scheme
+    """
+    import matplotlib.pylab as plt
+
+    truerange = 1
+    h, w = 151, 151
+    trange = truerange * 1.04
+    s2 = round(h / 2)
+    x, y = np.meshgrid(range(w), range(h))
+    u = x * trange / s2 - trange
+    v = y * trange / s2 - trange
+    img = _computeColor(
+        np.concatenate((u[:, :, np.newaxis], v[:, :, np.newaxis]), 2)
+        / trange
+        / np.sqrt(2)
+    )
+    plt.imshow(img)
+    plt.axis("off")
+    plt.axhline(round(h / 2), color="k")
+    plt.axvline(round(w / 2), color="k")
+
+
+def flowToColor(flow, maxflow=None, maxmaxflow=None, saturate=False):
+    """
+    flow_utils.flowToColor(flow): return a color code flow field, normalized based on the maximum l2-norm of the flow
+    flow_utils.flowToColor(flow,maxflow): return a color code flow field, normalized by maxflow
+    ---- PARAMETERS ----
+        flow: flow to display of shape (height x width x 2)
+        maxflow (default:None): if given, normalize the flow by its value, otherwise by the flow norm
+        maxmaxflow (default:None): if given, normalize the flow by the max of its value and the flow norm
+    ---- OUTPUT ----
+        an np.array of shape (height x width x 3) of type uint8 containing a color code of the flow
+    """
+    h, w, n = flow.shape
+    # check size of flow
+    assert n == 2, "flow_utils.flowToColor(flow): flow must have 2 bands"
+    # fix unknown flow
+    unknown_idx = np.max(np.abs(flow), 2) > UNKNOWN_THRESH
+    flow[unknown_idx] = 0.0
+    # compute max flow if needed
+    if maxflow is None:
+        maxflow = flowMaxNorm(flow)
+    if maxmaxflow is not None:
+        maxflow = min(maxmaxflow, maxflow)
+    # normalize flow
+    eps = np.spacing(1)  # minimum positive float value to avoid division by 0
+    # compute the flow
+    img = _computeColor(flow / (maxflow + eps), saturate=saturate)
+    # put black pixels in unknown location
+    img[np.tile(unknown_idx[:, :, np.newaxis], [1, 1, 3])] = 0.0
+    return img
+
+
+def flowMaxNorm(flow):
+    """
+    flow_utils.flowMaxNorm(flow): return the maximum of the l2-norm of the given flow
+    ---- PARAMETERS ----
+        flow: the flow
+
+    ---- OUTPUT ----
+        a float containing the maximum of the l2-norm of the flow
+    """
+    return np.max(np.sqrt(np.sum(np.square(flow), 2)))
+
+
+def _computeColor(flow, saturate=True):
+    """
+    flow_utils._computeColor(flow): compute color codes for the flow field flow
+
+    NOTE: NaN entries of flow are set to 0 in place (the input array is modified).
+
+    ---- PARAMETERS ----
+        flow: np.array of dimension (height x width x 2) containing the flow to display
+        saturate (default:True): if False, the magnitude is clipped to 1 so that no pixel is handled as out of range
+    ---- OUTPUTS ----
+        an np.array of dimension (height x width x 3) containing the color conversion of the flow
+    """
+    # set nan to 0
+    nanidx = np.isnan(flow[:, :, 0])
+    flow[nanidx] = 0.0
+
+    # colorwheel: one RGB row per entry, built segment by segment; col is the
+    # index of the first row of the current segment
+    ncols = RY + YG + GC + CB + BM + MR
+    nchans = 3
+    colorwheel = np.zeros((ncols, nchans), "uint8")
+    col = 0
+    # RY
+    colorwheel[:RY, 0] = 255
+    colorwheel[:RY, 1] = [(255 * i) // RY for i in range(RY)]
+    col += RY
+    # YG
+    colorwheel[col : col + YG, 0] = [255 - (255 * i) // YG for i in range(YG)]
+    colorwheel[col : col + YG, 1] = 255
+    col += YG
+    # GC
+    colorwheel[col : col + GC, 1] = 255
+    colorwheel[col : col + GC, 2] = [(255 * i) // GC for i in range(GC)]
+    col += GC
+    # CB
+    colorwheel[col : col + CB, 1] = [255 - (255 * i) // CB for i in range(CB)]
+    colorwheel[col : col + CB, 2] = 255
+    col += CB
+    # BM
+    colorwheel[col : col + BM, 0] = [(255 * i) // BM for i in range(BM)]
+    colorwheel[col : col + BM, 2] = 255
+    col += BM
+    # MR
+    colorwheel[col : col + MR, 0] = 255
+    colorwheel[col : col + MR, 2] = [255 - (255 * i) // MR for i in range(MR)]
+
+    # compute utility variables
+    rad = np.sqrt(np.sum(np.square(flow), 2))  # magnitude
+    a = np.arctan2(-flow[:, :, 1], -flow[:, :, 0]) / np.pi  # angle
+    fk = (a + 1) / 2 * (ncols - 1)  # map [-1,1] to [0,ncols-1]
+    # the two neighbouring colorwheel entries and the interpolation weight
+    k0 = np.floor(fk).astype("int")
+    k1 = k0 + 1
+    k1[k1 == ncols] = 0  # wrap around the wheel
+    f = fk - k0
+
+    if not saturate:
+        rad = np.minimum(rad, 1)
+
+    # compute the image (from here on col holds the interpolated channel values)
+    img = np.zeros((flow.shape[0], flow.shape[1], nchans), "uint8")
+    for i in range(nchans):
+        tmp = colorwheel[:, i].astype("float")
+        col0 = tmp[k0] / 255
+        col1 = tmp[k1] / 255
+        col = (1 - f) * col0 + f * col1
+        idx = rad <= 1
+        col[idx] = 1 - rad[idx] * (1 - col[idx])  # increase saturation with radius
+        col[~idx] *= 0.75  # out of range
+        # pixels that were NaN in the input are written as 0
+        img[:, :, i] = (255 * col * (1 - nanidx.astype("float"))).astype("uint8")
+
+    return img
+
+
+# flow dataset getter
+
+
+def get_train_dataset_flow(dataset_str, augmentor=True, crop_size=None):
+    dataset_str = dataset_str.replace("(", "Dataset(")
+    if augmentor:
+        dataset_str = dataset_str.replace(")", ", augmentor=True)")
+    if crop_size is not None:
+        dataset_str = dataset_str.replace(
+            ")", ", crop_size={:s})".format(str(crop_size))
+        )
+    return eval(dataset_str)
+
+
+def get_test_datasets_flow(dataset_str):
+    dataset_str = dataset_str.replace("(", "Dataset(")
+    return [eval(s) for s in dataset_str.split("+")]
diff --git a/extern/CUT3R/src/croco/stereoflow/datasets_stereo.py b/extern/CUT3R/src/croco/stereoflow/datasets_stereo.py
new file mode 100644
index 0000000000000000000000000000000000000000..60c9466ad05164fb433551dd23acb3153e6e7ea6
--- /dev/null
+++ b/extern/CUT3R/src/croco/stereoflow/datasets_stereo.py
@@ -0,0 +1,991 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+# --------------------------------------------------------
+# Dataset structure for stereo
+# --------------------------------------------------------
+
+import sys, os
+import os.path as osp
+import pickle
+import numpy as np
+from PIL import Image
+import json
+import h5py
+from glob import glob
+import cv2
+
+import torch
+from torch.utils import data
+
+from .augmentor import StereoAugmentor
+
+
+# Root directory of each dataset, keyed by the dataset name
+# (looked up with self.name in StereoDataset._set_root).
+dataset_to_root = {
+    "CREStereo": "./data/stereoflow//crenet_stereo_trainset/stereo_trainset/crestereo/",
+    "SceneFlow": "./data/stereoflow//SceneFlow/",
+    "ETH3DLowRes": "./data/stereoflow/eth3d_lowres/",
+    "Booster": "./data/stereoflow/booster_gt/",
+    "Middlebury2021": "./data/stereoflow/middlebury/2021/data/",
+    "Middlebury2014": "./data/stereoflow/middlebury/2014/",
+    "Middlebury2006": "./data/stereoflow/middlebury/2006/",
+    "Middlebury2005": "./data/stereoflow/middlebury/2005/train/",
+    "MiddleburyEval3": "./data/stereoflow/middlebury/MiddEval3/",
+    "Spring": "./data/stereoflow/spring/",
+    "Kitti15": "./data/stereoflow/kitti-stereo-2015/",
+    "Kitti12": "./data/stereoflow/kitti-stereo-2012/",
+}
+# Directory holding one pickled pair-name cache per dataset
+# (read and written by StereoDataset._load_or_build_cache).
+cache_dir = "./data/stereoflow/datasets_stereo_cache/"
+
+
+# Per-channel mean / std used by img_to_tensor, shaped (3, 1, 1) so that they
+# broadcast over a channel-first image.
+# NOTE(review): presumably the ImageNet-1k statistics (going by the "in1k" prefix) - confirm.
+in1k_mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
+in1k_std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
+
+
+def img_to_tensor(img):
+    """Convert a channel-last numpy image into a normalized channel-first float tensor.
+
+    The values are divided by 255, then the per-channel ``in1k_mean`` is
+    subtracted and the result is divided by ``in1k_std``.
+    """
+    channel_first = torch.from_numpy(img).permute(2, 0, 1)
+    scaled = channel_first.float() / 255.0
+    return (scaled - in1k_mean) / in1k_std
+
+
+def disp_to_tensor(disp):
+    """Wrap a numpy disparity map in a tensor with a new leading dimension."""
+    tensor = torch.from_numpy(disp)
+    return tensor[None, :, :]
+
+
+class StereoDataset(data.Dataset):
+    """
+    Base class of the stereo datasets defined in this file.
+
+    A sample is a (left image, right image, left disparity, pair name) tuple.
+    Subclasses must implement:
+      - _prepare_data: sets self.name, self.root (via _set_root), the
+        pairname_to_* path builders and self.load_disparity
+      - _build_cache: returns a dict mapping each split name to its list of
+        pair names
+    """
+
+    def __init__(self, split, augmentor=False, crop_size=None, totensor=True):
+        """
+        split: name of the split to load (validated by each subclass)
+        augmentor: if truthy, every sample goes through StereoAugmentor(crop_size)
+        crop_size: crop size given to the augmentor; only allowed with augmentor
+        totensor: if True, __getitem__ converts images and disparity to tensors
+        """
+        self.split = split
+        if not augmentor:
+            assert crop_size is None
+        if crop_size:
+            assert augmentor
+        self.crop_size = crop_size
+        self.augmentor_str = augmentor
+        self.augmentor = StereoAugmentor(crop_size) if augmentor else None
+        self.totensor = totensor
+        self.rmul = 1  # keep track of rmul
+        self.has_constant_resolution = True  # whether the dataset has constant resolution or not (=> don't use batch_size>1 at test time)
+        self._prepare_data()
+        self._load_or_build_cache()
+
+    def _prepare_data(self):
+        """
+        to be defined for each dataset
+        """
+        raise NotImplementedError
+
+    def prepare_data(self):
+        """
+        former name of the _prepare_data stub, kept for backward compatibility
+        """
+        return self._prepare_data()
+
+    def _build_cache(self):
+        """
+        to be defined for each dataset: return a dict {split: list of pairnames}
+        """
+        raise NotImplementedError
+
+    def __len__(self):
+        return len(self.pairnames)
+
+    def __getitem__(self, index):
+        """
+        Return (Limg, Rimg, disp, pairname as str) for the pair at `index`.
+        disp is None (or an empty tensor when totensor is set) for pairs
+        without a disparity file.
+        """
+        pairname = self.pairnames[index]
+
+        # get filenames
+        Limgname = self.pairname_to_Limgname(pairname)
+        Rimgname = self.pairname_to_Rimgname(pairname)
+        # the disparity path builder may be None, or may itself return None
+        Ldispname = (
+            self.pairname_to_Ldispname(pairname)
+            if self.pairname_to_Ldispname is not None
+            else None
+        )
+
+        # load images and disparities
+        Limg = _read_img(Limgname)
+        Rimg = _read_img(Rimgname)
+        disp = self.load_disparity(Ldispname) if Ldispname is not None else None
+
+        # sanity check
+        if disp is not None:
+            assert np.all(disp > 0) or self.name == "Spring", (
+                self.name,
+                pairname,
+                Ldispname,
+            )
+
+        # apply augmentations
+        if self.augmentor is not None:
+            Limg, Rimg, disp = self.augmentor(Limg, Rimg, disp, self.name)
+
+        if self.totensor:
+            Limg = img_to_tensor(Limg)
+            Rimg = img_to_tensor(Rimg)
+            if disp is None:
+                disp = torch.tensor(
+                    []
+                )  # to allow dataloader batching with default collate_fn
+            else:
+                disp = disp_to_tensor(disp)
+
+        return Limg, Rimg, disp, str(pairname)
+
+    def __rmul__(self, v):
+        """
+        `v * dataset` repeats the list of pairs v times, in place, and returns self
+        """
+        self.rmul *= v
+        self.pairnames = v * self.pairnames
+        return self
+
+    def __str__(self):
+        return f"{self.__class__.__name__}_{self.split}"
+
+    def __repr__(self):
+        s = f"{self.__class__.__name__}(split={self.split}, augmentor={self.augmentor_str}, crop_size={str(self.crop_size)}, totensor={self.totensor})"
+        if self.rmul == 1:
+            s += f"\n\tnum pairs: {len(self.pairnames)}"
+        else:
+            s += f"\n\tnum pairs: {len(self.pairnames)} ({len(self.pairnames)//self.rmul}x{self.rmul})"
+        return s
+
+    def _set_root(self):
+        """
+        set self.root from dataset_to_root[self.name] and check that it exists
+        """
+        self.root = dataset_to_root[self.name]
+        assert os.path.isdir(
+            self.root
+        ), f"could not find root directory for dataset {self.name}: {self.root}"
+
+    def _load_or_build_cache(self):
+        """
+        set self.pairnames for self.split, from the pickled cache of the dataset
+        if it exists, otherwise from _build_cache (whose result is then cached)
+        """
+        cache_file = osp.join(cache_dir, self.name + ".pkl")
+        if osp.isfile(cache_file):
+            with open(cache_file, "rb") as fid:
+                self.pairnames = pickle.load(fid)[self.split]
+        else:
+            tosave = self._build_cache()
+            os.makedirs(cache_dir, exist_ok=True)
+            with open(cache_file, "wb") as fid:
+                pickle.dump(tosave, fid)
+            self.pairnames = tosave[self.split]
+
+
+class CREStereoDataset(StereoDataset):
+    """CREStereo dataset; only the "train" split is accepted."""
+
+    def _prepare_data(self):
+        """Set the dataset name and root, the path builders and the disparity reader."""
+        self.name = "CREStereo"
+        self._set_root()
+        assert self.split in ["train"]
+
+        def _suffixed(suffix):
+            # file under the root sharing the pair's prefix and ending with `suffix`
+            return lambda pairname: osp.join(self.root, pairname + suffix)
+
+        self.pairname_to_Limgname = _suffixed("_left.jpg")
+        self.pairname_to_Rimgname = _suffixed("_right.jpg")
+        self.pairname_to_Ldispname = _suffixed("_left.disp.png")
+        self.pairname_to_str = lambda pairname: pairname
+        self.load_disparity = _read_crestereo_disp
+
+    def _build_cache(self):
+        """List every pair as "<subdir>/<prefix>" from the *_left.jpg files under the root."""
+        left_suffix = "_left.jpg"
+        allpairs = []
+        for subdir in sorted(os.listdir(self.root)):
+            for fname in sorted(os.listdir(self.root + "/" + subdir)):
+                if fname.endswith(left_suffix):
+                    allpairs.append(subdir + "/" + fname[: -len(left_suffix)])
+        assert len(allpairs) == 200000, "incorrect parsing of pairs in CreStereo"
+        return {"train": allpairs}
+
+
+class SceneFlowDataset(StereoDataset):
+    """SceneFlow dataset, with finalpass / cleanpass / allpass train and test splits."""
+
+    def _prepare_data(self):
+        """Set the dataset name and root, the path builders and the disparity reader."""
+        self.name = "SceneFlow"
+        self._set_root()
+        assert self.split in [
+            "train_finalpass",
+            "train_cleanpass",
+            "train_allpass",
+            "test_finalpass",
+            "test_cleanpass",
+            "test_allpass",
+            "test1of100_cleanpass",
+            "test1of100_finalpass",
+        ]
+
+        def _left(pairname):
+            return osp.join(self.root, pairname)
+
+        def _right(pairname):
+            return osp.join(self.root, pairname).replace("/left/", "/right/")
+
+        def _left_disp(pairname):
+            # both passes share the same "disparity" folder; swap the extension for .pfm
+            path = osp.join(self.root, pairname)
+            for pass_dir in ("/frames_finalpass/", "/frames_cleanpass/"):
+                path = path.replace(pass_dir, "/disparity/")
+            return path[:-4] + ".pfm"
+
+        self.pairname_to_Limgname = _left
+        self.pairname_to_Rimgname = _right
+        self.pairname_to_Ldispname = _left_disp
+        self.pairname_to_str = lambda pairname: pairname[:-4]
+        self.load_disparity = _read_sceneflow_disp
+
+    def _build_cache(self):
+        """Glob the finalpass left images and derive all the splits from them."""
+        root_len = len(self.root)
+
+        def _relative_glob(pattern):
+            # sorted matches of `pattern` under the root, with the root prefix stripped
+            return [path[root_len:] for path in sorted(glob(self.root + pattern))]
+
+        def _as_cleanpass(pairs):
+            return [p.replace("frames_finalpass", "frames_cleanpass") for p in pairs]
+
+        trainpairs = []
+        # driving
+        driving = _relative_glob("Driving/frames_finalpass/*/*/*/left/*.png")
+        assert len(driving) == 4400, "incorrect parsing of pairs in SceneFlow"
+        trainpairs += driving
+        # monkaa
+        monkaa = _relative_glob("Monkaa/frames_finalpass/*/left/*.png")
+        assert len(monkaa) == 8664, "incorrect parsing of pairs in SceneFlow"
+        trainpairs += monkaa
+        # flyingthings
+        flying = _relative_glob("FlyingThings/frames_finalpass/TRAIN/*/*/left/*.png")
+        assert len(flying) == 22390, "incorrect parsing of pairs in SceneFlow"
+        trainpairs += flying
+        assert len(trainpairs) == 35454, "incorrect parsing of pairs in SceneFlow"
+        testpairs = _relative_glob("FlyingThings/frames_finalpass/TEST/*/*/left/*.png")
+        assert len(testpairs) == 4370, "incorrect parsing of pairs in SceneFlow"
+        # every 100th test pair
+        test1of100pairs = testpairs[::100]
+        assert len(test1of100pairs) == 44, "incorrect parsing of pairs in SceneFlow"
+        # all
+        tosave = {}
+        tosave["train_finalpass"] = trainpairs
+        tosave["train_cleanpass"] = _as_cleanpass(trainpairs)
+        tosave["test_finalpass"] = testpairs
+        tosave["test_cleanpass"] = _as_cleanpass(testpairs)
+        tosave["test1of100_finalpass"] = test1of100pairs
+        tosave["test1of100_cleanpass"] = _as_cleanpass(test1of100pairs)
+        tosave["train_allpass"] = tosave["train_finalpass"] + tosave["train_cleanpass"]
+        tosave["test_allpass"] = tosave["test_finalpass"] + tosave["test_cleanpass"]
+        return tosave
+
+
+class Md21Dataset(StereoDataset):
+    """Middlebury 2021 dataset (splits: train, subtrain, subval)."""
+
+    def _prepare_data(self):
+        """Set the dataset name and root, the path builders and the disparity reader."""
+        self.name = "Middlebury2021"
+        self._set_root()
+        assert self.split in ["train", "subtrain", "subval"]
+
+        def _left(pairname):
+            return osp.join(self.root, pairname)
+
+        def _right(pairname):
+            return osp.join(self.root, pairname.replace("/im0", "/im1"))
+
+        def _left_disp(pairname):
+            # one disparity file per sequence (first component of the pair name)
+            sequence = pairname.split("/")[0]
+            return osp.join(self.root, sequence, "disp0.pfm")
+
+        self.pairname_to_Limgname = _left
+        self.pairname_to_Rimgname = _right
+        self.pairname_to_Ldispname = _left_disp
+        self.pairname_to_str = lambda pairname: pairname[:-4]
+        self.load_disparity = _read_middlebury_disp
+
+    def _build_cache(self):
+        """List the im0* images of every ambient lighting; the last two sequences form subval."""
+        seqs = sorted(os.listdir(self.root))
+        trainpairs = []
+        for seq in seqs:
+            # <seq>/im0.png itself is left out on purpose: it is included as such
+            # in other lightings
+            ambient_dir = osp.join(self.root, seq, "ambient")
+            for lighting in sorted(os.listdir(ambient_dir)):
+                for fname in sorted(os.listdir(osp.join(ambient_dir, lighting))):
+                    if fname.startswith("im0"):
+                        trainpairs.append(seq + "/ambient/" + lighting + "/" + fname)
+        assert len(trainpairs) == 355
+
+        def _belongs_to(pair, sequences):
+            return any(pair.startswith(name + "/") for name in sequences)
+
+        subtrainpairs = [p for p in trainpairs if _belongs_to(p, seqs[:-2])]
+        subvalpairs = [p for p in trainpairs if _belongs_to(p, seqs[-2:])]
+        assert (
+            len(subtrainpairs) == 335 and len(subvalpairs) == 20
+        ), "incorrect parsing of pairs in Middlebury 2021"
+        return {"train": trainpairs, "subtrain": subtrainpairs, "subval": subvalpairs}
+
+
+class Md14Dataset(StereoDataset):
+    """Middlebury 2014 dataset (splits: train, subtrain, subval).
+
+    A pair name is the path of the right image; the left image and the
+    disparity are the im0.png and disp0.pfm files of the same directory.
+    """
+
+    def _prepare_data(self):
+        """Set the dataset name and root, the path builders and the disparity reader."""
+        self.name = "Middlebury2014"
+        self._set_root()
+        assert self.split in ["train", "subtrain", "subval"]
+
+        def _sibling(filename):
+            # `filename` located in the directory of the pair
+            return lambda pairname: osp.join(
+                self.root, osp.dirname(pairname), filename
+            )
+
+        self.pairname_to_Limgname = _sibling("im0.png")
+        self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname)
+        self.pairname_to_Ldispname = _sibling("disp0.pfm")
+        self.pairname_to_str = lambda pairname: pairname[:-4]
+        self.load_disparity = _read_middlebury_disp
+        self.has_constant_resolution = False
+
+    def _build_cache(self):
+        """Three right images (im1, im1E, im1L) per sequence; two fixed sequences form subval."""
+        seqs = sorted(os.listdir(self.root))
+        trainpairs = [
+            seq + "/" + right
+            for seq in seqs
+            for right in ("im1.png", "im1E.png", "im1L.png")
+        ]
+        assert len(trainpairs) == 138
+        valseqs = ["Umbrella-imperfect", "Vintage-perfect"]
+        assert all(s in seqs for s in valseqs)
+
+        def _is_val(pair):
+            return any(pair.startswith(name + "/") for name in valseqs)
+
+        subtrainpairs = [p for p in trainpairs if not _is_val(p)]
+        subvalpairs = [p for p in trainpairs if _is_val(p)]
+        assert (
+            len(subtrainpairs) == 132 and len(subvalpairs) == 6
+        ), "incorrect parsing of pairs in Middlebury 2014"
+        return {"train": trainpairs, "subtrain": subtrainpairs, "subval": subvalpairs}
+
+
+class Md06Dataset(StereoDataset):
+    """Middlebury 2006 dataset (splits: train, subtrain, subval).
+
+    A pair name is the path of the left image (view1.png); the right image is
+    the view5.png of the same directory and the disparity is the disp1.png of
+    the sequence.
+    """
+
+    def _prepare_data(self):
+        """Set the dataset name and root, the path builders and the disparity reader."""
+        self.name = "Middlebury2006"
+        self._set_root()
+        assert self.split in ["train", "subtrain", "subval"]
+        self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname)
+        self.pairname_to_Rimgname = lambda pairname: osp.join(
+            self.root, osp.dirname(pairname), "view5.png"
+        )
+        self.pairname_to_Ldispname = lambda pairname: osp.join(
+            self.root, pairname.split("/")[0], "disp1.png"
+        )
+        # same naming as Md05Dataset, which shares this directory layout
+        self.pairname_to_str = lambda pairname: pairname[:-4]
+        self.load_disparity = _read_middlebury20052006_disp
+        self.has_constant_resolution = False
+
+    def _build_cache(self):
+        """One pair per (sequence, illumination, exposure); two fixed sequences form subval."""
+        seqs = sorted(os.listdir(self.root))
+        trainpairs = []
+        for s in seqs:
+            for i in ["Illum1", "Illum2", "Illum3"]:
+                for e in ["Exp0", "Exp1", "Exp2"]:
+                    trainpairs.append(osp.join(s, i, e, "view1.png"))
+        assert len(trainpairs) == 189
+        valseqs = ["Rocks1", "Wood2"]
+        assert all(s in seqs for s in valseqs)
+        subtrainpairs = [
+            p for p in trainpairs if not any(p.startswith(s + "/") for s in valseqs)
+        ]
+        subvalpairs = [
+            p for p in trainpairs if any(p.startswith(s + "/") for s in valseqs)
+        ]
+        assert (
+            len(subtrainpairs) == 171 and len(subvalpairs) == 18
+        ), "incorrect parsing of pairs in Middlebury 2006"
+        tosave = {"train": trainpairs, "subtrain": subtrainpairs, "subval": subvalpairs}
+        return tosave
+
+
+class Md05Dataset(StereoDataset):
+    """Middlebury 2005 dataset (splits: train, subtrain, subval).
+
+    A pair name is the path of the left image (view1.png); the right image is
+    the view5.png of the same directory and the disparity is the disp1.png of
+    the sequence.
+    """
+
+    def _prepare_data(self):
+        """Set the dataset name and root, the path builders and the disparity reader."""
+        self.name = "Middlebury2005"
+        self._set_root()
+        assert self.split in ["train", "subtrain", "subval"]
+
+        def _left(pairname):
+            return osp.join(self.root, pairname)
+
+        def _right(pairname):
+            return osp.join(self.root, osp.dirname(pairname), "view5.png")
+
+        def _left_disp(pairname):
+            sequence = pairname.split("/")[0]
+            return osp.join(self.root, sequence, "disp1.png")
+
+        self.pairname_to_Limgname = _left
+        self.pairname_to_Rimgname = _right
+        self.pairname_to_Ldispname = _left_disp
+        self.pairname_to_str = lambda pairname: pairname[:-4]
+        self.load_disparity = _read_middlebury20052006_disp
+
+    def _build_cache(self):
+        """One pair per (sequence, illumination, exposure); "Reindeer" forms subval."""
+        seqs = sorted(os.listdir(self.root))
+        illuminations = ["Illum1", "Illum2", "Illum3"]
+        exposures = ["Exp0", "Exp1", "Exp2"]
+        trainpairs = [
+            osp.join(seq, illum, expo, "view1.png")
+            for seq in seqs
+            for illum in illuminations
+            for expo in exposures
+        ]
+        assert len(trainpairs) == 54, "incorrect parsing of pairs in Middlebury 2005"
+        valseqs = ["Reindeer"]
+        assert all(s in seqs for s in valseqs)
+
+        def _is_val(pair):
+            return any(pair.startswith(name + "/") for name in valseqs)
+
+        subtrainpairs = [p for p in trainpairs if not _is_val(p)]
+        subvalpairs = [p for p in trainpairs if _is_val(p)]
+        assert (
+            len(subtrainpairs) == 45 and len(subvalpairs) == 9
+        ), "incorrect parsing of pairs in Middlebury 2005"
+        return {"train": trainpairs, "subtrain": subtrainpairs, "subval": subvalpairs}
+
+
+class MdEval3Dataset(StereoDataset):
+    """Middlebury Eval v3 dataset.
+
+    Split names are "<subset>_<resolution>", with subset in train / subtrain /
+    subval / test / all and resolution in full / half / quarter.
+    """
+
+    def _prepare_data(self):
+        """Set the dataset name and root, the path builders and the submission settings."""
+        self.name = "MiddleburyEval3"
+        self._set_root()
+        subsets = ["train", "subtrain", "subval", "test", "all"]
+        resolutions = ["full", "half", "quarter"]
+        assert self.split in [s + "_" + r for s in subsets for r in resolutions]
+        # the full and half resolutions live in sibling folders of the default root
+        if self.split.endswith("_full"):
+            self.root = self.root.replace("/MiddEval3", "/MiddEval3_F")
+            resolution_letter = "F"
+        elif self.split.endswith("_half"):
+            self.root = self.root.replace("/MiddEval3", "/MiddEval3_H")
+            resolution_letter = "H"
+        else:
+            assert self.split.endswith("_quarter")
+            resolution_letter = "Q"
+
+        def _in_pair_dir(filename):
+            return lambda pairname: osp.join(self.root, pairname, filename)
+
+        def _left_disp(pairname):
+            # no ground truth for the test pairs
+            if pairname.startswith("test"):
+                return None
+            return osp.join(self.root, pairname, "disp0GT.pfm")
+
+        self.pairname_to_Limgname = _in_pair_dir("im0.png")
+        self.pairname_to_Rimgname = _in_pair_dir("im1.png")
+        self.pairname_to_Ldispname = _left_disp
+        self.pairname_to_str = lambda pairname: pairname
+        self.load_disparity = _read_middlebury_disp
+        # for submission only
+        self.submission_methodname = "CroCo-Stereo"
+        self.submission_sresolution = resolution_letter
+
+    def _build_cache(self):
+        """List the train and test pairs; the same lists are stored for every resolution."""
+        trainpairs = ["train/" + s for s in sorted(os.listdir(self.root + "train/"))]
+        testpairs = ["test/" + s for s in sorted(os.listdir(self.root + "test/"))]
+        # the last training pair is held out for validation
+        subtrainpairs, subvalpairs = trainpairs[:-1], trainpairs[-1:]
+        allpairs = trainpairs + testpairs
+        assert (
+            len(trainpairs) == 15
+            and len(testpairs) == 15
+            and len(subvalpairs) == 1
+            and len(subtrainpairs) == 14
+            and len(allpairs) == 30
+        ), "incorrect parsing of pairs in Middlebury Eval v3"
+        tosave = {}
+        for resolution in ["full", "half", "quarter"]:
+            tosave["train_" + resolution] = trainpairs
+            tosave["subtrain_" + resolution] = subtrainpairs
+            tosave["subval_" + resolution] = subvalpairs
+            tosave["test_" + resolution] = testpairs
+            tosave["all_" + resolution] = allpairs
+        return tosave
+
+    def submission_save_pairname(self, pairname, prediction, outdir, time):
+        """Write the prediction as a .pfm file and the runtime as a text file next to it."""
+        assert prediction.ndim == 2
+        assert prediction.dtype == np.float32
+        parts = pairname.split("/")
+        subset_dir = parts[0].replace("train", "training") + self.submission_sresolution
+        pair_dir = os.path.join(outdir, subset_dir, parts[1])
+        outfile = os.path.join(pair_dir, "disp0" + self.submission_methodname + ".pfm")
+        os.makedirs(os.path.dirname(outfile), exist_ok=True)
+        writePFM(outfile, prediction)
+        timefile = os.path.join(
+            os.path.dirname(outfile), "time" + self.submission_methodname + ".txt"
+        )
+        with open(timefile, "w") as fid:
+            fid.write(str(time))
+
+    def finalize_submission(self, outdir):
+        """Zip the content of `outdir` into "<method name>.zip" with a shell command."""
+        cmd = f'cd {outdir}/; zip -r "{self.submission_methodname}.zip" .'
+        print(cmd)
+        os.system(cmd)
+        print(f"Done. Submission file at {outdir}/{self.submission_methodname}.zip")
+
+
+class ETH3DLowResDataset(StereoDataset):
+    """ETH3D low-res two-view dataset (splits: train, test, subtrain, subval, all)."""
+
+    def _prepare_data(self):
+        """Set the dataset name and root, the path builders and the disparity reader."""
+        self.name = "ETH3DLowRes"
+        self._set_root()
+        assert self.split in ["train", "test", "subtrain", "subval", "all"]
+
+        def _in_pair_dir(filename):
+            return lambda pairname: osp.join(self.root, pairname, filename)
+
+        def _left_disp(pairname):
+            # test pairs (present in the "all" split) have no ground truth
+            if pairname.startswith("test/"):
+                return None
+            gt_dir = pairname.replace("train/", "train_gt/")
+            return osp.join(self.root, gt_dir, "disp0GT.pfm")
+
+        self.pairname_to_Limgname = _in_pair_dir("im0.png")
+        self.pairname_to_Rimgname = _in_pair_dir("im1.png")
+        if self.split == "test":
+            self.pairname_to_Ldispname = None
+        else:
+            self.pairname_to_Ldispname = _left_disp
+        self.pairname_to_str = lambda pairname: pairname
+        self.load_disparity = _read_eth3d_disp
+        self.has_constant_resolution = False
+
+    def _build_cache(self):
+        """List the train and test pairs; three fixed training pairs form subval."""
+        trainpairs = ["train/" + s for s in sorted(os.listdir(self.root + "train/"))]
+        testpairs = ["test/" + s for s in sorted(os.listdir(self.root + "test/"))]
+        assert (
+            len(trainpairs) == 27 and len(testpairs) == 20
+        ), "incorrect parsing of pairs in ETH3D Low Res"
+        subvalpairs = [
+            "train/delivery_area_3s",
+            "train/electro_3l",
+            "train/playground_3l",
+        ]
+        assert all(p in trainpairs for p in subvalpairs)
+        subtrainpairs = [p for p in trainpairs if p not in subvalpairs]
+        assert (
+            len(subvalpairs) == 3 and len(subtrainpairs) == 24
+        ), "incorrect parsing of pairs in ETH3D Low Res"
+        tosave = {}
+        tosave["train"] = trainpairs
+        tosave["test"] = testpairs
+        tosave["subtrain"] = subtrainpairs
+        tosave["subval"] = subvalpairs
+        tosave["all"] = trainpairs + testpairs
+        return tosave
+
+    def submission_save_pairname(self, pairname, prediction, outdir, time):
+        """Write the prediction as a .pfm file and the runtime as a .txt file of the same name."""
+        assert prediction.ndim == 2
+        assert prediction.dtype == np.float32
+        scene = pairname.split("/")[1]
+        outfile = os.path.join(outdir, "low_res_two_view", scene + ".pfm")
+        os.makedirs(os.path.dirname(outfile), exist_ok=True)
+        writePFM(outfile, prediction)
+        timefile = outfile[:-4] + ".txt"
+        with open(timefile, "w") as fid:
+            fid.write("runtime " + str(time))
+
+    def finalize_submission(self, outdir):
+        """Zip the low_res_two_view folder of `outdir` with a shell command."""
+        cmd = f'cd {outdir}/; zip -r "eth3d_low_res_two_view_results.zip" low_res_two_view'
+        print(cmd)
+        os.system(cmd)
+        print(f"Done. Submission file at {outdir}/eth3d_low_res_two_view_results.zip")
+
+
+class BoosterDataset(StereoDataset):
+    """Booster dataset, balanced version only (splits: train / test / subtrain / subval, "_balanced")."""
+
+    def _prepare_data(self):
+        """Set the dataset name and root, the path builders and the disparity reader."""
+        self.name = "Booster"
+        self._set_root()
+        assert self.split in [
+            "train_balanced",
+            "test_balanced",
+            "subtrain_balanced",
+            "subval_balanced",
+        ]  # we use only the balanced version
+
+        def _left(pairname):
+            return osp.join(self.root, pairname)
+
+        def _right(pairname):
+            return osp.join(self.root, pairname).replace("/camera_00/", "/camera_02/")
+
+        def _left_disp(pairname):
+            # same images with different colors, same gt per sequence
+            return osp.join(self.root, osp.dirname(pairname), "../disp_00.npy")
+
+        def _as_str(pairname):
+            return pairname[:-4].replace("/camera_00/", "/")
+
+        self.pairname_to_Limgname = _left
+        self.pairname_to_Rimgname = _right
+        self.pairname_to_Ldispname = _left_disp
+        self.pairname_to_str = _as_str
+        self.load_disparity = _read_booster_disp
+
+    def _build_cache(self):
+        """List the camera_00 images of every sequence; the last two training sequences form subval."""
+
+        def _camera00_pairs(subset, sequences):
+            pairs = []
+            for seq in sequences:
+                camera_dir = self.root + subset + "/balanced/" + seq + "/camera_00/"
+                for imname in sorted(os.listdir(camera_dir)):
+                    pairs.append(subset + "/balanced/" + seq + "/camera_00/" + imname)
+            return pairs
+
+        trainseqs = sorted(os.listdir(self.root + "train/balanced"))
+        trainpairs = _camera00_pairs("train", trainseqs)
+        testpairs = _camera00_pairs(
+            "test", sorted(os.listdir(self.root + "test/balanced"))
+        )
+        assert len(trainpairs) == 228 and len(testpairs) == 191
+        # sequences are matched by substring of the pair name
+        subtrainpairs = [p for p in trainpairs if any(s in p for s in trainseqs[:-2])]
+        subvalpairs = [p for p in trainpairs if any(s in p for s in trainseqs[-2:])]
+        # warning: if we do validation split, we should split scenes!!!
+        tosave = {}
+        tosave["train_balanced"] = trainpairs
+        tosave["test_balanced"] = testpairs
+        tosave["subtrain_balanced"] = subtrainpairs
+        tosave["subval_balanced"] = subvalpairs
+        return tosave
+
+
+class SpringDataset(StereoDataset):
+    """Spring dataset (splits: train, test, subtrain, subval).
+
+    Pair names are image paths without extension; the other image of the pair
+    is found by swapping "frame_left" and "frame_right" in the path.
+    """
+
+    def _prepare_data(self):
+        """Set the dataset name and root, the path builders and the disparity reader."""
+        self.name = "Spring"
+        self._set_root()
+        assert self.split in ["train", "test", "subtrain", "subval"]
+        self.pairname_to_Limgname = lambda pairname: osp.join(
+            self.root, pairname + ".png"
+        )
+        # swap frame_left <-> frame_right, going through a temporary placeholder
+        self.pairname_to_Rimgname = (
+            lambda pairname: osp.join(self.root, pairname + ".png")
+            .replace("frame_right", "<frame_right>")
+            .replace("frame_left", "frame_right")
+            .replace("<frame_right>", "frame_left")
+        )
+        # no ground truth for the test pairs
+        self.pairname_to_Ldispname = lambda pairname: (
+            None
+            if pairname.startswith("test")
+            else osp.join(self.root, pairname + ".dsp5")
+            .replace("frame_left", "disp1_left")
+            .replace("frame_right", "disp1_right")
+        )
+        self.pairname_to_str = lambda pairname: pairname
+        self.load_disparity = _read_hdf5_disp
+
+    def _build_cache(self):
+        """List the frame_left images of train and test; test also gets the frame_right variants."""
+        trainseqs = sorted(os.listdir(osp.join(self.root, "train")))
+        trainpairs = [
+            osp.join("train", s, "frame_left", f[:-4])
+            for s in trainseqs
+            for f in sorted(os.listdir(osp.join(self.root, "train", s, "frame_left")))
+        ]
+        testseqs = sorted(os.listdir(osp.join(self.root, "test")))
+        testpairs = [
+            osp.join("test", s, "frame_left", f[:-4])
+            for s in testseqs
+            for f in sorted(os.listdir(osp.join(self.root, "test", s, "frame_left")))
+        ]
+        testpairs += [p.replace("frame_left", "frame_right") for p in testpairs]
+        # maxnorm = {'0001': 32.88, '0002': 228.5, '0004': 298.2, '0005': 142.5,
+        #   '0006': 113.6, '0007': 27.3, '0008': 554.5, '0009': 155.6, '0010': 126.1,
+        #   '0011': 87.6, '0012': 303.2, '0013': 24.14, '0014': 82.56, '0015': 98.44,
+        #   '0016': 156.9, '0017': 28.17, '0018': 21.03, '0020': 178.0, '0021': 58.06,
+        #   '0022': 354.2, '0023': 8.79, '0024': 97.06, '0025': 55.16, '0026': 91.9,
+        #   '0027': 156.6, '0030': 200.4, '0032': 58.66, '0033': 373.5, '0036': 149.4,
+        #   '0037': 5.625, '0038': 37.0, '0039': 12.2, '0041': 453.5, '0043': 457.0,
+        #   '0044': 379.5, '0045': 161.8, '0047': 105.44}
+        # => let's use 0041 as the validation sequence
+        subtrainpairs = [p for p in trainpairs if p.split("/")[1] != "0041"]
+        subvalpairs = [p for p in trainpairs if p.split("/")[1] == "0041"]
+        assert (
+            len(trainpairs) == 5000
+            and len(testpairs) == 2000
+            and len(subtrainpairs) == 4904
+            and len(subvalpairs) == 96
+        ), "incorrect parsing of pairs in Spring"
+        tosave = {
+            "train": trainpairs,
+            "test": testpairs,
+            "subtrain": subtrainpairs,
+            "subval": subvalpairs,
+        }
+        return tosave
+
+    def submission_save_pairname(self, pairname, prediction, outdir, time):
+        """Write the prediction as a .dsp5 file under `outdir`, mirroring the disparity layout."""
+        assert prediction.ndim == 2
+        assert prediction.dtype == np.float32
+        outfile = (
+            os.path.join(outdir, pairname + ".dsp5")
+            .replace("frame_left", "disp1_left")
+            .replace("frame_right", "disp1_right")
+        )
+        os.makedirs(os.path.dirname(outfile), exist_ok=True)
+        writeDsp5File(prediction, outfile)
+
+    def finalize_submission(self, outdir):
+        """Run the disp1_subsampling executable of the dataset root on "<outdir>/test", if present."""
+        assert self.split == "test"
+        # absolute path, since the command changes directory before running it
+        exe = osp.abspath(osp.join(self.root, "disp1_subsampling"))
+        if os.path.isfile(exe):
+            cmd = f'cd "{outdir}/test"; "{exe}" .'
+            print(cmd)
+            os.system(cmd)
+        else:
+            print("Could not find disp1_subsampling executable for submission.")
+            print("Please download it and run:")
+            print(f'cd "{outdir}/test"; <disp1_subsampling_exe> .')
+
+
+class Kitti12Dataset(StereoDataset):
+    """KITTI 2012 stereo dataset (splits: train, test)."""
+
+    def _prepare_data(self):
+        """Set the dataset name and root, the path builders and the disparity reader."""
+        self.name = "Kitti12"
+        self._set_root()
+        assert self.split in ["train", "test"]
+
+        def _frame_in(folder):
+            # "_10.png" frame of the pair, with the colored_0 folder swapped for `folder`
+            return lambda pairname: osp.join(
+                self.root, pairname.replace("/colored_0/", folder) + "_10.png"
+            )
+
+        self.pairname_to_Limgname = lambda pairname: osp.join(
+            self.root, pairname + "_10.png"
+        )
+        self.pairname_to_Rimgname = _frame_in("/colored_1/")
+        # no ground truth for the test split
+        if self.split == "test":
+            self.pairname_to_Ldispname = None
+        else:
+            self.pairname_to_Ldispname = _frame_in("/disp_occ/")
+        self.pairname_to_str = lambda pairname: pairname.replace("/colored_0/", "/")
+        self.load_disparity = _read_kitti_disp
+
+    def _build_cache(self):
+        """Pair names are generated from fixed index ranges, without listing the disk."""
+        trainseqs = [f"training/colored_0/{i:06d}" for i in range(194)]
+        testseqs = [f"testing/colored_0/{i:06d}" for i in range(195)]
+        assert (
+            len(trainseqs) == 194 and len(testseqs) == 195
+        ), "incorrect parsing of pairs in Kitti12"
+        return {"train": trainseqs, "test": testseqs}
+
+    def submission_save_pairname(self, pairname, prediction, outdir, time):
+        """Save the prediction, multiplied by 256, as a uint16 png directly under `outdir`."""
+        assert prediction.ndim == 2
+        assert prediction.dtype == np.float32
+        index = pairname.split("/")[-1]
+        outfile = os.path.join(outdir, index + "_10.png")
+        os.makedirs(os.path.dirname(outfile), exist_ok=True)
+        encoded = (prediction * 256).astype("uint16")
+        Image.fromarray(encoded).save(outfile)
+
+    def finalize_submission(self, outdir):
+        """Zip the content of `outdir` with a shell command (test split only)."""
+        assert self.split == "test"
+        cmd = f'cd {outdir}/; zip -r "kitti12_results.zip" .'
+        print(cmd)
+        os.system(cmd)
+        print(f"Done. Submission file at {outdir}/kitti12_results.zip")
+
+
+class Kitti15Dataset(StereoDataset):
+    """KITTI 2015 stereo dataset (splits: train, subtrain, subval, test)."""
+
+    def _prepare_data(self):
+        """Set the dataset name and root, the path builders and the disparity reader."""
+        self.name = "Kitti15"
+        self._set_root()
+        assert self.split in ["train", "subtrain", "subval", "test"]
+
+        def _frame_in(folder):
+            # "_10.png" frame of the pair, with the image_2 folder swapped for `folder`
+            return lambda pairname: osp.join(
+                self.root, pairname.replace("/image_2/", folder) + "_10.png"
+            )
+
+        self.pairname_to_Limgname = lambda pairname: osp.join(
+            self.root, pairname + "_10.png"
+        )
+        self.pairname_to_Rimgname = _frame_in("/image_3/")
+        # no ground truth for the test split
+        if self.split == "test":
+            self.pairname_to_Ldispname = None
+        else:
+            self.pairname_to_Ldispname = _frame_in("/disp_occ_0/")
+        self.pairname_to_str = lambda pairname: pairname.replace("/image_2/", "/")
+        self.load_disparity = _read_kitti_disp
+
+    def _build_cache(self):
+        """Pair names are generated from fixed index ranges; the last 5 training pairs form subval."""
+        trainseqs = [f"training/image_2/{i:06d}" for i in range(200)]
+        testseqs = [f"testing/image_2/{i:06d}" for i in range(200)]
+        subtrainseqs, subvalseqs = trainseqs[:-5], trainseqs[-5:]
+        assert (
+            len(trainseqs) == 200
+            and len(subtrainseqs) == 195
+            and len(subvalseqs) == 5
+            and len(testseqs) == 200
+        ), "incorrect parsing of pairs in Kitti15"
+        tosave = {}
+        tosave["train"] = trainseqs
+        tosave["subtrain"] = subtrainseqs
+        tosave["subval"] = subvalseqs
+        tosave["test"] = testseqs
+        return tosave
+
+    def submission_save_pairname(self, pairname, prediction, outdir, time):
+        """Save the prediction, multiplied by 256, as a uint16 png under "<outdir>/disp_0"."""
+        assert prediction.ndim == 2
+        assert prediction.dtype == np.float32
+        index = pairname.split("/")[-1]
+        outfile = os.path.join(outdir, "disp_0", index + "_10.png")
+        os.makedirs(os.path.dirname(outfile), exist_ok=True)
+        encoded = (prediction * 256).astype("uint16")
+        Image.fromarray(encoded).save(outfile)
+
+    def finalize_submission(self, outdir):
+        """Zip the disp_0 folder of `outdir` with a shell command (test split only)."""
+        assert self.split == "test"
+        cmd = f'cd {outdir}/; zip -r "kitti15_results.zip" disp_0'
+        print(cmd)
+        os.system(cmd)
+        print(f"Done. Submission file at {outdir}/kitti15_results.zip")
+
+
+### auxiliary functions
+
+
+def _read_img(filename):
+    """Load an image file and return it as an RGB numpy array."""
+    # the explicit RGB conversion is needed for scene flow finalpass data
+    rgb_image = Image.open(filename).convert("RGB")
+    return np.asarray(rgb_image)
+
+
+def _read_booster_disp(filename):
+    """Load a `.npy` disparity map and replace zero entries with +inf."""
+    disparity = np.load(filename)
+    is_zero = disparity == 0.0
+    disparity[is_zero] = np.inf
+    return disparity
+
+
+def _read_png_disp(filename, coef=1.0):
+    """Read an image-encoded disparity map, divide it by `coef`, and map zeros to +inf."""
+    encoded = np.asarray(Image.open(filename))
+    disparity = encoded.astype(np.float32) / coef
+    is_zero = disparity == 0.0
+    disparity[is_zero] = np.inf
+    return disparity
+
+
+def _read_pfm_disp(filename):
+    """Read a PFM disparity map and replace non-positive values with +inf."""
+    values, _ = _read_pfm(filename)
+    disparity = np.ascontiguousarray(values)
+    # non-positive values do occur in practice,
+    # eg /nfs/data/ffs-3d/datasets/middlebury/2014/Shopvac-imperfect/disp0.pfm
+    disparity[disparity <= 0] = np.inf
+    return disparity
+
+
+def _read_npy_disp(filename):
+    """Load a disparity map stored as a `.npy` array, without any post-processing."""
+    disparity = np.load(filename)
+    return disparity
+
+
+def _read_crestereo_disp(filename):
+    """Read a disparity PNG whose stored values are the disparity multiplied by 32."""
+    crestereo_coef = 32.0
+    return _read_png_disp(filename, coef=crestereo_coef)
+
+
+def _read_middlebury20052006_disp(filename):
+    """Read a disparity PNG whose stored values are used as-is (divisor of 1)."""
+    middlebury_coef = 1.0
+    return _read_png_disp(filename, coef=middlebury_coef)
+
+
+def _read_kitti_disp(filename):
+    """Read a disparity PNG whose stored values are the disparity multiplied by 256."""
+    kitti_coef = 256.0
+    return _read_png_disp(filename, coef=kitti_coef)
+
+
+# Dataset-specific reader names bound to the generic readers defined above:
+# the first four share the PFM reader, the last one the plain `.npy` reader.
+_read_sceneflow_disp = _read_pfm_disp
+_read_eth3d_disp = _read_pfm_disp
+_read_middlebury_disp = _read_pfm_disp
+_read_carla_disp = _read_pfm_disp
+_read_tartanair_disp = _read_npy_disp
+
+
+def _read_hdf5_disp(filename):
+    """Read the "disparity" dataset of an HDF5 file as a float32 array.
+
+    NaN entries are replaced with +inf (the marker used for invalid values).
+    Unlike the PNG readers of this file, zero values are not remapped here.
+    """
+    # open read-only and close the handle as soon as the data is in memory
+    with h5py.File(filename, "r") as h5file:
+        disp = np.asarray(h5file["disparity"])
+    disp[np.isnan(disp)] = np.inf  # make invalid values as +inf
+    return disp.astype(np.float32)
+
+
+import re
+
+
+def _read_pfm(file):
+    """Read a PFM image file.
+
+    Args:
+        file: path of the PFM file.
+
+    Returns:
+        A `(data, scale)` tuple. `data` has shape (height, width, 3) when the
+        header is "PF" and (height, width) when it is "Pf", with the row order
+        flipped with respect to the order stored in the file. `scale` is the
+        absolute value of the scale read from the header.
+
+    Raises:
+        Exception: if the first header line is neither "PF" nor "Pf", or if the
+            dimension line is malformed.
+    """
+    # the context manager closes the handle even when a header check raises
+    with open(file, "rb") as fid:
+        header = fid.readline().rstrip().decode("ascii")
+        if header == "PF":
+            color = True
+        elif header == "Pf":
+            color = False
+        else:
+            raise Exception("Not a PFM file.")
+
+        dim_match = re.match(r"^(\d+)\s(\d+)\s$", fid.readline().decode("ascii"))
+        if not dim_match:
+            raise Exception("Malformed PFM header.")
+        width, height = map(int, dim_match.groups())
+
+        scale = float(fid.readline().decode("ascii").rstrip())
+        if scale < 0:  # a negative scale marks little-endian data
+            endian = "<"
+            scale = -scale
+        else:
+            endian = ">"  # big-endian
+
+        # read the remaining bytes as 32-bit floats with the detected byte order
+        data = np.fromfile(fid, endian + "f")
+
+    shape = (height, width, 3) if color else (height, width)
+    data = np.flipud(np.reshape(data, shape))
+    return data, scale
+
+
+def writePFM(file, image, scale=1):
+    """Write a float32 image to a PFM file.
+
+    Args:
+        file: path of the output file.
+        image: float32 array of shape H x W x 3 (written with a "PF" header),
+            or H x W x 1 / H x W (written with a "Pf" header).
+        scale: scale written in the header; its sign is flipped to negative
+            when the data is little-endian.
+
+    Raises:
+        Exception: if `image` is not float32 or has an unsupported shape.
+    """
+    import sys  # local import: only `sys.byteorder` is needed here
+
+    if image.dtype.name != "float32":
+        raise Exception("Image dtype must be float32.")
+
+    image = np.flipud(image)
+
+    if len(image.shape) == 3 and image.shape[2] == 3:  # color image
+        color = True
+    elif (
+        len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1
+    ):  # greyscale
+        color = False
+    else:
+        raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.")
+
+    endian = image.dtype.byteorder
+    if endian == "<" or endian == "=" and sys.byteorder == "little":
+        scale = -scale
+
+    # the inputs are validated before the file is created; the context manager
+    # then guarantees the handle is flushed and closed
+    with open(file, "wb") as fid:
+        # both headers must be bytes since the file is opened in binary mode
+        fid.write(b"PF\n" if color else b"Pf\n")
+        fid.write(b"%d %d\n" % (image.shape[1], image.shape[0]))
+        fid.write(b"%f\n" % scale)
+        image.tofile(fid)
+
+
+def writeDsp5File(disp, filename):
+    """Store `disp` as the gzip-compressed (level 5) "disparity" dataset of an HDF5 file."""
+    storage_options = {"compression": "gzip", "compression_opts": 5}
+    with h5py.File(filename, "w") as h5file:
+        h5file.create_dataset("disparity", data=disp, **storage_options)
+
+
+# disp visualization
+
+
+def vis_disparity(disp, m=None, M=None):
+    """Render a disparity map with OpenCV's INFERNO colormap.
+
+    Args:
+        disp: numpy disparity map.
+        m: value mapped to the low end of the colormap (default: `disp.min()`).
+        M: value mapped to the high end of the colormap (default: `disp.max()`).
+
+    Returns:
+        The uint8 color image produced by `cv2.applyColorMap`.
+    """
+    if m is None:
+        m = disp.min()
+    if M is None:
+        M = disp.max()
+    span = M - m
+    if span == 0:
+        # constant map (or m == M): avoid a division by zero
+        span = 1.0
+    # clip before the cast so that values outside [m, M] saturate instead of
+    # wrapping around in uint8
+    disp_vis = np.clip((disp - m) / span * 255.0, 0.0, 255.0).astype("uint8")
+    return cv2.applyColorMap(disp_vis, cv2.COLORMAP_INFERNO)
+
+
+# dataset getter
+
+
+def get_train_dataset_stereo(dataset_str, augmentor=True, crop_size=None):
+    """Build the training dataset described by `dataset_str`.
+
+    Every "(" in `dataset_str` is rewritten to "Dataset(", `augmentor=True`
+    and/or `crop_size=...` are appended before every ")", and the resulting
+    expression is evaluated in this module.
+    """
+    expression = dataset_str.replace("(", "Dataset(")
+    extra_kwargs = []
+    if augmentor:
+        extra_kwargs.append(", augmentor=True)")
+    if crop_size is not None:
+        extra_kwargs.append(", crop_size={:s})".format(str(crop_size)))
+    for closing in extra_kwargs:
+        expression = expression.replace(")", closing)
+    # NOTE(security): eval() executes arbitrary code; `dataset_str` must come
+    # from a trusted source (e.g. the training command line).
+    return eval(expression)
+
+
+def get_test_datasets_stereo(dataset_str):
+    """Build one test dataset per "+"-separated entry of `dataset_str`.
+
+    Every "(" is rewritten to "Dataset(" before each entry is evaluated in
+    this module.
+    """
+    expressions = dataset_str.replace("(", "Dataset(").split("+")
+    # NOTE(security): eval() executes arbitrary code; `dataset_str` must come
+    # from a trusted source (e.g. the command line).
+    return [eval(expression) for expression in expressions]
diff --git a/extern/CUT3R/src/croco/stereoflow/download_model.sh b/extern/CUT3R/src/croco/stereoflow/download_model.sh
new file mode 100644
index 0000000000000000000000000000000000000000..533119609108c5ec3c22ff79b10e9215c1ac5098
--- /dev/null
+++ b/extern/CUT3R/src/croco/stereoflow/download_model.sh
@@ -0,0 +1,12 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+# Download the model file given as first argument into stereoflow_models/,
+# unless it is already there.
+model=$1
+if [[ -z "${model}" ]]
+then
+	# without a model name the target would be the directory itself
+	echo "Usage: $0 <model_filename>" >&2
+	exit 1
+fi
+outfile="stereoflow_models/${model}"
+if [[ ! -f "${outfile}" ]]
+then
+	mkdir -p stereoflow_models/
+	wget "https://download.europe.naverlabs.com/ComputerVision/CroCo/StereoFlow_models/${model}" -P stereoflow_models/
+else
+	echo "Model ${model} already downloaded in ${outfile}."
+fi
\ No newline at end of file
diff --git a/extern/CUT3R/src/croco/stereoflow/engine.py b/extern/CUT3R/src/croco/stereoflow/engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..9736f2ab7c895e032893f60949baf87131a49b6e
--- /dev/null
+++ b/extern/CUT3R/src/croco/stereoflow/engine.py
@@ -0,0 +1,367 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+# --------------------------------------------------------
+# Main function for training one epoch or testing
+# --------------------------------------------------------
+
+import math
+import sys
+from typing import Iterable
+import numpy as np
+import torch
+import torchvision
+
+from utils import misc as misc
+
+
+def split_prediction_conf(predictions, with_conf=False):
+    """Split a 4-D network output into `(prediction, confidence)`.
+
+    With `with_conf`, the last channel (axis 1) is returned separately as the
+    confidence and the remaining channels as the prediction; otherwise the
+    input is returned untouched together with `None`.
+    """
+    if with_conf:
+        return predictions[:, :-1, :, :], predictions[:, -1:, :, :]
+    return predictions, None
+
+
+def train_one_epoch(
+    model: torch.nn.Module,
+    criterion: torch.nn.Module,
+    metrics: torch.nn.Module,
+    data_loader: Iterable,
+    optimizer: torch.optim.Optimizer,
+    device: torch.device,
+    epoch: int,
+    loss_scaler,
+    log_writer=None,
+    print_freq=20,
+    args=None,
+):
+    """Train `model` for one epoch and return the averaged logged values.
+
+    `data_loader` must yield `(image1, image2, gt, pairname)` batches. `args`
+    is read for `accum_iter`, `img_per_epoch`, `batch_size`, `amp` and
+    `tboard_log_step` (and is forwarded to `misc.adjust_learning_rate`), so it
+    must be provided despite its `None` default.
+
+    The process exits with status 1 as soon as the loss is not finite.
+
+    Returns:
+        dict mapping each meter name of the metric logger to its `global_avg`.
+    """
+    model.train(True)
+    metric_logger = misc.MetricLogger(delimiter="  ")
+    metric_logger.add_meter("lr", misc.SmoothedValue(window_size=1, fmt="{value:.6f}"))
+    header = "Epoch: [{}]".format(epoch)
+
+    accum_iter = args.accum_iter
+
+    optimizer.zero_grad()
+
+    details = {}
+
+    if log_writer is not None:
+        print("log_dir: {}".format(log_writer.log_dir))
+
+    # optionally cap the epoch to ceil(img_per_epoch / batch_size) iterations
+    if args.img_per_epoch:
+        iter_per_epoch = args.img_per_epoch // args.batch_size + int(
+            args.img_per_epoch % args.batch_size > 0
+        )
+        assert (
+            len(data_loader) >= iter_per_epoch
+        ), "Dataset is too small for so many iterations"
+        len_data_loader = iter_per_epoch
+    else:
+        len_data_loader, iter_per_epoch = len(data_loader), None
+
+    for data_iter_step, (image1, image2, gt, pairname) in enumerate(
+        metric_logger.log_every(
+            data_loader, print_freq, header, max_iter=iter_per_epoch
+        )
+    ):
+
+        image1 = image1.to(device, non_blocking=True)
+        image2 = image2.to(device, non_blocking=True)
+        gt = gt.to(device, non_blocking=True)
+
+        # we use a per iteration (instead of per epoch) lr scheduler
+        if data_iter_step % accum_iter == 0:
+            misc.adjust_learning_rate(
+                optimizer, data_iter_step / len_data_loader + epoch, args
+            )
+
+        with torch.cuda.amp.autocast(enabled=bool(args.amp)):
+            prediction = model(image1, image2)
+            prediction, conf = split_prediction_conf(prediction, criterion.with_conf)
+            batch_metrics = metrics(prediction.detach(), gt)
+            loss = (
+                criterion(prediction, gt)
+                if conf is None
+                else criterion(prediction, gt, conf)
+            )
+
+        loss_value = loss.item()
+        if not math.isfinite(loss_value):
+            print("Loss is {}, stopping training".format(loss_value))
+            sys.exit(1)
+
+        # gradient accumulation: the loss is scaled down and the update is only
+        # requested (update_grad) every `accum_iter` iterations
+        loss /= accum_iter
+        loss_scaler(
+            loss,
+            optimizer,
+            parameters=model.parameters(),
+            update_grad=(data_iter_step + 1) % accum_iter == 0,
+        )
+        if (data_iter_step + 1) % accum_iter == 0:
+            optimizer.zero_grad()
+
+        torch.cuda.synchronize()
+
+        metric_logger.update(loss=loss_value)
+        for k, v in batch_metrics.items():
+            metric_logger.update(**{k: v.item()})
+        lr = optimizer.param_groups[0]["lr"]
+        metric_logger.update(lr=lr)
+
+        # if args.distributed: loss_value_reduce = misc.all_reduce_mean(loss_value)
+        # log every `tboard_log_step` accumulated updates and at the last iteration
+        time_to_log = (data_iter_step + 1) % (
+            args.tboard_log_step * accum_iter
+        ) == 0 or data_iter_step == len_data_loader - 1
+        loss_value_reduce = misc.all_reduce_mean(loss_value)
+        if log_writer is not None and time_to_log:
+            epoch_1000x = int((data_iter_step / len_data_loader + epoch) * 1000)
+            # We use epoch_1000x as the x-axis in tensorboard. This calibrates different curves when batch size changes.
+            log_writer.add_scalar("train/loss", loss_value_reduce, epoch_1000x)
+            log_writer.add_scalar("lr", lr, epoch_1000x)
+            for k, v in batch_metrics.items():
+                log_writer.add_scalar("train/" + k, v.item(), epoch_1000x)
+
+    # gather the stats from all processes
+    # if args.distributed: metric_logger.synchronize_between_processes()
+    print("Averaged stats:", metric_logger)
+    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
+
+
+@torch.no_grad()
+def validate_one_epoch(
+    model: torch.nn.Module,
+    criterion: torch.nn.Module,
+    metrics: torch.nn.Module,
+    data_loaders: list[Iterable],
+    device: torch.device,
+    epoch: int,
+    log_writer=None,
+    args=None,
+):
+    """Evaluate `model` with tiled inference on every loader of `data_loaders`.
+
+    Each loader must yield `(image1, image2, gt, pairname)` batches. `args` is
+    read for `tile_conf_mode`, `crop` and `val_overlap`, so it must be provided
+    despite its `None` default.
+
+    Returns:
+        dict of the `global_avg` of every meter of every loader, plus (when
+        there are several loaders) an "AVG_<metric>" entry per metric.
+
+    NOTE(review): the "loss" and "loss_tiled" meters are not prefixed with the
+    dataset name, so with several loaders the entries of later loaders replace
+    those of earlier ones in the returned dict.
+    NOTE(review): `batch_metrics` is only bound inside the batch loop; the
+    averaging below fails if several loaders are given and none yields a batch.
+    """
+
+    model.eval()
+    metric_loggers = []
+    header = "Epoch: [{}]".format(epoch)
+    print_freq = 20
+
+    conf_mode = args.tile_conf_mode
+    crop = args.crop
+
+    if log_writer is not None:
+        print("log_dir: {}".format(log_writer.log_dir))
+
+    results = {}
+    dnames = []
+    image1, image2, gt, prediction = None, None, None, None
+    for didx, data_loader in enumerate(data_loaders):
+        # the string form of the dataset is used as prefix of its metric names
+        dname = str(data_loader.dataset)
+        dnames.append(dname)
+        metric_loggers.append(misc.MetricLogger(delimiter="  "))
+        for data_iter_step, (image1, image2, gt, pairname) in enumerate(
+            metric_loggers[didx].log_every(data_loader, print_freq, header)
+        ):
+            image1 = image1.to(device, non_blocking=True)
+            image2 = image2.to(device, non_blocking=True)
+            gt = gt.to(device, non_blocking=True)
+            if dname.startswith("Spring"):
+                # here the ground truth has twice the image resolution:
+                # average each 2x2 block to bring it back to the image size
+                assert (
+                    gt.size(2) == image1.size(2) * 2
+                    and gt.size(3) == image1.size(3) * 2
+                )
+                gt = (
+                    gt[:, :, 0::2, 0::2]
+                    + gt[:, :, 0::2, 1::2]
+                    + gt[:, :, 1::2, 0::2]
+                    + gt[:, :, 1::2, 1::2]
+                ) / 4.0  # we approximate the gt based on the 2x upsampled ones
+
+            with torch.inference_mode():
+                prediction, tiled_loss, c = tiled_pred(
+                    model,
+                    criterion,
+                    image1,
+                    image2,
+                    gt,
+                    conf_mode=conf_mode,
+                    overlap=args.val_overlap,
+                    crop=crop,
+                    with_conf=criterion.with_conf,
+                )
+                batch_metrics = metrics(prediction.detach(), gt)
+                # loss of the blended full-size prediction (as opposed to
+                # `tiled_loss`, the mean of the per-tile losses)
+                loss = (
+                    criterion(prediction.detach(), gt)
+                    if not criterion.with_conf
+                    else criterion(prediction.detach(), gt, c)
+                )
+                loss_value = loss.item()
+                metric_loggers[didx].update(loss_tiled=tiled_loss.item())
+                metric_loggers[didx].update(**{f"loss": loss_value})
+                for k, v in batch_metrics.items():
+                    metric_loggers[didx].update(**{dname + "_" + k: v.item()})
+
+    results = {
+        k: meter.global_avg for ml in metric_loggers for k, meter in ml.meters.items()
+    }
+    # unweighted mean of each metric over the datasets
+    if len(dnames) > 1:
+        for k in batch_metrics.keys():
+            results["AVG_" + k] = sum(
+                results[dname + "_" + k] for dname in dnames
+            ) / len(dnames)
+
+    if log_writer is not None:
+        epoch_1000x = int((1 + epoch) * 1000)
+        for k, v in results.items():
+            log_writer.add_scalar("val/" + k, v, epoch_1000x)
+
+    print("Averaged stats:", results)
+    return results
+
+
+import torch.nn.functional as F
+
+
+def _resize_img(img, new_size):
+    """Resample `img` to the spatial size `new_size` with bicubic interpolation."""
+    interpolation = {"mode": "bicubic", "align_corners": False}
+    return F.interpolate(img, size=new_size, **interpolation)
+
+
+def _resize_stereo_or_flow(data, new_size):
+    """Resize a disparity (1 channel) or flow (2 channels) map and rescale its values.
+
+    Args:
+        data: 4-D tensor with 1 or 2 channels on axis 1.
+        new_size: target `(height, width)`.
+
+    Returns:
+        The bicubically resized tensor; channel 0 is multiplied by the width
+        ratio and, for 2-channel inputs, channel 1 by the height ratio, so
+        that displacements stay expressed in pixels of the new resolution.
+    """
+    assert data.ndim == 4
+    assert data.size(1) in [1, 2]
+    scale_x = new_size[1] / float(data.size(3))
+    out = F.interpolate(data, size=new_size, mode="bicubic", align_corners=False)
+    out[:, 0, :, :] *= scale_x
+    if out.size(1) == 2:
+        scale_y = new_size[0] / float(data.size(2))
+        out[:, 1, :, :] *= scale_y
+    return out
+
+
+@torch.no_grad()
+def tiled_pred(
+    model,
+    criterion,
+    img1,
+    img2,
+    gt,
+    overlap=0.5,
+    bad_crop_thr=0.05,
+    downscale=False,
+    crop=512,
+    ret="loss",
+    conf_mode="conf_expsigmoid_10_5",
+    with_conf=False,
+    return_time=False,
+):
+    """Run `model` on overlapping tiles and blend the tiles with confidence weights.
+
+    Args:
+        model: callable as `model(img1_tile, img2_tile)`; `model.head.num_channels`
+            is read when `gt` is None.
+        criterion: optional loss evaluated on every tile (only when `gt` is given).
+        img1, img2: 4-D image batches of identical size.
+        gt: optional 4-D ground truth, only used for the shape and the tile losses.
+        overlap: fraction of overlap between neighbouring tiles, in [0, 1).
+        bad_crop_thr, downscale, ret: accepted for compatibility, not used.
+        crop: tile size, either `(height, width)` or a single int for a square tile.
+        conf_mode: "conf_expsigmoid_<beta>_<betasigmoid>" or "conf_expbeta<beta>",
+            selecting how the predicted confidence is turned into a blending weight.
+        with_conf: whether the last output channel of `model` is a confidence.
+        return_time: also return the elapsed time in seconds, measured with CUDA events.
+
+    Returns:
+        `(pred, tiled_loss, c)` or `(pred, tiled_loss, c, time)`: the blended
+        prediction at the input resolution, the mean of the tile losses (NaN
+        when none was computed) and the blended confidence. `c` stays at the
+        upscaled resolution when the inputs had to be upscaled.
+
+    Raises:
+        NotImplementedError: if `conf_mode` matches none of the supported prefixes.
+
+    NOTE(review): the blending weights are derived from the predicted confidence,
+    which is None when `with_conf` is False; the weighting then fails.
+    """
+    # the int default (and any int) denotes a square tile
+    if isinstance(crop, int):
+        crop = (crop, crop)
+
+    # for each image, we are going to run inference on many overlapping patches
+    # then, all predictions will be weighted-averaged
+    if gt is not None:
+        B, C, H, W = gt.shape
+    else:
+        B, _, H, W = img1.shape
+        C = model.head.num_channels - int(with_conf)
+    win_height, win_width = crop[0], crop[1]
+
+    # upscale to be larger than the crop
+    do_change_scale = H < win_height or W < win_width
+    if do_change_scale:
+        # each ratio uses its own dimension so that both sides reach the window size
+        upscale_factor = max(win_width / W, win_height / H)
+        original_size = (H, W)
+        new_size = (
+            max(win_height, round(H * upscale_factor)),
+            max(win_width, round(W * upscale_factor)),
+        )
+        img1 = _resize_img(img1, new_size)
+        img2 = _resize_img(img2, new_size)
+        # resize gt just for the computation of tiled losses
+        if gt is not None:
+            gt = _resize_stereo_or_flow(gt, new_size)
+        H, W = img1.shape[2:4]
+
+    if conf_mode.startswith("conf_expsigmoid_"):  # conf_expsigmoid_30_10
+        beta, betasigmoid = map(float, conf_mode[len("conf_expsigmoid_") :].split("_"))
+        use_sigmoid = True
+    elif conf_mode.startswith("conf_expbeta"):  # conf_expbeta3
+        beta = float(conf_mode[len("conf_expbeta") :])
+        use_sigmoid = False
+    else:
+        raise NotImplementedError(f"conf_mode {conf_mode} is not implemented")
+
+    # keep track of weighted sum of prediction*weights and weights
+    accu_pred = img1.new_zeros((B, C, H, W))  # weighted sum of predictions
+    accu_conf = img1.new_zeros((B, H, W)) + 1e-16  # sum of weights (never exactly 0)
+    # weighted sum of confidences ; not so useful except for computing some losses
+    accu_c = img1.new_zeros((B, H, W))
+
+    tiled_losses = []
+
+    if return_time:
+        start = torch.cuda.Event(enable_timing=True)
+        end = torch.cuda.Event(enable_timing=True)
+        start.record()
+
+    for sy in _overlapping(H, win_height, overlap):
+        for sx in _overlapping(W, win_width, overlap):
+            # both images are cropped at the same location
+            pred = model(_crop(img1, sy, sx), _crop(img2, sy, sx))
+            pred, predconf = split_prediction_conf(pred, with_conf=with_conf)
+
+            if criterion is not None and gt is not None:
+                gtcrop = _crop(gt, sy, sx)
+                tiled_losses.append(
+                    criterion(pred, gtcrop).item()
+                    if predconf is None
+                    else criterion(pred, gtcrop, predconf).item()
+                )
+
+            if use_sigmoid:
+                conf = torch.exp(
+                    -beta * 2 * (torch.sigmoid(predconf / betasigmoid) - 0.5)
+                ).view(B, win_height, win_width)
+            else:
+                conf = torch.exp(-beta * predconf).view(B, win_height, win_width)
+
+            accu_pred[..., sy, sx] += pred * conf[:, None, :, :]
+            accu_conf[..., sy, sx] += conf
+            accu_c[..., sy, sx] += predconf.view(B, win_height, win_width) * conf
+
+    pred = accu_pred / accu_conf[:, None, :, :]
+    c = accu_c / accu_conf
+    assert not torch.any(torch.isnan(pred))
+
+    if return_time:
+        end.record()
+        torch.cuda.synchronize()
+        time = start.elapsed_time(end) / 1000.0  # this was in milliseconds
+
+    if do_change_scale:
+        pred = _resize_stereo_or_flow(pred, original_size)
+
+    tiled_loss = torch.mean(torch.tensor(tiled_losses))
+    if return_time:
+        return pred, tiled_loss, c, time
+    return pred, tiled_loss, c
+
+
+def _overlapping(total, window, overlap=0.5):
+    """Yield evenly spaced slices of length `window` spanning `[0, total)`.
+
+    The number of slices is chosen so that consecutive slices overlap by at
+    least the fraction `overlap` of the window.
+    """
+    assert total >= window and 0 <= overlap < 1, (total, window, overlap)
+    stride = (1 - overlap) * window
+    count = 1 + int(np.ceil((total - window) / stride))
+    starts = np.linspace(0, total - window, count).round().astype(int)
+    for start in starts:
+        yield slice(start, start + window)
+
+
+def _crop(img, sy, sx):
+    """Extract the window `(sy, sx)` (two slices) from a 4-D tensor.
+
+    Parts of the window that fall outside the image are filled by constant
+    padding.
+    """
+    _, _, height, width = img.shape
+    pad_left, pad_right = max(0, -sx.start), max(0, sx.stop - width)
+    pad_top, pad_bottom = max(0, -sy.start), max(0, sy.stop - height)
+    if not (pad_left or pad_right or pad_top or pad_bottom):
+        # the window lies entirely inside the image
+        return img[:, :, sy, sx]
+    padded = torch.nn.functional.pad(
+        img, (pad_left, pad_right, pad_top, pad_bottom), mode="constant"
+    )
+    # shift the window by the amount of padding added before the image
+    rows = slice(sy.start + pad_top, sy.stop + pad_top)
+    cols = slice(sx.start + pad_left, sx.stop + pad_left)
+    return padded[:, :, rows, cols]
diff --git a/extern/CUT3R/src/croco/stereoflow/test.py b/extern/CUT3R/src/croco/stereoflow/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..15dcf769169d460b716b05acb290340b6a197a6d
--- /dev/null
+++ b/extern/CUT3R/src/croco/stereoflow/test.py
@@ -0,0 +1,303 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+# --------------------------------------------------------
+# Main test function
+# --------------------------------------------------------
+
+import os
+import argparse
+import pickle
+from PIL import Image
+import numpy as np
+from tqdm import tqdm
+
+import torch
+from torch.utils.data import DataLoader
+
+import utils.misc as misc
+from models.croco_downstream import CroCoDownstreamBinocular
+from models.head_downstream import PixelwiseTaskWithDPT
+
+from stereoflow.criterion import *
+from stereoflow.datasets_stereo import get_test_datasets_stereo
+from stereoflow.datasets_flow import get_test_datasets_flow
+from stereoflow.engine import tiled_pred
+
+from stereoflow.datasets_stereo import vis_disparity
+from stereoflow.datasets_flow import flowToColor
+
+
+def get_args_parser():
+    """Build the command-line parser of the stereo/flow test script."""
+    cli = argparse.ArgumentParser("Test CroCo models on stereo/flow", add_help=False)
+
+    # important argument
+    cli.add_argument(
+        "--model", type=str, required=True, help="Path to the model to evaluate"
+    )
+    cli.add_argument(
+        "--dataset",
+        type=str,
+        required=True,
+        help="test dataset (there can be multiple dataset separated by a +)",
+    )
+
+    # tiling
+    cli.add_argument(
+        "--tile_conf_mode",
+        default="",
+        type=str,
+        help="Weights for the tiling aggregation based on confidence (empty means use the formula from the loaded checkpoint",
+    )
+    cli.add_argument(
+        "--tile_overlap", default=0.7, type=float, help="overlap between tiles"
+    )
+
+    # save (it will automatically go to <model_path>_<dataset_str>/<tile_str>_<save>)
+    cli.add_argument(
+        "--save",
+        nargs="+",
+        default=[],
+        type=str,
+        help="what to save: \
+                              metrics (pickle file), \
+                              pred (raw prediction save as torch tensor), \
+                              visu (visualization in png of each prediction), \
+                              err10 (visualization in png of the error clamp at 10 for each prediction), \
+                              submission (submission file)",
+    )
+
+    # other (no impact)
+    cli.add_argument("--num_workers", type=int, default=4)
+    return cli
+
+
+def _load_model_and_criterion(model_path, do_load_metrics, device):
+    """Rebuild the model stored in the checkpoint `model_path` and move it to `device`.
+
+    Returns:
+        `(model, metrics, crop, with_conf, task, tile_conf_mode)`, where
+        `metrics` is None unless `do_load_metrics` is set and the last four
+        values come from the arguments saved in the checkpoint.
+    """
+    print("loading model from", model_path)
+    assert os.path.isfile(model_path)
+    # NOTE(security): torch.load unpickles the file; only load trusted checkpoints.
+    checkpoint = torch.load(model_path, "cpu")
+
+    saved_args = checkpoint["args"]
+    task = saved_args.task
+    tile_conf_mode = saved_args.tile_conf_mode
+    num_channels = {"stereo": 1, "flow": 2}[task]
+    # NOTE(security): eval() runs the criterion expression stored in the checkpoint.
+    with_conf = eval(saved_args.criterion).with_conf
+    if with_conf:
+        # one extra output channel holds the confidence
+        num_channels += 1
+
+    print("head: PixelwiseTaskWithDPT()")
+    head = PixelwiseTaskWithDPT()
+    head.num_channels = num_channels
+    print("croco_args:", saved_args.croco_args)
+    model = CroCoDownstreamBinocular(head, **saved_args.croco_args)
+    model.load_state_dict(checkpoint["model"], strict=True)
+    model.eval()
+    model = model.to(device)
+
+    metrics = None
+    if do_load_metrics:
+        metrics_class = StereoDatasetMetrics if task == "stereo" else FlowDatasetMetrics
+        metrics = metrics_class().to(device)
+
+    return model, metrics, saved_args.crop, with_conf, task, tile_conf_mode
+
+
+def _save_batch(
+    pred, gt, pairnames, dataset, task, save, outdir, time, submission_dir=None
+):
+    """Save the requested outputs for every sample of a batch.
+
+    Args:
+        pred: batched predictions (first axis indexes the samples).
+        gt: batched ground truth, or None when unavailable.
+        pairnames: one (possibly stringified) pair name per sample.
+        dataset: provides `pairname_to_str` and `submission_save_pairname`.
+        task: "stereo" selects the disparity visualization, anything else the
+            flow visualization.
+        save: collection of outputs to write among "pred", "visu", "err10"
+            and "submission".
+        outdir: root directory of the "pred"/"visu"/"err10" files.
+        time: forwarded to `dataset.submission_save_pairname`.
+        submission_dir: directory of the submission files (required for
+            "submission").
+    """
+    for i, raw_pairname in enumerate(pairnames):
+        # unbatch pairname
+        # NOTE(security): eval() is applied to the stringified pairname; the
+        # names must come from a trusted dataset.
+        pairname = eval(raw_pairname) if raw_pairname.startswith("(") else raw_pairname
+        fname = os.path.join(outdir, dataset.pairname_to_str(pairname))
+        os.makedirs(os.path.dirname(fname), exist_ok=True)
+
+        predi = pred[i, ...]
+        gti = gt[i, ...] if gt is not None else None
+
+        if "pred" in save:
+            torch.save(predi.squeeze(0).cpu(), fname + "_pred.pth")
+
+        if "visu" in save:
+            if task == "stereo":
+                disparity = predi.permute((1, 2, 0)).squeeze(2).cpu().numpy()
+                # color range: finite ground-truth range of THIS sample when
+                # available, else the range of the prediction itself
+                m, M = None, None
+                if gti is not None:
+                    finite_gt = gti[torch.isfinite(gti)]
+                    if finite_gt.numel() > 0:
+                        # plain floats, since the visualization works on numpy
+                        m = finite_gt.min().item()
+                        M = finite_gt.max().item()
+                img_disparity = vis_disparity(disparity, m=m, M=M)
+                Image.fromarray(img_disparity).save(fname + "_pred.png")
+            else:
+                # normalize flowToColor according to the maxnorm of gt (or prediction if not available)
+                reference = gti if gti is not None else predi
+                flowNorm = torch.sqrt(torch.sum(reference**2, dim=0)).max().item()
+                imgflow = flowToColor(
+                    predi.permute((1, 2, 0)).cpu().numpy(), maxflow=flowNorm
+                )
+                Image.fromarray(imgflow).save(fname + "_pred.png")
+
+        if "err10" in save:
+            assert gt is not None
+            L2err = torch.sqrt(torch.sum((gti - predi) ** 2, dim=0))
+            valid = torch.isfinite(gti[0, :, :])
+            L2err[~valid] = 0.0
+            L2err = torch.clamp(L2err, max=10.0)
+            # error in [0, 10] encoded in the red channel
+            red = (L2err * 255.0 / 10.0).to(dtype=torch.uint8)[:, :, None]
+            zer = torch.zeros_like(red)
+            imgerr = torch.cat((red, zer, zer), dim=2).cpu().numpy()
+            Image.fromarray(imgerr).save(fname + "_err10.png")
+
+        if "submission" in save:
+            assert submission_dir is not None
+            predi_np = (
+                predi.permute(1, 2, 0).squeeze(2).cpu().numpy()
+            )  # transform into HxWx2 for flow or HxW for stereo
+            dataset.submission_save_pairname(pairname, predi_np, submission_dir, time)
+
+
+def main(args):
+    """Evaluate a checkpoint on every requested dataset and save the requested outputs.
+
+    `args` is the namespace produced by `get_args_parser()`. For each
+    "+"-separated dataset, outputs go to `<model>_<dataset>/`; metrics are
+    pickled to `conf_<mode>_overlap_<overlap>.pkl` in that directory. When only
+    metrics are requested and that file already exists, it is printed and the
+    evaluation is skipped.
+    """
+
+    # load the pretrained model and metrics
+    device = (
+        torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
+    )
+    save_metrics = "metrics" in args.save
+    model, metrics, cropsize, with_conf, task, tile_conf_mode = (
+        _load_model_and_criterion(args.model, save_metrics, device)
+    )
+    if args.tile_conf_mode == "":
+        args.tile_conf_mode = tile_conf_mode
+
+    # load the datasets
+    datasets = (
+        get_test_datasets_stereo if task == "stereo" else get_test_datasets_flow
+    )(args.dataset)
+    dataloaders = [
+        DataLoader(
+            dataset,
+            batch_size=1,
+            shuffle=False,
+            num_workers=args.num_workers,
+            pin_memory=True,
+            drop_last=False,
+        )
+        for dataset in datasets
+    ]
+
+    save_outputs = any(k in args.save for k in ["pred", "visu", "err10", "submission"])
+
+    # run
+    for i, dataloader in enumerate(dataloaders):
+        dataset = datasets[i]
+        dstr = args.dataset.split("+")[i]
+
+        outdir = args.model + "_" + misc.filename(dstr)
+        if save_metrics:
+            # defined whenever metrics are saved, since it is needed at the end
+            # of the loop regardless of what else is saved
+            fname = os.path.join(
+                outdir, f"conf_{args.tile_conf_mode}_overlap_{args.tile_overlap}.pkl"
+            )
+            # reuse cached metrics only when nothing else has to be produced
+            if len(args.save) == 1 and os.path.isfile(fname):
+                print("  metrics already compute in " + fname)
+                with open(fname, "rb") as fid:
+                    results = pickle.load(fid)
+                for k, v in results.items():
+                    print("{:s}: {:.3f}".format(k, v))
+                continue
+
+        if "submission" in args.save:
+            dirname = (
+                f"submission_conf_{args.tile_conf_mode}_overlap_{args.tile_overlap}"
+            )
+            submission_dir = os.path.join(outdir, dirname)
+        else:
+            submission_dir = None
+
+        print("")
+        print("saving {:s} in {:s}".format("+".join(args.save), outdir))
+        print(repr(dataset))
+
+        if metrics is not None:
+            metrics.reset()
+
+        for data_iter_step, (image1, image2, gt, pairnames) in enumerate(
+            tqdm(dataloader)
+        ):
+
+            do_flip = (
+                task == "stereo"
+                and dstr.startswith("Spring")
+                and any("right" in p for p in pairnames)
+            )  # we flip the images and will flip the prediction after as we assume img1 is on the left
+
+            image1 = image1.to(device, non_blocking=True)
+            image2 = image2.to(device, non_blocking=True)
+            gt = (
+                gt.to(device, non_blocking=True) if gt.numel() > 0 else None
+            )  # special case for test time
+            if do_flip:
+                assert all("right" in p for p in pairnames)
+                # this is already the right frame, let's flip it
+                # (gt is left as is: the prediction is flipped back below)
+                image1 = image1.flip(dims=[3])
+                image2 = image2.flip(dims=[3])
+
+            with torch.inference_mode():
+                pred, _, _, time = tiled_pred(
+                    model,
+                    None,
+                    image1,
+                    image2,
+                    None if dataset.name == "Spring" else gt,
+                    conf_mode=args.tile_conf_mode,
+                    overlap=args.tile_overlap,
+                    crop=cropsize,
+                    with_conf=with_conf,
+                    return_time=True,
+                )
+
+                if do_flip:
+                    pred = pred.flip(dims=[3])
+
+                if metrics is not None:
+                    metrics.add_batch(pred, gt)
+
+                if save_outputs:
+                    _save_batch(
+                        pred,
+                        gt,
+                        pairnames,
+                        dataset,
+                        task,
+                        args.save,
+                        outdir,
+                        time,
+                        submission_dir=submission_dir,
+                    )
+
+        # print
+        if metrics is not None:
+            results = metrics.get_results()
+            for k, v in results.items():
+                print("{:s}: {:.3f}".format(k, v))
+
+        # save if needed
+        if save_metrics:
+            os.makedirs(os.path.dirname(fname), exist_ok=True)
+            with open(fname, "wb") as fid:
+                pickle.dump(results, fid)
+            print("metrics saved in", fname)
+
+        # finalize submission if needed
+        if "submission" in args.save:
+            dataset.finalize_submission(submission_dir)
+
+
+if __name__ == "__main__":
+    # script entry point: parse the command line, then run the evaluation
+    parser = get_args_parser()
+    args = parser.parse_args()
+    main(args)
diff --git a/extern/CUT3R/src/croco/stereoflow/train.py b/extern/CUT3R/src/croco/stereoflow/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..c349cb479267648cad4d8b4c282dafd7a8896076
--- /dev/null
+++ b/extern/CUT3R/src/croco/stereoflow/train.py
@@ -0,0 +1,455 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+
+# --------------------------------------------------------
+# Main training function
+# --------------------------------------------------------
+
+import argparse
+import datetime
+import json
+import numpy as np
+import os
+import sys
+import time
+
+import torch
+import torch.distributed as dist
+import torch.backends.cudnn as cudnn
+from torch.utils.tensorboard import SummaryWriter
+import torchvision.transforms as transforms
+import torchvision.datasets as datasets
+from torch.utils.data import DataLoader
+
+import utils
+import utils.misc as misc
+from utils.misc import NativeScalerWithGradNormCount as NativeScaler
+from models.croco_downstream import CroCoDownstreamBinocular, croco_args_from_ckpt
+from models.pos_embed import interpolate_pos_embed
+from models.head_downstream import PixelwiseTaskWithDPT
+
+from stereoflow.datasets_stereo import (
+    get_train_dataset_stereo,
+    get_test_datasets_stereo,
+)
+from stereoflow.datasets_flow import get_train_dataset_flow, get_test_datasets_flow
+from stereoflow.engine import train_one_epoch, validate_one_epoch
+from stereoflow.criterion import *
+
+
+def get_args_parser():  # returns the ArgumentParser itself; the caller runs .parse_args()
+    # one required sub-command ("stereo" or "flow", stored in args.task), each with its own subparser
+    parser = argparse.ArgumentParser(
+        "Finetuning CroCo models on stereo or flow", add_help=False
+    )
+    subparsers = parser.add_subparsers(
+        title="Task (stereo or flow)", dest="task", required=True
+    )
+    parser_stereo = subparsers.add_parser("stereo", help="Training stereo model")
+    parser_flow = subparsers.add_parser("flow", help="Training flow model")
+
+    def add_arg(  # registers the same option on both subparsers, optionally with per-task defaults
+        name_or_flags, default=None, default_stereo=None, default_flow=None, **kwargs
+    ):
+        if default is not None:  # a shared default excludes the per-task ones
+            assert (
+                default_stereo is None and default_flow is None
+            ), "setting default makes default_stereo and default_flow disabled"
+        parser_stereo.add_argument(
+            name_or_flags,
+            default=default if default is not None else default_stereo,
+            **kwargs,
+        )
+        parser_flow.add_argument(
+            name_or_flags,
+            default=default if default is not None else default_flow,
+            **kwargs,
+        )
+
+    # output dir (main creates it with os.makedirs if it does not exist)
+    add_arg(
+        "--output_dir",
+        required=True,
+        type=str,
+        help="path where to save, if empty, automatically created",
+    )
+    # model
+    add_arg(
+        "--crop",
+        type=int,
+        nargs="+",
+        default_stereo=[352, 704],
+        default_flow=[320, 384],
+        help="size of the random image crops used during training.",
+    )
+    add_arg(
+        "--pretrained",
+        required=True,
+        type=str,
+        help="Load pretrained model (required as croco arguments come from there)",
+    )
+    # criterion (main passes this string to eval())
+    add_arg(
+        "--criterion",
+        default_stereo="LaplacianLossBounded2()",
+        default_flow="LaplacianLossBounded()",
+        type=str,
+        help="string to evaluate to get criterion",
+    )
+    add_arg("--bestmetric", default_stereo="avgerr", default_flow="EPE", type=str)
+    # dataset
+    add_arg("--dataset", type=str, required=True, help="training set")
+    # training
+    add_arg("--seed", default=0, type=int, help="seed")
+    add_arg(
+        "--batch_size",
+        default_stereo=6,
+        default_flow=8,
+        type=int,
+        help="Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus",
+    )
+    add_arg("--epochs", default=32, type=int, help="number of training epochs")
+    add_arg(
+        "--img_per_epoch",
+        type=int,
+        default=None,
+        help="Fix the number of images seen in an epoch (None means use all training pairs)",
+    )
+    add_arg(
+        "--accum_iter",
+        default=1,
+        type=int,
+        help="Accumulate gradient iterations (for increasing the effective batch size under memory constraints)",
+    )
+    add_arg(
+        "--weight_decay", type=float, default=0.05, help="weight decay (default: 0.05)"
+    )
+    add_arg(
+        "--lr",
+        type=float,
+        default_stereo=3e-5,
+        default_flow=2e-5,
+        metavar="LR",
+        help="learning rate (absolute lr)",
+    )
+    add_arg(
+        "--min_lr",
+        type=float,
+        default=0.0,
+        metavar="LR",
+        help="lower lr bound for cyclic schedulers that hit 0",
+    )
+    add_arg(
+        "--warmup_epochs", type=int, default=1, metavar="N", help="epochs to warmup LR"
+    )
+    add_arg(
+        "--optimizer",  # main prefixes this string with "torch.optim." and passes it to eval()
+        default="AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95))",
+        type=str,
+        help="Optimizer from torch.optim [ default: AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95)) ]",
+    )
+    add_arg(
+        "--amp",
+        default=0,
+        type=int,
+        choices=[0, 1],
+        help="enable automatic mixed precision training",
+    )
+    # validation (an empty --val_dataset disables validation in main)
+    add_arg(
+        "--val_dataset",
+        type=str,
+        default="",
+        help="Validation sets, multiple separated by + (empty string means that no validation is performed)",
+    )
+    add_arg(
+        "--tile_conf_mode",
+        type=str,
+        default_stereo="conf_expsigmoid_15_3",
+        default_flow="conf_expsigmoid_10_5",
+        help="Weights for tile aggregation",
+    )
+    add_arg(
+        "--val_overlap", default=0.7, type=float, help="Overlap value for the tiling"
+    )
+    # others: workers, evaluation/checkpoint frequency, warm start, logging, distributed init
+    add_arg("--num_workers", default=8, type=int)
+    add_arg("--eval_every", type=int, default=1, help="Val loss evaluation frequency")
+    add_arg("--save_every", type=int, default=1, help="Save checkpoint frequency")
+    add_arg(
+        "--start_from",
+        type=str,
+        default=None,
+        help="Start training using weights from an other model (eg for finetuning)",
+    )
+    add_arg(
+        "--tboard_log_step",
+        type=int,
+        default=100,
+        help="Log to tboard every so many steps",
+    )
+    add_arg(
+        "--dist_url", default="env://", help="url used to set up distributed training"
+    )
+
+    return parser
+
+
+def main(args):  # whole fine-tuning run: setup, (re)load weights, then train / validate / save per epoch
+    misc.init_distributed_mode(args)
+    global_rank = misc.get_rank()
+    num_tasks = misc.get_world_size()
+
+    assert os.path.isfile(args.pretrained)
+    print("output_dir: " + args.output_dir)
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    # fix the seed for reproducibility (offset by the rank, so every process gets its own seed)
+    seed = args.seed + misc.get_rank()
+    torch.manual_seed(seed)
+    np.random.seed(seed)
+    cudnn.benchmark = True
+
+    # Metrics / criterion
+    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+    metrics = (StereoMetrics if args.task == "stereo" else FlowMetrics)().to(device)
+    criterion = eval(args.criterion).to(device)  # NOTE(review): eval() of a CLI string, trusted input only
+    print("Criterion: ", args.criterion)
+
+    # Prepare model
+    assert os.path.isfile(args.pretrained)  # same check as at the top of main
+    ckpt = torch.load(args.pretrained, "cpu")
+    croco_args = croco_args_from_ckpt(ckpt)
+    croco_args["img_size"] = (args.crop[0], args.crop[1])
+    print("Croco args: " + str(croco_args))
+    args.croco_args = croco_args  # saved for test time
+    # prepare head: 1 output channel for stereo, 2 for flow, plus 1 when criterion.with_conf is set
+    num_channels = {"stereo": 1, "flow": 2}[args.task]
+    if criterion.with_conf:
+        num_channels += 1
+    print(f"Building head PixelwiseTaskWithDPT() with {num_channels} channel(s)")
+    head = PixelwiseTaskWithDPT()
+    head.num_channels = num_channels
+    # build model and load pretrained weights (strict=False: key mismatches are only printed)
+    model = CroCoDownstreamBinocular(head, **croco_args)
+    interpolate_pos_embed(model, ckpt["model"])
+    msg = model.load_state_dict(ckpt["model"], strict=False)
+    print(msg)
+
+    total_params = sum(p.numel() for p in model.parameters())
+    total_params_trainable = sum(
+        p.numel() for p in model.parameters() if p.requires_grad
+    )
+    print(f"Total params: {total_params}")
+    print(f"Total params trainable: {total_params_trainable}")
+    model_without_ddp = model.to(device)
+
+    eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size()
+    print("lr: %.2e" % args.lr)
+    print("accumulate grad iterations: %d" % args.accum_iter)
+    print("effective batch size: %d" % eff_batch_size)
+
+    if args.distributed:
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.gpu], static_graph=True
+        )
+        model_without_ddp = model.module
+
+    # following timm: set wd as 0 for bias and norm layers
+    param_groups = misc.get_parameter_groups(model_without_ddp, args.weight_decay)
+    optimizer = eval(f"torch.optim.{args.optimizer}")  # NOTE(review): eval(); the default string reads param_groups and args
+    print(optimizer)
+    loss_scaler = NativeScaler()  # NOTE(review): built without an accelerator; confirm against utils.misc
+
+    # automatic restart: resume from <output_dir>/checkpoint-last.pth whenever that file exists
+    last_ckpt_fname = os.path.join(args.output_dir, f"checkpoint-last.pth")
+    args.resume = last_ckpt_fname if os.path.isfile(last_ckpt_fname) else None
+
+    if not args.resume and args.start_from:
+        print(f"Starting from an other model's weights: {args.start_from}")
+        best_so_far = None
+        args.start_epoch = 0
+        ckpt = torch.load(args.start_from, "cpu")
+        msg = model_without_ddp.load_state_dict(ckpt["model"], strict=False)
+        print(msg)
+    else:
+        best_so_far = misc.load_model(
+            args=args,
+            model_without_ddp=model_without_ddp,
+            optimizer=optimizer,
+            loss_scaler=loss_scaler,
+        )
+
+    if best_so_far is None:
+        best_so_far = np.inf  # so that the first validation result always counts as the best
+
+    # tensorboard (writer exists on global rank 0 only)
+    log_writer = None
+    if global_rank == 0 and args.output_dir is not None:
+        log_writer = SummaryWriter(
+            log_dir=args.output_dir, purge_step=args.start_epoch * 1000
+        )
+
+    # dataset and loader
+    print("Building Train Data loader for dataset: ", args.dataset)
+    train_dataset = (
+        get_train_dataset_stereo if args.task == "stereo" else get_train_dataset_flow
+    )(args.dataset, crop_size=args.crop)
+
+    def _print_repr_dataset(d):  # prints repr() of every leaf dataset, recursing into ConcatDataset
+        if isinstance(d, torch.utils.data.dataset.ConcatDataset):
+            for dd in d.datasets:
+                _print_repr_dataset(dd)
+        else:
+            print(repr(d))
+
+    _print_repr_dataset(train_dataset)
+    print("  total length:", len(train_dataset))
+    if args.distributed:
+        sampler_train = torch.utils.data.DistributedSampler(
+            train_dataset, num_replicas=num_tasks, rank=global_rank, shuffle=True
+        )
+    else:
+        sampler_train = torch.utils.data.RandomSampler(train_dataset)
+    data_loader_train = torch.utils.data.DataLoader(
+        train_dataset,
+        sampler=sampler_train,
+        batch_size=args.batch_size,
+        num_workers=args.num_workers,
+        pin_memory=True,
+        drop_last=True,
+    )
+    if args.val_dataset == "":
+        data_loaders_val = None  # validation disabled; `bestmetric` is then never defined nor read
+    else:
+        print("Building Val Data loader for datasets: ", args.val_dataset)
+        val_datasets = (
+            get_test_datasets_stereo
+            if args.task == "stereo"
+            else get_test_datasets_flow
+        )(args.val_dataset)
+        for val_dataset in val_datasets:
+            print(repr(val_dataset))
+        data_loaders_val = [
+            DataLoader(
+                val_dataset,
+                batch_size=1,
+                shuffle=False,
+                num_workers=args.num_workers,
+                pin_memory=True,
+                drop_last=False,
+            )
+            for val_dataset in val_datasets
+        ]
+        bestmetric = (
+            "AVG_"
+            if len(data_loaders_val) > 1
+            else str(data_loaders_val[0].dataset) + "_"
+        ) + args.bestmetric  # key read from val_stats after each validation
+
+    print(f"Start training for {args.epochs} epochs")
+    start_time = time.time()
+    # Training Loop (starts at args.start_epoch, which is non-zero after a resume)
+    for epoch in range(args.start_epoch, args.epochs):
+
+        if args.distributed:
+            data_loader_train.sampler.set_epoch(epoch)
+
+        # Train
+        epoch_start = time.time()
+        train_stats = train_one_epoch(
+            model,
+            criterion,
+            metrics,
+            data_loader_train,
+            optimizer,
+            device,
+            epoch,
+            loss_scaler,
+            log_writer=log_writer,
+            args=args,
+        )
+        epoch_time = time.time() - epoch_start
+
+        if args.distributed:
+            dist.barrier()
+
+        # Validation (current naive implementation runs the validation on every gpu ... not smart ...)
+        if (
+            data_loaders_val is not None
+            and args.eval_every > 0
+            and (epoch + 1) % args.eval_every == 0
+        ):
+            val_epoch_start = time.time()
+            val_stats = validate_one_epoch(
+                model,
+                criterion,
+                metrics,
+                data_loaders_val,
+                device,
+                epoch,
+                log_writer=log_writer,
+                args=args,
+            )
+            val_epoch_time = time.time() - val_epoch_start
+
+            val_best = val_stats[bestmetric]
+
+            # Save best of all (lower is better; a tie also overwrites the previous best)
+            if val_best <= best_so_far:
+                best_so_far = val_best
+                misc.save_model(  # NOTE(review): no `accelerator` argument is passed; confirm utils.misc.save_model
+                    args=args,
+                    model_without_ddp=model_without_ddp,
+                    optimizer=optimizer,
+                    loss_scaler=loss_scaler,
+                    epoch=epoch,
+                    best_so_far=best_so_far,
+                    fname="best",
+                )
+
+            log_stats = {
+                **{f"train_{k}": v for k, v in train_stats.items()},
+                "epoch": epoch,
+                **{f"val_{k}": v for k, v in val_stats.items()},
+            }
+        else:
+            log_stats = {
+                **{f"train_{k}": v for k, v in train_stats.items()},
+                "epoch": epoch,
+            }
+
+        if args.distributed:
+            dist.barrier()
+
+        # Save the "last" checkpoint every save_every epochs and after the final epoch
+        if args.output_dir and (
+            (epoch + 1) % args.save_every == 0 or epoch + 1 == args.epochs
+        ):
+            misc.save_model(
+                args=args,
+                model_without_ddp=model_without_ddp,
+                optimizer=optimizer,
+                loss_scaler=loss_scaler,
+                epoch=epoch,
+                best_so_far=best_so_far,
+                fname="last",
+            )
+
+        if args.output_dir:
+            if log_writer is not None:
+                log_writer.flush()
+            with open(
+                os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8"
+            ) as f:
+                f.write(json.dumps(log_stats) + "\n")  # one JSON object per epoch
+
+    total_time = time.time() - start_time
+    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+    print("Training time {}".format(total_time_str))
+
+
+if __name__ == "__main__":  # script entry point
+    args = get_args_parser()  # here `args` is still the ArgumentParser
+    args = args.parse_args()  # rebound to the parsed namespace (args.task holds the sub-command)
+    main(args)
diff --git a/extern/CUT3R/src/croco/utils/misc.py b/extern/CUT3R/src/croco/utils/misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0550de7bfd7308c2e9a738bbb76cab08a1a8ef7
--- /dev/null
+++ b/extern/CUT3R/src/croco/utils/misc.py
@@ -0,0 +1,600 @@
+# Copyright (C) 2022-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# utilitary functions for CroCo
+# --------------------------------------------------------
+# References:
+# MAE: https://github.com/facebookresearch/mae
+# DeiT: https://github.com/facebookresearch/deit
+# BEiT: https://github.com/microsoft/unilm/tree/master/beit
+# --------------------------------------------------------
+
+import builtins
+import datetime
+import os
+import time
+import math
+import json
+from collections import defaultdict, deque
+from pathlib import Path
+import numpy as np
+
+import torch
+import torch.distributed as dist
+from torch import inf
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+
+printer = get_logger(__name__, log_level="DEBUG")
+
+
+class SmoothedValue(object):
+    """Track a series of values: statistics over a recent window plus a global average."""
+
+    def __init__(self, window_size=20, fmt=None):
+        if fmt is None:
+            fmt = "{median:.4f} ({global_avg:.4f})"
+        self.deque = deque(maxlen=window_size)  # keeps only the last window_size values
+        self.total = 0.0  # sum of value * n over all updates
+        self.count = 0  # sum of n over all updates
+        self.fmt = fmt  # __str__ template; may use median, avg, global_avg, max, value
+
+    def update(self, value, n=1):
+        self.deque.append(value)  # the window stores `value` once, whatever n is
+        self.count += n
+        self.total += value * n
+
+    def synchronize_between_processes(self, accelerator: Accelerator):
+        """Sum count and total over all processes; the window (deque) is left untouched."""
+        if accelerator.num_processes == 1:
+            return
+        t = torch.tensor(
+            [self.count, self.total], dtype=torch.float64, device=accelerator.device
+        )
+        accelerator.wait_for_everyone()
+        # reduce() returns the summed tensor rather than updating `t`: keep its result
+        t = accelerator.reduce(t, reduction="sum").tolist()
+        self.count = int(t[0])
+        self.total = t[1]
+
+    @property
+    def median(self):
+        return torch.tensor(list(self.deque)).median().item()
+
+    @property
+    def avg(self):
+        return torch.tensor(list(self.deque), dtype=torch.float32).mean().item()
+
+    @property
+    def global_avg(self):
+        return self.total / self.count  # raises ZeroDivisionError while count is 0
+
+    @property
+    def max(self):
+        return max(self.deque)
+
+    @property
+    def value(self):
+        return self.deque[-1]  # most recently appended value
+
+    def __str__(self):
+        return self.fmt.format(
+            median=self.median,
+            avg=self.avg,
+            global_avg=self.global_avg,
+            max=self.max,
+            value=self.value,
+        )
+
+
+class MetricLogger(object):
+    """Collection of named SmoothedValue meters with a progress-logging iterator wrapper."""
+    def __init__(self, delimiter="\t"):
+        self.meters = defaultdict(SmoothedValue)  # a meter is created on its first update
+        self.delimiter = delimiter  # separator used by __str__ and by log_every
+
+    def update(self, **kwargs):
+        for k, v in kwargs.items():
+            if v is None:
+                continue
+            if isinstance(v, torch.Tensor):
+                if v.ndim > 0:
+                    continue  # only 0-dim tensors are recorded
+                v = v.item()
+            if isinstance(v, list):
+                continue
+            assert isinstance(v, (float, int))
+            self.meters[k].update(v)
+
+    def __getattr__(self, attr):
+        if attr in self.meters:  # lets meters be read as attributes, e.g. logger.loss
+            return self.meters[attr]
+        if attr in self.__dict__:
+            return self.__dict__[attr]
+        raise AttributeError(
+            "'{}' object has no attribute '{}'".format(type(self).__name__, attr)
+        )
+
+    def __str__(self):
+        loss_str = []
+        for name, meter in self.meters.items():
+            loss_str.append("{}: {}".format(name, str(meter)))
+        return self.delimiter.join(loss_str)
+
+    def synchronize_between_processes(self, accelerator):
+        for meter in self.meters.values():
+            meter.synchronize_between_processes(accelerator)
+
+    def add_meter(self, name, meter):
+        self.meters[name] = meter
+
+    def log_every(
+        self, iterable, print_freq, accelerator: Accelerator, header=None, max_iter=None
+    ):
+        """Yield the items of `iterable`, logging progress on the main process.
+
+        Args:
+            iterable: must support len(); used for the item counter and the ETA.
+            print_freq: log every `print_freq` items and on the last expected one.
+            accelerator: only `is_main_process` is read, to gate the logging.
+            header: optional prefix of every log line.
+            max_iter: if truthy, yield at most this many items.
+        """
+        if not header:
+            header = ""
+        start_time = time.time()
+        end = time.time()
+        iter_time = SmoothedValue(fmt="{avg:.4f}")
+        data_time = SmoothedValue(fmt="{avg:.4f}")
+        # number of items this call is expected to yield
+        len_iterable = min(len(iterable), max_iter) if max_iter else len(iterable)
+        # pad the item counter to the width of len_iterable
+        space_fmt = ":" + str(len(str(len_iterable))) + "d"
+        log_msg = [
+            header,
+            "[{0" + space_fmt + "}/{1}]",
+            "eta: {eta}",
+            "{meters}",
+            "time: {time}",
+            "data: {data}",
+        ]
+        use_cuda = torch.cuda.is_available()  # decides whether the memory field is logged
+        if use_cuda:
+            log_msg.append("max mem: {memory:.0f}")
+        log_msg = self.delimiter.join(log_msg)
+        MB = 1024.0 * 1024.0
+        for it, obj in enumerate(iterable):
+            data_time.update(time.time() - end)  # time spent waiting for this item
+            yield obj
+            iter_time.update(time.time() - end)  # wait time plus the consumer's work on it
+            # log on every print_freq-th item and on the last expected one
+            if it % print_freq == 0 or it == len_iterable - 1:
+                eta_seconds = iter_time.global_avg * (len_iterable - it)
+                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
+                if accelerator.is_main_process:
+                    fields = dict(
+                        eta=eta_string,
+                        meters=str(self),
+                        time=str(iter_time),
+                        data=str(data_time),
+                    )
+                    if use_cuda:
+                        fields["memory"] = torch.cuda.max_memory_allocated() / MB
+                    printer.info(log_msg.format(it, len_iterable, **fields))
+            end = time.time()
+            # `it` is 0-based, so it + 1 items have been yielded at this point:
+            # stop as soon as that count reaches max_iter.
+            if max_iter and it + 1 >= max_iter:
+                break
+        total_time = time.time() - start_time
+        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+        if accelerator.is_main_process:
+            # max(..., 1) keeps the per-item average defined for an empty iterable
+            printer.info(
+                "{} Total time: {} ({:.4f} s / it)".format(
+                    header, total_time_str, total_time / max(len_iterable, 1)
+                )
+            )
+
+
+def setup_for_distributed(is_master):
+    """Replace builtins.print by a time-stamped wrapper that stays silent on non-master
+    processes unless force=True is passed or the world size is above 8."""
+    wrapped_print = builtins.print
+
+    def print(*args, **kwargs):
+        forced = kwargs.pop("force", False) or (get_world_size() > 8)
+        if not (is_master or forced):
+            return
+        # time stamp first (no newline), then the caller's message on the same line
+        stamp = datetime.datetime.now().time()
+        wrapped_print("[{}] ".format(stamp), end="")
+        wrapped_print(*args, **kwargs)
+
+    builtins.print = print
+
+
+def is_dist_avail_and_initialized():
+    """Return True only when torch.distributed is both available and initialized."""
+    # `and` short-circuits: is_initialized() is consulted only after
+    # is_available() has returned a truthy value.
+    ready = dist.is_available() and dist.is_initialized()
+    return True if ready else False
+
+
+def get_world_size():
+    """Number of processes reported by torch.distributed; 1 when it is not in use."""
+    distributed = is_dist_avail_and_initialized()
+    return dist.get_world_size() if distributed else 1
+
+
+def get_rank():
+    """Rank of this process reported by torch.distributed; 0 when it is not in use."""
+    distributed = is_dist_avail_and_initialized()
+    return dist.get_rank() if distributed else 0
+
+
+def is_main_process(accelerator: Accelerator):  # thin wrapper over the accelerator's own flag
+    return accelerator.is_main_process
+
+
+def save_on_master(accelerator: Accelerator, *args, **kwargs):  # torch.save, main process only
+    if is_main_process(accelerator):  # every other process returns without writing anything
+        torch.save(*args, **kwargs)  # remaining arguments are forwarded unchanged
+
+
+def init_distributed_mode(args):  # sets args.distributed and, when distributed, rank/world_size/gpu/dist_backend
+    nodist = args.nodist if hasattr(args, "nodist") else False  # optional opt-out flag on args
+    if "RANK" in os.environ and "WORLD_SIZE" in os.environ and not nodist:
+        args.rank = int(os.environ["RANK"])
+        args.world_size = int(os.environ["WORLD_SIZE"])
+        args.gpu = int(os.environ["LOCAL_RANK"])  # KeyError if LOCAL_RANK is not set alongside RANK
+    else:
+        print("Not using distributed mode")
+        setup_for_distributed(is_master=True)  # hack: still installs the time-stamped print wrapper
+        args.distributed = False
+        return
+
+    args.distributed = True
+
+    torch.cuda.set_device(args.gpu)
+    args.dist_backend = "nccl"  # backend is hard-coded
+    print(
+        "| distributed init (rank {}): {}, gpu {}".format(
+            args.rank, args.dist_url, args.gpu
+        ),
+        flush=True,
+    )
+    torch.distributed.init_process_group(
+        backend=args.dist_backend,
+        init_method=args.dist_url,
+        world_size=args.world_size,
+        rank=args.rank,
+    )
+    torch.distributed.barrier()
+    setup_for_distributed(args.rank == 0)  # from here on only rank 0 prints (unless forced)
+
+
+class NativeScalerWithGradNormCount:  # callable: backward pass, optional clipping, optimizer step
+    state_dict_key = "amp_scaler"
+
+    def __init__(self, enabled=True, accelerator: Accelerator = None):
+        self.accelerator = accelerator  # `enabled` is accepted but not used in this class
+
+    def __call__(
+        self,
+        loss,
+        optimizer,
+        clip_grad=None,
+        parameters=None,
+        create_graph=False,
+        update_grad=True,
+    ):
+        self.accelerator.backward(
+            loss, create_graph=create_graph
+        )  # the backward pass is delegated to the accelerator
+        if update_grad:
+            if clip_grad is not None:
+                assert parameters is not None
+                # clip with max norm `clip_grad`; the accelerator's return value is reported as `norm`
+                norm = self.accelerator.clip_grad_norm_(parameters, clip_grad)
+            else:
+                if self.accelerator.scaler is not None:
+                    self.accelerator.unscale_gradients()
+                norm = get_grad_norm_(parameters)  # no clipping: the norm is only measured
+            optimizer.step()
+        else:
+            norm = None  # no optimizer step on this call (presumably gradient accumulation)
+        return norm
+
+    def state_dict(self):
+        if self.accelerator.scaler is not None:
+            return self.accelerator.scaler.state_dict()
+        else:
+            return {}  # the accelerator has no scaler, so there is nothing to save
+
+    def load_state_dict(self, state_dict):
+        if self.accelerator.scaler is not None:  # state is ignored when there is no scaler
+            self.accelerator.scaler.load_state_dict(state_dict)
+
+
+# class NativeScalerWithGradNormCount:
+#     state_dict_key = "amp_scaler"
+
+#     def __init__(self, enabled=True, accelerator:Accelerator=None):
+#         self._scaler = torch.cuda.amp.GradScaler(enabled=enabled)
+#         self.accelerator = accelerator
+
+#     def __call__(self, loss, optimizer, clip_grad=None, parameters=None, create_graph=False, update_grad=True):
+#         # self.accelerator.backward(loss, create_graph=create_graph) #.backward(create_graph=create_graph)
+#         self._scaler.scale(loss).backward(create_graph=create_graph)
+#         if update_grad:
+#             if clip_grad is not None:
+#                 assert parameters is not None
+#                 # #self._scaler.unscale_(optimizer)  # unscale the gradients of optimizer's assigned params in-place
+#                 # norm = self.accelerator.clip_grad_norm_(parameters, clip_grad)
+#                 self._scaler.unscale_(optimizer)  # unscale the gradients of optimizer's assigned params in-place
+#                 norm = torch.nn.utils.clip_grad_norm_(parameters, clip_grad)
+#             else:
+#                 # if self.accelerator.scaler is not None:
+#                 #     self.accelerator.unscale_gradients()
+#                 # norm = get_grad_norm_(parameters)
+#                 self._scaler.unscale_(optimizer)
+#                 norm = get_grad_norm_(parameters)
+#             # optimizer.step()
+#             self._scaler.step(optimizer)
+#             self._scaler.update()
+#         else:
+#             norm = None
+#         return norm
+
+#     # def state_dict(self):
+#     #     if self.accelerator.scaler is not None:
+#     #         return self.accelerator.scaler.state_dict()
+#     #     else:
+#     #         return {}
+
+#     # def load_state_dict(self, state_dict):
+#     #     if self.accelerator.scaler is not None:
+#     #         self.accelerator.scaler.load_state_dict(state_dict)
+
+#     def state_dict(self):
+#         return self._scaler.state_dict()
+
+#     def load_state_dict(self, state_dict):
+#         self._scaler.load_state_dict(state_dict)
+
+
+def get_grad_norm_(parameters, norm_type: float = 2.0) -> torch.Tensor:
+    """Return the norm of the gradients of `parameters`, taken together.
+
+    `parameters` is a tensor or an iterable of tensors; entries whose `.grad`
+    is None are ignored. The result is the `norm_type`-norm of the per-tensor
+    gradient norms, or the largest absolute gradient entry when `norm_type` is inf.
+    """
+    if isinstance(parameters, torch.Tensor):
+        parameters = [parameters]
+    grads = [p.grad.detach() for p in parameters if p.grad is not None]
+    norm_type = float(norm_type)
+    if not grads:
+        return torch.tensor(0.0)
+    # every partial result is moved to the device of the first gradient
+    device = grads[0].device
+    if norm_type == inf:
+        return max(g.abs().max().to(device) for g in grads)
+    return torch.norm(torch.stack([torch.norm(g, norm_type).to(device) for g in grads]), norm_type)
+
+
+def save_model(
+    accelerator,
+    args,
+    epoch,
+    model_without_ddp,
+    optimizer,
+    loss_scaler,
+    fname=None,
+    best_so_far=None,
+):
+    """Save checkpoint-<fname>.pth under args.output_dir, on the main process only."""
+    if not accelerator.is_main_process:
+        return
+    tag = str(epoch) if fname is None else fname  # default file tag is the epoch number
+    checkpoint_path = Path(args.output_dir) / ("checkpoint-%s.pth" % tag)
+    to_save = {
+        "model": model_without_ddp.state_dict(),
+        "optimizer": optimizer.state_dict(),
+        "scaler": loss_scaler.state_dict(),
+        "args": args,
+        "epoch": epoch,
+    }
+    if best_so_far is not None:
+        to_save["best_so_far"] = best_so_far  # stored only when provided
+    print(f">> Saving model to {checkpoint_path} ...")
+    save_on_master(accelerator, to_save, checkpoint_path)
+
+
+def load_model(args, model_without_ddp, optimizer, loss_scaler):  # returns best_so_far or None
+    args.start_epoch = 0  # stays 0 when there is nothing to resume from
+    best_so_far = None
+    if args.resume is not None:
+        if args.resume.startswith("https"):  # remote checkpoint: fetched through torch.hub
+            checkpoint = torch.hub.load_state_dict_from_url(
+                args.resume, map_location="cpu", check_hash=True
+            )
+        else:
+            checkpoint = torch.load(args.resume, map_location="cpu")
+        printer.info("Resume checkpoint %s" % args.resume)
+        model_without_ddp.load_state_dict(checkpoint["model"], strict=False)
+        args.start_epoch = checkpoint["epoch"] + 1  # continue with the epoch after the saved one
+        optimizer.load_state_dict(checkpoint["optimizer"])
+        if "scaler" in checkpoint:
+            loss_scaler.load_state_dict(checkpoint["scaler"])
+        if "best_so_far" in checkpoint:
+            best_so_far = checkpoint["best_so_far"]
+            printer.info(" & best_so_far={:g}".format(best_so_far))
+        else:
+            printer.info("")
+        printer.info("With optim & sched! start_epoch={:d}".format(args.start_epoch))
+    return best_so_far
+
+
+def all_reduce_mean(x, accelerator):
+    """Return the mean of scalar `x` over all processes (`x` itself if single-process)."""
+    if accelerator.state.num_processes > 1:
+        x_reduce = torch.tensor(x, device=accelerator.device)
+        # reduce() takes `reduction=` (not `reduce_op=`) and returns the summed tensor
+        x_reduce = accelerator.reduce(x_reduce, reduction="sum")
+        return x_reduce.item() / accelerator.state.num_processes
+    else:
+        return x
+
+
+def _replace(text, src, tgt, rm=""):
+    """Element-wise substitution followed by deletion.
+
+    Each element of `src` is replaced, in order, by the element of `tgt` at the same
+    position (a single-element `tgt` is reused for every element of `src`); then
+    every element of `rm` is removed. Replacements are applied one after another.
+    """
+    tgt = tgt * len(src) if len(tgt) == 1 else tgt
+    assert len(src) == len(tgt), f"'{src}' and '{tgt}' should have the same len"
+    # a removal is a replacement by the empty string, done after all substitutions
+    steps = list(zip(src, tgt)) + [(c, "") for c in rm]
+    for old, new in steps:
+        text = text.replace(old, new)
+    return text
+
+
+def filename(obj):
+    """Turn a python object (through repr) or a command string into a filename: drop "()",
+    map "_"->"-", ","->"_", "("->"_", "*"->"x", "/"->"%", chr(1)->"/" and chr(2)->",",
+    then remove spaces, ")" and quote characters.
+    """
+    if not isinstance(obj, str):
+        obj = repr(obj)
+    obj = str(obj).replace("()", "")
+    obj = _replace(obj, "_,(*/\1\2", "-__x%/,", rm=" )'\"")  # substitutions run left to right
+    assert all(len(s) < 256 for s in obj.split(os.sep)), (
+        "filename too long (>256 characters):\n" + obj
+    )  # checked per path component, split on os.sep
+    return obj
+
+
+def _get_num_layer_for_vit(var_name, enc_depth, dec_depth):
+    """Return the layer index assigned to the parameter called `var_name`.
+
+    The listed tokens, pos_embed and patch_embed get 0, encoder block i gets i + 1,
+    decoder block j gets enc_depth + j + 1 and heads get enc_depth + dec_depth + 1."""
+    if var_name in ("cls_token", "mask_token", "pos_embed", "global_tokens"):
+        return 0
+    if var_name.startswith("patch_embed"):
+        return 0
+    if var_name.startswith("enc_blocks"):
+        return int(var_name.split(".")[1]) + 1  # second dotted field is the block index
+    if var_name.startswith(("decoder_embed", "enc_norm")):
+        return enc_depth  # counted with the last encoder block
+    if var_name.startswith("dec_blocks"):
+        return enc_depth + int(var_name.split(".")[1]) + 1
+    if var_name.startswith("dec_norm"):
+        return enc_depth + dec_depth  # counted with the last decoder block
+    if var_name.startswith(("head", "prediction_head")):
+        return enc_depth + dec_depth + 1
+    # any other parameter name is unexpected
+    raise NotImplementedError(var_name)
+
+
+def get_parameter_groups(
+    model, weight_decay, layer_decay=1.0, skip_list=(), no_lr_scale_list=()
+):
+    """Build optimizer parameter groups with weight decay and layer-wise LR scaling.
+
+    Args:
+        model: module providing named_parameters(); when layer_decay < 1.0 it
+            must also expose enc_depth (and dec_depth if it has dec_blocks).
+        weight_decay: decay applied to every parameter that is not exempt.
+        layer_decay: 1.0 disables LR scaling; a value in (0, 1) gives the group
+            of layer `i` lr_scale = layer_decay ** (num_layers + 1 - i).
+        skip_list: parameter names exempt from weight decay (as are 1-D
+            parameters and names ending in ".bias").
+        no_lr_scale_list: parameter names that keep lr_scale 1.0.
+
+    Returns:
+        A list of dicts with keys "weight_decay", "params" and "lr_scale";
+        frozen parameters (requires_grad False) are left out.
+    """
+    parameter_group_names = {}
+    parameter_group_vars = {}
+    enc_depth, dec_depth = None, None
+    # prepare layer decay values
+    assert layer_decay == 1.0 or 0.0 < layer_decay < 1.0
+    if layer_decay < 1.0:
+        enc_depth = model.enc_depth
+        dec_depth = model.dec_depth if hasattr(model, "dec_blocks") else 0
+        num_layers = enc_depth + dec_depth
+        layer_decay_values = [
+            layer_decay ** (num_layers + 1 - i) for i in range(num_layers + 2)
+        ]
+
+    for name, param in model.named_parameters():
+        if not param.requires_grad:
+            continue  # frozen weights
+
+        # Assign weight decay values
+        no_decay = len(param.shape) == 1 or name.endswith(".bias") or name in skip_list
+        group_name = "no_decay" if no_decay else "decay"
+        if "enc_blocks" in name:
+            group_name += "_enc_blocks"
+        this_weight_decay = 0.0 if no_decay else weight_decay
+
+        # Assign layer ID for LR scaling
+        if layer_decay < 1.0:
+            layer_id = _get_num_layer_for_vit(name, enc_depth, dec_depth)
+            group_name = "layer_%d_%s" % (layer_id, group_name)
+            skip_scale = name in no_lr_scale_list
+            if skip_scale:
+                group_name = f"{group_name}_no_lr_scale"
+        else:
+            layer_id = 0
+            skip_scale = True
+
+        if group_name not in parameter_group_names:
+            scale = 1.0 if skip_scale else layer_decay_values[layer_id]
+            # Two parallel dicts: one collects names (JSON-loggable), the other
+            # collects the actual parameter tensors handed to the optimizer.
+            for groups in (parameter_group_names, parameter_group_vars):
+                groups[group_name] = {
+                    "weight_decay": this_weight_decay,
+                    "params": [],
+                    "lr_scale": scale,
+                }
+
+        parameter_group_vars[group_name]["params"].append(param)
+        parameter_group_names[group_name]["params"].append(name)
+    printer.info("Param groups = %s" % json.dumps(parameter_group_names, indent=2))
+    return list(parameter_group_vars.values())
+
+
+def adjust_learning_rate(optimizer, epoch, args):
+    """Set the LR of every param group for `epoch` and return the base LR.
+
+    The base LR ramps linearly from 0 to args.lr over args.warmup_epochs, then
+    follows a half-cycle cosine from args.lr down to args.min_lr at args.epochs.
+    A group's "lr_scale" entry, when present, multiplies the base LR.
+
+    `args` must provide lr, min_lr, warmup_epochs and epochs; `optimizer` must
+    expose `param_groups` as an iterable of dicts.
+    """
+    warmup = args.warmup_epochs
+    if epoch < warmup:
+        # linear warmup
+        lr = args.lr * epoch / warmup
+    else:
+        # half-cycle cosine over the epochs that remain after warmup
+        angle = math.pi * (epoch - warmup) / (args.epochs - warmup)
+        lr = args.min_lr + (args.lr - args.min_lr) * 0.5 * (1.0 + math.cos(angle))
+
+    for group in optimizer.param_groups:
+        group["lr"] = lr * group["lr_scale"] if "lr_scale" in group else lr
+
+    return lr
diff --git a/extern/CUT3R/src/dust3r/__init__.py b/extern/CUT3R/src/dust3r/__init__.py
new file mode 100755
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/extern/CUT3R/src/dust3r/blocks.py b/extern/CUT3R/src/dust3r/blocks.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ee03a37aa5173a23969a1096208c8442a66a043
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/blocks.py
@@ -0,0 +1,531 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# modified from DUSt3R
+
+import torch
+import torch.nn as nn
+
+from itertools import repeat
+import collections.abc
+from torch.nn.functional import scaled_dot_product_attention
+from functools import partial
+
+
+def _ntuple(n):
+    """Return a converter that repeats a scalar (or str) n times; other iterables pass through."""
+    def parse(x):
+        is_iterable = isinstance(x, collections.abc.Iterable)
+        return x if is_iterable and not isinstance(x, str) else tuple(repeat(x, n))
+
+    return parse
+
+
+to_2tuple = _ntuple(2)
+
+
+def drop_path(
+    x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
+):
+    """Stochastic depth: zero whole samples of `x` with probability `drop_prob`.
+
+    A no-op unless `training` is true and `drop_prob` is non-zero. Survivors are
+    divided by the keep probability when `scale_by_keep` is set.
+    """
+    if not training or drop_prob == 0.0:
+        return x
+    keep_prob = 1 - drop_prob
+    # one Bernoulli draw per sample, broadcast over every remaining dimension
+    mask = x.new_empty([x.shape[0]] + [1] * (x.ndim - 1)).bernoulli_(keep_prob)
+    return x * (mask.div_(keep_prob) if scale_by_keep and keep_prob > 0.0 else mask)
+
+
+class DropPath(nn.Module):
+    """Module wrapper around `drop_path` that forwards the module's training flag."""
+
+    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
+        super().__init__()
+        self.drop_prob, self.scale_by_keep = drop_prob, scale_by_keep
+
+    def forward(self, x):
+        # self.training is passed along, so the drop follows train()/eval()
+        return drop_path(x, self.drop_prob, training=self.training, scale_by_keep=self.scale_by_keep)
+
+    def extra_repr(self):
+        return f"drop_prob={round(self.drop_prob,3):0.3f}"
+
+
+class Mlp(nn.Module):
+    """Two-layer perceptron (fc1, activation, dropout, fc2, dropout); bias/drop may be per-layer pairs."""
+
+    def __init__(
+        self,
+        in_features,
+        hidden_features=None,
+        out_features=None,
+        act_layer=nn.GELU,
+        bias=True,
+        drop=0.0,
+    ):
+        super().__init__()
+        hidden_features = hidden_features or in_features
+        out_features = out_features or in_features
+        biases = to_2tuple(bias)
+        drop_probs = to_2tuple(drop)
+        self.fc1 = nn.Linear(in_features, hidden_features, bias=biases[0])
+        self.act = act_layer()
+        self.drop1 = nn.Dropout(drop_probs[0])
+        self.fc2 = nn.Linear(hidden_features, out_features, bias=biases[1])
+        self.drop2 = nn.Dropout(drop_probs[1])
+
+    def forward(self, x):
+        hidden = self.drop1(self.act(self.fc1(x)))
+        return self.drop2(self.fc2(hidden))
+
+
+class Attention(nn.Module):
+    """Multi-head self-attention with optional rotary position embedding (RoPE).
+
+    `rope`, when given, is a module called as rope(tokens, positions) on the
+    per-head queries and keys; it is run in float32 with CUDA autocast disabled.
+    """
+
+    def __init__(
+        self, dim, rope=None, num_heads=8, qkv_bias=False, attn_drop=0.0, proj_drop=0.0
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim**-0.5
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+        self.rope = rope.float() if rope is not None else None
+
+    def forward(self, x, xpos):
+        """Attend over `x` of shape (B, N, C); `xpos` is forwarded to `rope` if set."""
+        B, N, C = x.shape
+
+        # (B, N, 3, H, C/H) -> (B, H, 3, N, C/H), then split q/k/v along dim 2
+        qkv = (
+            self.qkv(x)
+            .reshape(B, N, 3, self.num_heads, C // self.num_heads)
+            .transpose(1, 3)
+        )
+        q, k, v = qkv.unbind(dim=2)
+
+        if self.rope is not None:
+            q_type, k_type = q.dtype, k.dtype
+            with torch.autocast(device_type="cuda", enabled=False):
+                q = self.rope(q.float(), xpos).to(q_type)
+                k = self.rope(k.float(), xpos).to(k_type)
+
+        # The functional SDPA applies dropout whenever dropout_p > 0, regardless of
+        # module mode, so attention dropout must be switched off outside training.
+        dropout_p = self.attn_drop.p if self.training else 0.0
+        x = scaled_dot_product_attention(
+            query=q, key=k, value=v, dropout_p=dropout_p, scale=self.scale
+        )
+
+        x = self.proj(x.transpose(1, 2).reshape(B, N, C))
+        return self.proj_drop(x)
+
+
+class Block(nn.Module):
+    """Pre-norm transformer block: residual self-attention, then residual MLP."""
+
+    def __init__(
+        self,
+        dim,
+        num_heads,
+        mlp_ratio=4.0,
+        qkv_bias=False,
+        drop=0.0,
+        attn_drop=0.0,
+        drop_path=0.0,
+        act_layer=nn.GELU,
+        norm_layer=nn.LayerNorm,
+        rope=None,
+    ):
+        super().__init__()
+        attn_kwargs = dict(
+            rope=rope,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+        )
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(dim, **attn_kwargs)
+        self.drop_path = nn.Identity() if not drop_path > 0.0 else DropPath(drop_path)
+        self.norm2 = norm_layer(dim)
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=int(dim * mlp_ratio),
+            act_layer=act_layer,
+            drop=drop,
+        )
+
+    def forward(self, x, xpos):
+        """Apply the block to tokens `x`; `xpos` is passed through to the attention layer."""
+        attn_out = self.attn(self.norm1(x), xpos)
+        x = x + self.drop_path(attn_out)
+        return x + self.drop_path(self.mlp(self.norm2(x)))
+
+
+class CrossAttention(nn.Module):
+    """Multi-head cross-attention with optional rotary position embedding (RoPE).
+
+    Queries, keys and values come from separate inputs with separate linear
+    projections. `rope`, when given, is called as rope(tokens, positions) on the
+    per-head queries/keys in float32 with CUDA autocast disabled.
+    """
+
+    def __init__(
+        self, dim, rope=None, num_heads=8, qkv_bias=False, attn_drop=0.0, proj_drop=0.0
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim**-0.5
+
+        self.projq = nn.Linear(dim, dim, bias=qkv_bias)
+        self.projk = nn.Linear(dim, dim, bias=qkv_bias)
+        self.projv = nn.Linear(dim, dim, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+        self.rope = rope.float() if rope is not None else None
+
+    def _split_heads(self, projected):
+        """Reshape (B, N, C) projections to (B, num_heads, N, C // num_heads)."""
+        B, N, C = projected.shape
+        return projected.reshape(B, N, self.num_heads, C // self.num_heads).permute(
+            0, 2, 1, 3
+        )
+
+    def _apply_rope(self, tokens, positions):
+        """Rotate `tokens` in float32 (autocast off) and cast back to their dtype."""
+        with torch.autocast(device_type="cuda", enabled=False):
+            return self.rope(tokens.float(), positions).to(tokens.dtype)
+
+    def forward(self, query, key, value, qpos, kpos):
+        """Attend from `query` (B, Nq, C) to the `key` / `value` token sequences.
+
+        `qpos` / `kpos` are positions for RoPE; passing None for either skips the
+        rotation of the corresponding tensor.
+        """
+        B, Nq, C = query.shape
+
+        # (B, N, C) -> (B, heads, N, C // heads) for each projected input
+        q = self._split_heads(self.projq(query))
+        k = self._split_heads(self.projk(key))
+        v = self._split_heads(self.projv(value))
+
+        # RoPE is applied to queries and keys independently
+        if self.rope is not None:
+            if qpos is not None:
+                q = self._apply_rope(q, qpos)
+            if kpos is not None:
+                k = self._apply_rope(k, kpos)
+
+        # The functional SDPA applies dropout whenever dropout_p > 0, regardless of
+        # module mode, so attention dropout must be switched off outside training.
+        dropout_p = self.attn_drop.p if self.training else 0.0
+        x = scaled_dot_product_attention(
+            query=q, key=k, value=v, dropout_p=dropout_p, scale=self.scale
+        )
+
+        x = self.proj(x.transpose(1, 2).reshape(B, Nq, C))
+        return self.proj_drop(x)
+
+
+class DecoderBlock(nn.Module):
+    """Decoder block with self-attention, cross-attention to a memory `y`, and an MLP.
+
+    Each sub-layer is pre-normalised and added back through a residual branch.
+    """
+
+    def __init__(
+        self,
+        dim,
+        num_heads,
+        mlp_ratio=4.0,
+        qkv_bias=False,
+        drop=0.0,
+        attn_drop=0.0,
+        drop_path=0.0,
+        act_layer=nn.GELU,
+        norm_layer=nn.LayerNorm,
+        norm_mem=True,
+        rope=None,
+    ):
+        super().__init__()
+        # self- and cross-attention are configured identically
+        attn_kwargs = dict(
+            rope=rope,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+        )
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(dim, **attn_kwargs)
+        self.cross_attn = CrossAttention(dim, **attn_kwargs)
+        self.drop_path = nn.Identity() if not drop_path > 0.0 else DropPath(drop_path)
+        self.norm2 = norm_layer(dim)
+        self.norm3 = norm_layer(dim)
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=int(dim * mlp_ratio),
+            act_layer=act_layer,
+            drop=drop,
+        )
+        # the memory tokens are normalised only when norm_mem is set
+        self.norm_y = norm_layer(dim) if norm_mem else nn.Identity()
+
+    def forward(self, x, y, xpos, ypos):
+        """Update `x` using memory `y`; returns the new `x` and `y` unchanged."""
+        x = x + self.drop_path(self.attn(self.norm1(x), xpos))
+        memory = self.norm_y(y)
+        cross_out = self.cross_attn(self.norm2(x), memory, memory, xpos, ypos)
+        x = x + self.drop_path(cross_out)
+        x = x + self.drop_path(self.mlp(self.norm3(x)))
+        return x, y
+
+
+class CustomDecoderBlock(nn.Module):
+    """Decoder block whose cross-attention takes keys from `y` and values from `z`.
+
+    Each sub-layer is pre-normalised and added back through a residual branch.
+    """
+
+    def __init__(
+        self,
+        dim,
+        num_heads,
+        mlp_ratio=4.0,
+        qkv_bias=False,
+        drop=0.0,
+        attn_drop=0.0,
+        drop_path=0.0,
+        act_layer=nn.GELU,
+        norm_layer=nn.LayerNorm,
+        norm_mem=True,
+        rope=None,
+    ):
+        super().__init__()
+        # self- and cross-attention are configured identically
+        attn_kwargs = dict(
+            rope=rope,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+        )
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(dim, **attn_kwargs)
+        self.cross_attn = CrossAttention(dim, **attn_kwargs)
+        self.drop_path = nn.Identity() if not drop_path > 0.0 else DropPath(drop_path)
+        self.norm2 = norm_layer(dim)
+        self.norm3 = norm_layer(dim)
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=int(dim * mlp_ratio),
+            act_layer=act_layer,
+            drop=drop,
+        )
+        # key/value memories are normalised only when norm_mem is set
+        self.norm_y = norm_layer(dim) if norm_mem else nn.Identity()
+        self.norm_z = norm_layer(dim) if norm_mem else nn.Identity()
+
+    def forward(self, x, y, z, xpos, ypos):
+        """Update `x` from keys `y` and values `z`; returns the new `x` and `y` unchanged."""
+        x = x + self.drop_path(self.attn(self.norm1(x), xpos))
+        keys = self.norm_y(y)
+        values = self.norm_z(z)
+        cross_out = self.cross_attn(self.norm2(x), keys, values, xpos, ypos)
+        x = x + self.drop_path(cross_out)
+        x = x + self.drop_path(self.mlp(self.norm3(x)))
+        return x, y
+
+
+class ModLN(nn.Module):
+    """Adaptive LayerNorm (adaLN): LayerNorm followed by a conditioned scale and shift.
+
+    Reference (DiT): https://github.com/facebookresearch/DiT/blob/main/models.py#L101
+    """
+
+    def __init__(self, inner_dim: int, mod_dim: int, eps: float):
+        super().__init__()
+        self.norm = nn.LayerNorm(inner_dim, eps=eps)
+        # maps the conditioning vector to a concatenated (shift, scale) pair
+        to_shift_scale = nn.Linear(mod_dim, inner_dim * 2)
+        self.mlp = nn.Sequential(nn.SiLU(), to_shift_scale)
+
+    @staticmethod
+    def modulate(x, shift, scale):
+        """Return x * (1 + scale) + shift, with shift/scale unsqueezed at dim 1."""
+        gain = 1 + scale.unsqueeze(1)
+        return x * gain + shift.unsqueeze(1)
+
+    def forward(self, x: torch.Tensor, mod: torch.Tensor) -> torch.Tensor:
+        """Normalise `x` and modulate it with the shift/scale predicted from `mod`."""
+        params = self.mlp(mod)
+        shift, scale = params.chunk(2, dim=-1)  # each [N, D]
+        return self.modulate(self.norm(x), shift, scale)  # [N, L, D]
+
+
+class ConditionModulationBlock(nn.Module):
+    """Transformer block whose two norms are modulated by a conditioning input `mod`."""
+
+    def __init__(
+        self,
+        dim,
+        num_heads,
+        mlp_ratio=4.0,
+        qkv_bias=False,
+        drop=0.0,
+        attn_drop=0.0,
+        drop_path=0.0,
+        act_layer=nn.GELU,
+        norm_layer=partial(ModLN, eps=1e-6),
+        rope=None,
+    ):
+        super().__init__()
+        attn_kwargs = dict(
+            rope=rope,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+        )
+        self.norm1 = norm_layer(dim, dim)
+        self.attn = Attention(dim, **attn_kwargs)
+        self.drop_path = nn.Identity() if not drop_path > 0.0 else DropPath(drop_path)
+        self.norm2 = norm_layer(dim, dim)
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=int(dim * mlp_ratio),
+            act_layer=act_layer,
+            drop=drop,
+        )
+
+    def forward(self, x, mod, xpos):
+        """Apply the block; `mod` conditions both norms, `xpos` goes to the attention layer."""
+        x = x + self.drop_path(self.attn(self.norm1(x, mod), xpos))
+        return x + self.drop_path(self.mlp(self.norm2(x, mod)))
+
+
+class PositionGetter(object):
+    """Return (row, col) grid positions of patches, cached per grid size and device."""
+
+    def __init__(self):
+        self.cache_positions = {}
+
+    def __call__(self, b, h, w, device):
+        # Key on the device as well, so a cached grid is never returned on the wrong device.
+        key = (h, w, str(device))
+        if key not in self.cache_positions:
+            y, x = torch.arange(h, device=device), torch.arange(w, device=device)
+            self.cache_positions[key] = torch.cartesian_prod(y, x)  # (h * w, 2)
+        return self.cache_positions[key].view(1, h * w, 2).expand(b, -1, 2).clone()
+
+
+class PatchEmbed(nn.Module):
+    """timm-style patch embedding that additionally has _init_weights and returns patch positions."""
+
+    def __init__(
+        self,
+        img_size=224,
+        patch_size=16,
+        in_chans=3,
+        embed_dim=768,
+        norm_layer=None,
+        flatten=True,
+    ):
+        super().__init__()
+        self.img_size = to_2tuple(img_size)
+        self.patch_size = to_2tuple(patch_size)
+        rows = self.img_size[0] // self.patch_size[0]
+        cols = self.img_size[1] // self.patch_size[1]
+        self.grid_size = (rows, cols)
+        self.num_patches = rows * cols
+        self.flatten = flatten
+
+        self.proj = nn.Conv2d(
+            in_chans, embed_dim, kernel_size=self.patch_size, stride=self.patch_size
+        )
+        self.norm = nn.Identity() if not norm_layer else norm_layer(embed_dim)
+        self.position_getter = PositionGetter()
+
+    def forward(self, x):
+        """Embed images `x` (B, C, H, W); returns the tokens and their patch positions."""
+        B, _, H, W = x.shape
+        expected_h, expected_w = self.img_size[0], self.img_size[1]
+        torch._assert(
+            H == expected_h,
+            f"Input image height ({H}) doesn't match model ({expected_h}).",
+        )
+        torch._assert(
+            W == expected_w,
+            f"Input image width ({W}) doesn't match model ({expected_w}).",
+        )
+        x = self.proj(x)
+        pos = self.position_getter(B, x.size(2), x.size(3), x.device)
+        if self.flatten:
+            x = x.flatten(2).transpose(1, 2)  # BCHW -> BNC
+        return self.norm(x), pos
+
+    def _init_weights(self):
+        kernel = self.proj.weight.data
+        torch.nn.init.xavier_uniform_(kernel.view([kernel.shape[0], -1]))  # as a 2-D matrix
+
+
+if __name__ == "__main__":  # ad-hoc smoke test; the .cuda() calls below need a CUDA device
+    import os
+    import sys
+    # Put the parent of this file's directory on sys.path before the imports below.
+    sys.path.append(os.path.dirname(os.path.dirname(__file__)))
+    import dust3r.utils.path_to_croco  # presumably makes croco's `models` importable — verify
+    from models.pos_embed import get_2d_sincos_pos_embed, RoPE2D
+    from functools import partial
+    from torch.utils.checkpoint import checkpoint
+
+    torch.manual_seed(0)
+    # Two RoPE-enabled Blocks (dim 768, 16 heads, MLP ratio 4), on GPU, in training mode.
+    enc_blocks_ray_map = (
+        nn.ModuleList(
+            [
+                Block(
+                    768,
+                    16,
+                    4,
+                    qkv_bias=True,
+                    norm_layer=partial(nn.LayerNorm, eps=1e-6),
+                    rope=RoPE2D(100),
+                )
+                for _ in range(2)
+            ]
+        )
+        .cuda()
+        .train()
+    )
+
+    x = torch.randn(2, 196, 768, requires_grad=True).cuda()
+    xpos = torch.arange(0, 196).unsqueeze(0).unsqueeze(-1).repeat(2, 1, 2).cuda().long()
+    enc_blocks_ray_map.zero_grad()
+    for blk in enc_blocks_ray_map:
+        # run each block through torch.utils.checkpoint
+        x = checkpoint(blk, x, xpos)
+    enc_blocks_ray_map.zero_grad()
+    x.sum().backward()
+    # NOTE(review): despite the name, these gradients come from the checkpointed pass above.
+    grad_not_checkpointed = {}
+    for name, param in enc_blocks_ray_map.named_parameters():
+        grad_not_checkpointed[name] = param.grad.data.clone()
+        print(name, grad_not_checkpointed[name])
+        break  # only the first parameter's gradient is printed
diff --git a/extern/CUT3R/src/dust3r/datasets/__init__.py b/extern/CUT3R/src/dust3r/datasets/__init__.py
new file mode 100755
index 0000000000000000000000000000000000000000..b0c398bf60fb11059db1e8b171b33a8ef08c6a53
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/__init__.py
@@ -0,0 +1,86 @@
+from .utils.transforms import *
+from .base.batched_sampler import BatchedRandomSampler  # noqa
+from .arkitscenes import ARKitScenes_Multi  # noqa
+from .arkitscenes_highres import ARKitScenesHighRes_Multi
+from .bedlam import BEDLAM_Multi
+from .blendedmvs import BlendedMVS_Multi  # noqa
+from .co3d import Co3d_Multi  # noqa
+from .cop3d import Cop3D_Multi
+from .dl3dv import DL3DV_Multi
+from .dynamic_replica import DynamicReplica
+from .eden import EDEN_Multi
+from .hypersim import HyperSim_Multi
+from .hoi4d import HOI4D_Multi
+from .irs import IRS
+from .mapfree import MapFree_Multi
+from .megadepth import MegaDepth_Multi  # noqa
+from .mp3d import MP3D_Multi
+from .mvimgnet import MVImgNet_Multi
+from .mvs_synth import MVS_Synth_Multi
+from .omniobject3d import OmniObject3D_Multi
+from .pointodyssey import PointOdyssey_Multi
+from .realestate10k import RE10K_Multi
+from .scannet import ScanNet_Multi
+from .scannetpp import ScanNetpp_Multi  # noqa
+from .smartportraits import SmartPortraits_Multi
+from .spring import Spring
+from .synscapes import SynScapes
+from .tartanair import TartanAir_Multi
+from .threedkb import ThreeDKenBurns
+from .uasol import UASOL_Multi
+from .urbansyn import UrbanSyn
+from .unreal4k import UnReal4K_Multi
+from .vkitti2 import VirtualKITTI2_Multi  # noqa
+from .waymo import Waymo_Multi  # noqa
+from .wildrgbd import WildRGBD_Multi  # noqa
+
+
+from accelerate import Accelerator
+
+
+def get_data_loader(
+    dataset,
+    batch_size,
+    num_workers=8,
+    shuffle=True,
+    drop_last=True,
+    pin_mem=True,
+    accelerator: Accelerator = None,
+    fixed_length=False,
+):
+    """Build a DataLoader, batching with the dataset's own sampler when it offers one.
+
+    `dataset` may be a dataset object or a string expression that is eval()'d in
+    this module's namespace. If the dataset has a usable `make_sampler`, the
+    result is used as `batch_sampler`; otherwise a plain DataLoader driven by
+    `batch_size` / `shuffle` / `drop_last` is returned. Without an `accelerator`
+    the sampler is built for a single process (world_size=1).
+    """
+    import torch
+
+    if isinstance(dataset, str):
+        # NOTE(review): eval() runs arbitrary code; only pass trusted config strings.
+        dataset = eval(dataset)
+
+    loader_kwargs = dict(num_workers=num_workers, pin_memory=pin_mem)
+    world_size = accelerator.num_processes if accelerator is not None else 1
+    # Guard only the sampler construction: wrapping more would let an unrelated
+    # AttributeError (e.g. accelerator=None) silently select the fallback loader.
+    try:
+        sampler = dataset.make_sampler(
+            batch_size,
+            shuffle=shuffle,
+            drop_last=drop_last,
+            world_size=world_size,
+            fixed_length=fixed_length,
+        )
+    except (AttributeError, NotImplementedError):
+        # dataset has no (implemented) make_sampler
+        return torch.utils.data.DataLoader(
+            dataset,
+            batch_size=batch_size,
+            shuffle=shuffle,
+            drop_last=drop_last,
+            **loader_kwargs,
+        )
+    return torch.utils.data.DataLoader(dataset, batch_sampler=sampler, **loader_kwargs)
diff --git a/extern/CUT3R/src/dust3r/datasets/arkitscenes.py b/extern/CUT3R/src/dust3r/datasets/arkitscenes.py
new file mode 100644
index 0000000000000000000000000000000000000000..214ee4f9b5d0238c7a79ce48972b51cc4d1c2ab3
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/arkitscenes.py
@@ -0,0 +1,242 @@
+import os.path as osp
+import os
+import sys
+import itertools
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+import cv2
+import numpy as np
+
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+
+
+def stratified_sampling(indices, num_samples, rng=None):
+    """Pick `num_samples` items from `indices`, one at random from each consecutive stratum.
+
+    The sorted indices are cut into `num_samples` consecutive strata and the picks
+    are returned in random order (as produced by `rng.permutation`). If
+    `num_samples == len(indices)` the input is returned unchanged; asking for more
+    raises ValueError. `rng` defaults to a fresh `np.random.default_rng()`.
+    """
+    total = len(indices)
+    if num_samples > total:
+        raise ValueError("num_samples cannot exceed the number of available indices.")
+    if num_samples == total:
+        return indices
+
+    rng = np.random.default_rng() if rng is None else rng
+    ordered = sorted(indices)
+    stride = total / num_samples
+    picks = []
+    for stratum in range(num_samples):
+        lo = int(stratum * stride)
+        hi = min(int((stratum + 1) * stride), total)  # never run past the list
+        # an empty stratum (rounding) falls back to the last index
+        position = rng.integers(lo, hi) if lo < hi else -1
+        picks.append(ordered[position])
+
+    return rng.permutation(picks)
+
+
+class ARKitScenes_Multi(BaseMultiViewDataset):
+    """Multi-view ARKitScenes dataset; scenes also found in the "_highres" root are skipped.
+
+    Samples are either clips drawn from one scene's frame range or sets taken
+    from a precomputed per-reference-image group ("image_collection").
+    """
+
+    def __init__(self, *args, split, ROOT, **kwargs):
+        self.ROOT = ROOT
+        self.video = True
+        self.is_metric = True
+        self.max_interval = 8
+        super().__init__(*args, **kwargs)
+        if split == "train":
+            self.split = "Training"
+        elif split == "test":
+            self.split = "Test"
+        else:
+            raise ValueError(f"split must be 'train' or 'test', got {split!r}")
+
+        self.loaded_data = self._load_data(self.split)
+
+    def _load_data(self, split):
+        """Index every usable scene of `split` into flat per-image lists and view groups."""
+        with np.load(osp.join(self.ROOT, split, "all_metadata.npz")) as data:
+            self.scenes: np.ndarray = data["scenes"]
+        # Scenes that also exist in the high-res dataset are excluded here. The
+        # high-res root is "<ROOT>_highres"; any non-"Training" split maps to "Validation".
+        highres_dir = osp.join(
+            self.ROOT.rstrip("/") + "_highres",
+            split if split == "Training" else "Validation",
+        )
+        # The previous filter tested the truthiness of a joined path string, which
+        # is always true; keep only real scene directories instead.
+        high_res_list = np.array(
+            [d for d in os.listdir(highres_dir) if osp.isdir(osp.join(highres_dir, d))],
+            dtype=str,
+        )
+        self.scenes = np.setdiff1d(self.scenes, high_res_list)
+        offset = 0
+        scenes = []
+        sceneids = []
+        images = []
+        intrinsics = []
+        trajectories = []
+        groups = []
+        id_ranges = []
+        j = 0
+        # Minimum number of frames per scene / views per group worth keeping.
+        min_len = self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
+        for scene in self.scenes:
+            scene_dir = osp.join(self.ROOT, self.split, scene)
+            with np.load(
+                osp.join(scene_dir, "new_scene_metadata.npz"), allow_pickle=True
+            ) as data:
+                imgs = data["images"]
+                intrins = data["intrinsics"]
+                traj = data["trajectories"]
+                if len(imgs) < min_len:
+                    print(f"Skipping {scene}")
+                    continue
+
+                assert "image_collection" in data, "Image collection not found"
+                image_collection = data["image_collection"].item()
+
+                num_imgs = imgs.shape[0]
+                img_groups = []
+                for ref_id, group in image_collection.items():
+                    if len(group) + 1 < min_len:
+                        continue
+
+                    # groups are (idx, score)s; prepend the reference view itself and
+                    # shift the scene-local indices into the flat, dataset-wide numbering
+                    group.insert(0, (ref_id, 1.0))
+                    group = [int(x[0] + offset) for x in group]
+                    img_groups.append(sorted(group))
+
+                if len(img_groups) == 0:
+                    print(f"Skipping {scene}")
+                    continue
+
+                scenes.append(scene)
+                sceneids.extend([j] * num_imgs)
+                id_ranges.extend([(offset, offset + num_imgs) for _ in range(num_imgs)])
+                images.extend(imgs)
+                K = np.expand_dims(np.eye(3), 0).repeat(num_imgs, 0)
+
+                K[:, 0, 0] = [fx for _, _, fx, _, _, _ in intrins]
+                K[:, 1, 1] = [fy for _, _, _, fy, _, _ in intrins]
+                K[:, 0, 2] = [cx for _, _, _, _, cx, _ in intrins]
+                K[:, 1, 2] = [cy for _, _, _, _, _, cy in intrins]
+                intrinsics.extend(list(K))
+                trajectories.extend(list(traj))
+
+                # the groups already carry dataset-wide indices
+                groups.extend(img_groups)
+                offset += num_imgs
+                j += 1
+
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.id_ranges = id_ranges
+        self.images = images
+        self.intrinsics = intrinsics
+        self.trajectories = trajectories
+        self.groups = groups
+
+    def __len__(self):
+        """Number of samples, i.e. of view groups."""
+        return len(self.groups)
+
+    def get_image_num(self):
+        """Total number of indexed images across all kept scenes."""
+        return len(self.images)
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        """Sample `num_views` views: a clip from a scene's frame range or a view group."""
+        if rng.choice([True, False]):
+            # NOTE(review): `idx` indexes self.groups (see __len__) but is used here on
+            # the per-image self.id_ranges — confirm this mixing is intended.
+            image_idxs = np.arange(self.id_ranges[idx][0], self.id_ranges[idx][1])
+            cut_off = num_views if not self.allow_repeat else max(num_views // 3, 3)
+            start_image_idxs = image_idxs[: len(image_idxs) - cut_off + 1]
+            start_id = rng.choice(start_image_idxs)
+            pos, ordered_video = self.get_seq_from_start_id(
+                num_views,
+                start_id,
+                image_idxs.tolist(),
+                rng,
+                max_interval=self.max_interval,
+                video_prob=0.8,
+                fix_interval_prob=0.5,
+                block_shuffle=16,
+            )
+            image_idxs = np.array(image_idxs)[pos]
+        else:
+            ordered_video = False
+            image_idxs = self.groups[idx]
+            image_idxs = rng.permutation(image_idxs)
+            if len(image_idxs) > num_views:
+                image_idxs = image_idxs[:num_views]
+            else:
+                if rng.random() < 0.8:
+                    image_idxs = rng.choice(image_idxs, size=num_views, replace=True)
+                else:
+                    repeat_num = num_views // len(image_idxs) + 1
+                    image_idxs = np.tile(image_idxs, repeat_num)[:num_views]
+
+        views = []
+        for v, view_idx in enumerate(image_idxs):
+            scene_id = self.sceneids[view_idx]
+            scene_dir = osp.join(self.ROOT, self.split, self.scenes[scene_id])
+
+            intrinsics = self.intrinsics[view_idx]
+            camera_pose = self.trajectories[view_idx]
+            basename = self.images[view_idx]
+            assert (
+                basename[:8] == self.scenes[scene_id]
+            ), f"{basename}, {self.scenes[scene_id]}"
+            # Load RGB image
+            rgb_image = imread_cv2(
+                osp.join(scene_dir, "vga_wide", basename.replace(".png", ".jpg"))
+            )
+            # Load depthmap
+            depthmap = imread_cv2(
+                osp.join(scene_dir, "lowres_depth", basename), cv2.IMREAD_UNCHANGED
+            )
+            depthmap = depthmap.astype(np.float32) / 1000.0
+            depthmap[~np.isfinite(depthmap)] = 0  # invalid
+
+            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
+            )
+
+            # generate img mask and raymap mask
+            img_mask, ray_mask = self.get_img_and_ray_masks(
+                self.is_metric, v, rng, p=[0.75, 0.2, 0.05]
+            )
+
+            views.append(
+                dict(
+                    img=rgb_image,
+                    depthmap=depthmap.astype(np.float32),
+                    camera_pose=camera_pose.astype(np.float32),
+                    camera_intrinsics=intrinsics.astype(np.float32),
+                    dataset="arkitscenes",
+                    label=self.scenes[scene_id] + "_" + basename,
+                    instance=f"{str(idx)}_{str(view_idx)}",
+                    is_metric=self.is_metric,
+                    is_video=ordered_video,
+                    quantile=np.array(0.98, dtype=np.float32),
+                    img_mask=img_mask,
+                    ray_mask=ray_mask,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=False,
+                    reset=False,
+                )
+            )
+        assert len(views) == num_views
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/arkitscenes_highres.py b/extern/CUT3R/src/dust3r/datasets/arkitscenes_highres.py
new file mode 100755
index 0000000000000000000000000000000000000000..92826e1c46a067ed93ffc30d0470685085377bf6
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/arkitscenes_highres.py
@@ -0,0 +1,175 @@
+import os.path as osp
+import os
+import sys
+import itertools
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+import cv2
+import numpy as np
+import h5py
+import math
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+
+
+class ARKitScenesHighRes_Multi(BaseMultiViewDataset):
+    def __init__(self, *args, split, ROOT, **kwargs):
+        self.ROOT = ROOT
+        self.video = True
+        self.max_interval = 8
+        self.is_metric = True
+        super().__init__(*args, **kwargs)
+        if split == "train":
+            self.split = "Training"
+        elif split == "test":
+            self.split = "Validation"
+        else:
+            raise ValueError("")
+
+        self.loaded_data = self._load_data(self.split)
+
+    def _load_data(self, split):
+        all_scenes = sorted(
+            [
+                d
+                for d in os.listdir(osp.join(self.ROOT, split))
+                if osp.isdir(osp.join(self.ROOT, split, d))
+            ]
+        )
+        offset = 0
+        scenes = []
+        sceneids = []
+        images = []
+        start_img_ids = []
+        scene_img_list = []
+        timestamps = []
+        intrinsics = []
+        trajectories = []
+        scene_id = 0
+        for scene in all_scenes:
+            scene_dir = osp.join(self.ROOT, self.split, scene)
+            with np.load(osp.join(scene_dir, "scene_metadata.npz")) as data:
+                imgs_with_indices = sorted(
+                    enumerate(data["images"]), key=lambda x: x[1]
+                )
+                imgs = [x[1] for x in imgs_with_indices]
+                cut_off = (
+                    self.num_views
+                    if not self.allow_repeat
+                    else max(self.num_views // 3, 3)
+                )
+                if len(imgs) < cut_off:
+                    print(f"Skipping {scene}")
+                    continue
+                indices = [x[0] for x in imgs_with_indices]
+                tsps = np.array(
+                    [float(img_name.split("_")[1][:-4]) for img_name in imgs]
+                )
+                assert [img[:8] == scene for img in imgs], f"{scene}, {imgs}"
+                num_imgs = data["images"].shape[0]
+                img_ids = list(np.arange(num_imgs) + offset)
+                start_img_ids_ = img_ids[: num_imgs - cut_off + 1]
+
+                scenes.append(scene)
+                scene_img_list.append(img_ids)
+                sceneids.extend([scene_id] * num_imgs)
+                images.extend(imgs)
+                start_img_ids.extend(start_img_ids_)
+                timestamps.extend(tsps)
+
+                K = np.expand_dims(np.eye(3), 0).repeat(num_imgs, 0)
+                intrins = data["intrinsics"][indices]
+                K[:, 0, 0] = [fx for _, _, fx, _, _, _ in intrins]
+                K[:, 1, 1] = [fy for _, _, _, fy, _, _ in intrins]
+                K[:, 0, 2] = [cx for _, _, _, _, cx, _ in intrins]
+                K[:, 1, 2] = [cy for _, _, _, _, _, cy in intrins]
+                intrinsics.extend(list(K))
+                trajectories.extend(list(data["trajectories"][indices]))
+
+                # offset groups
+                offset += num_imgs
+                scene_id += 1
+
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.images = images
+        self.scene_img_list = scene_img_list
+        self.intrinsics = intrinsics
+        self.trajectories = trajectories
+        self.start_img_ids = start_img_ids
+        assert len(self.images) == len(self.intrinsics) == len(self.trajectories)
+
+    def __len__(self):
+        """Return the number of samples, i.e. the number of valid start images."""
+        return len(self.start_img_ids)
+
+    def get_image_num(self):
+        """Return the total number of indexed images across all kept scenes."""
+        return len(self.images)
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        start_id = self.start_img_ids[idx]
+        all_image_ids = self.scene_img_list[self.sceneids[start_id]]
+        pos, ordered_video = self.get_seq_from_start_id(
+            num_views,
+            start_id,
+            all_image_ids,
+            rng,
+            max_interval=self.max_interval,
+            block_shuffle=16,
+        )
+        image_idxs = np.array(all_image_ids)[pos]
+
+        views = []
+
+        for v, view_idx in enumerate(image_idxs):
+            scene_id = self.sceneids[view_idx]
+            scene_dir = osp.join(self.ROOT, self.split, self.scenes[scene_id])
+
+            intrinsics = self.intrinsics[view_idx]
+            camera_pose = self.trajectories[view_idx]
+            basename = self.images[view_idx]
+            assert (
+                basename[:8] == self.scenes[scene_id]
+            ), f"{basename}, {self.scenes[scene_id]}"
+            # print(scene_dir, basename)
+            # Load RGB image
+            rgb_image = imread_cv2(
+                osp.join(scene_dir, "vga_wide", basename.replace(".png", ".jpg"))
+            )
+            # Load depthmap
+            depthmap = imread_cv2(
+                osp.join(scene_dir, "highres_depth", basename), cv2.IMREAD_UNCHANGED
+            )
+            depthmap = depthmap.astype(np.float32) / 1000.0
+            depthmap[~np.isfinite(depthmap)] = 0  # invalid
+
+            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
+            )
+
+            # generate img mask and raymap mask
+            img_mask, ray_mask = self.get_img_and_ray_masks(
+                self.is_metric, v, rng, p=[0.7, 0.25, 0.05]
+            )
+
+            views.append(
+                dict(
+                    img=rgb_image,
+                    depthmap=depthmap.astype(np.float32),
+                    camera_pose=camera_pose.astype(np.float32),
+                    camera_intrinsics=intrinsics.astype(np.float32),
+                    dataset="arkitscenes_highres",
+                    label=self.scenes[scene_id] + "_" + basename,
+                    instance=f"{str(idx)}_{str(view_idx)}",
+                    is_metric=self.is_metric,
+                    is_video=ordered_video,
+                    quantile=np.array(0.99, dtype=np.float32),
+                    img_mask=img_mask,
+                    ray_mask=ray_mask,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=False,
+                    reset=False,
+                )
+            )
+        assert len(views) == num_views
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/base/__init__.py b/extern/CUT3R/src/dust3r/datasets/base/__init__.py
new file mode 100755
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/extern/CUT3R/src/dust3r/datasets/base/base_multiview_dataset.py b/extern/CUT3R/src/dust3r/datasets/base/base_multiview_dataset.py
new file mode 100755
index 0000000000000000000000000000000000000000..dc04725a6d22e4c7685cee3e1fb75856803df7e7
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/base/base_multiview_dataset.py
@@ -0,0 +1,546 @@
+import PIL
+import numpy as np
+import torch
+import random
+import itertools
+from dust3r.datasets.base.easy_dataset import EasyDataset
+from dust3r.datasets.utils.transforms import ImgNorm, SeqColorJitter
+from dust3r.utils.geometry import depthmap_to_absolute_camera_coordinates
+import dust3r.datasets.utils.cropping as cropping
+from dust3r.datasets.utils.corr import extract_correspondences_from_pts3d
+
+
+def get_ray_map(c2w1, c2w2, intrinsics, h, w):
+    c2w = np.linalg.inv(c2w1) @ c2w2
+    i, j = np.meshgrid(np.arange(w), np.arange(h), indexing="xy")
+    grid = np.stack([i, j, np.ones_like(i)], axis=-1)
+    ro = c2w[:3, 3]
+    rd = np.linalg.inv(intrinsics) @ grid.reshape(-1, 3).T
+    rd = (c2w @ np.vstack([rd, np.ones_like(rd[0])])).T[:, :3].reshape(h, w, 3)
+    rd = rd / np.linalg.norm(rd, axis=-1, keepdims=True)
+    ro = np.broadcast_to(ro, (h, w, 3))
+    ray_map = np.concatenate([ro, rd], axis=-1)
+    return ray_map
+
+
+class BaseMultiViewDataset(EasyDataset):
+    """Define all basic options.
+
+    Usage:
+        class MyDataset (BaseMultiViewDataset):
+            def _get_views(self, idx, resolution, rng, num_views):
+                # overload here
+                views = []
+                views.append(dict(img=, ...))
+                return views
+    """
+
+    def __init__(
+        self,
+        *,  # only keyword arguments
+        num_views=None,
+        split=None,
+        resolution=None,  # square_size or (width, height) or list of [(width,height), ...]
+        transform=ImgNorm,
+        aug_crop=False,
+        n_corres=0,
+        nneg=0,
+        seed=None,
+        allow_repeat=False,
+        seq_aug_crop=False,
+    ):
+        assert num_views is not None, "undefined num_views"
+        self.num_views = num_views
+        self.split = split
+        self._set_resolutions(resolution)
+
+        self.n_corres = n_corres
+        self.nneg = nneg
+        assert (
+            self.n_corres == "all"
+            or isinstance(self.n_corres, int)
+            or (
+                isinstance(self.n_corres, list) and len(self.n_corres) == self.num_views
+            )
+        ), f"Error, n_corres should either be 'all', a single integer or a list of length {self.num_views}"
+        assert (
+            self.nneg == 0 or self.n_corres != "all"
+        ), "nneg should be 0 if n_corres is all"
+
+        self.is_seq_color_jitter = False
+        if isinstance(transform, str):
+            transform = eval(transform)
+        if transform == SeqColorJitter:
+            transform = SeqColorJitter()
+            self.is_seq_color_jitter = True
+        self.transform = transform
+
+        self.aug_crop = aug_crop
+        self.seed = seed
+        self.allow_repeat = allow_repeat
+        self.seq_aug_crop = seq_aug_crop
+
+    def __len__(self):
+        """Return the number of scenes.
+
+        ``self.scenes`` is not set by this class's ``__init__``; subclasses are
+        expected to provide it (or override ``__len__``).
+        """
+        return len(self.scenes)
+
+    @staticmethod
+    def efficient_random_intervals(
+        start,
+        num_elements,
+        interval_range,
+        fixed_interval_prob=0.8,
+        weights=None,
+        seed=42,
+    ):
+        if random.random() < fixed_interval_prob:
+            intervals = random.choices(interval_range, weights=weights) * (
+                num_elements - 1
+            )
+        else:
+            intervals = [
+                random.choices(interval_range, weights=weights)[0]
+                for _ in range(num_elements - 1)
+            ]
+        return list(itertools.accumulate([start] + intervals))
+
+    def sample_based_on_timestamps(self, i, timestamps, num_views, interval=1):
+        """Propose groups of ``num_views`` frame ids that are close in time to frame ``i``.
+
+        Candidates are all indices whose timestamp differs from
+        ``timestamps[i]`` by less than ``interval``. Random draws use the
+        global ``np.random`` state, not a per-dataset generator.
+
+        Args:
+            i: index of the reference frame in ``timestamps``.
+            timestamps: array of per-frame timestamps.
+            num_views: size of every returned group.
+            interval: maximum (exclusive) absolute time difference.
+
+        Returns:
+            A list of sorted id groups; empty when there are too few candidates.
+        """
+        time_diffs = np.abs(timestamps - timestamps[i])
+        ids_candidate = np.where(time_diffs < interval)[0]
+        ids_candidate = np.sort(ids_candidate)
+        # NOTE(review): for non-negative num_views the first clause implies the
+        # second, so this reduces to ``len(ids_candidate) < num_views``
+        # regardless of allow_repeat -- confirm whether that is intended.
+        if (self.allow_repeat and len(ids_candidate) < num_views // 3) or (
+            len(ids_candidate) < num_views
+        ):
+            return []
+        ids_sel_list = []
+        ids_candidate_left = ids_candidate.copy()
+        # Repeatedly draw disjoint random groups until fewer than num_views
+        # candidates remain unused.
+        while len(ids_candidate_left) >= num_views:
+            ids_sel = np.random.choice(ids_candidate_left, num_views, replace=False)
+            ids_sel_list.append(sorted(ids_sel))
+            ids_candidate_left = np.setdiff1d(ids_candidate_left, ids_sel)
+
+        # Leftover candidates form one more group, topped up with random
+        # candidates that were already used above.
+        if len(ids_candidate_left) > 0 and len(ids_candidate) >= num_views:
+            ids_sel = np.concatenate(
+                [
+                    ids_candidate_left,
+                    np.random.choice(
+                        np.setdiff1d(ids_candidate, ids_candidate_left),
+                        num_views - len(ids_candidate_left),
+                        replace=False,
+                    ),
+                ]
+            )
+            ids_sel_list.append(sorted(ids_sel))
+
+        # With repetition allowed, add one group sampled with replacement.
+        if self.allow_repeat:
+            ids_sel_list.append(
+                sorted(np.random.choice(ids_candidate, num_views, replace=True))
+            )
+
+        # add sequences with fixed intervals (all possible intervals)
+        # The sequence grows around the position of ``i``, alternately
+        # appending on the right and prepending on the left with the current
+        # stride; the search ends as soon as either side runs out of candidates.
+        pos_i = np.where(ids_candidate == i)[0][0]
+        curr_interval = 1
+        stop = len(ids_candidate) < num_views
+        while not stop:
+            pos_sel = [pos_i]
+            count = 0
+            while len(pos_sel) < num_views:
+                if count % 2 == 0:
+                    curr_pos_i = pos_sel[-1] + curr_interval
+                    if curr_pos_i >= len(ids_candidate):
+                        stop = True
+                        break
+                    pos_sel.append(curr_pos_i)
+                else:
+                    curr_pos_i = pos_sel[0] - curr_interval
+                    if curr_pos_i < 0:
+                        stop = True
+                        break
+                    pos_sel.insert(0, curr_pos_i)
+                count += 1
+            if not stop and len(pos_sel) == num_views:
+                ids_sel = sorted([ids_candidate[pos] for pos in pos_sel])
+                # Skip groups that were already proposed.
+                if ids_sel not in ids_sel_list:
+                    ids_sel_list.append(ids_sel)
+            curr_interval += 1
+        return ids_sel_list
+
+    @staticmethod
+    def blockwise_shuffle(x, rng, block_shuffle):
+        if block_shuffle is None:
+            return rng.permutation(x).tolist()
+        else:
+            assert block_shuffle > 0
+            blocks = [x[i : i + block_shuffle] for i in range(0, len(x), block_shuffle)]
+            shuffled_blocks = [rng.permutation(block).tolist() for block in blocks]
+            shuffled_list = [item for block in shuffled_blocks for item in block]
+            return shuffled_list
+
+    def get_seq_from_start_id(
+        self,
+        num_views,
+        id_ref,
+        ids_all,
+        rng,
+        min_interval=1,
+        max_interval=25,
+        video_prob=0.5,
+        fix_interval_prob=0.5,
+        block_shuffle=None,
+    ):
+        """
+        args:
+            num_views: number of views to return
+            id_ref: the reference id (first id)
+            ids_all: all the ids
+            rng: random number generator
+            max_interval: maximum interval between two views
+        returns:
+            pos: list of positions of the views in ids_all, i.e., index for ids_all
+            is_video: True if the views are consecutive
+        """
+        assert min_interval > 0, f"min_interval should be > 0, got {min_interval}"
+        assert (
+            min_interval <= max_interval
+        ), f"min_interval should be <= max_interval, got {min_interval} and {max_interval}"
+        assert id_ref in ids_all
+        pos_ref = ids_all.index(id_ref)
+        all_possible_pos = np.arange(pos_ref, len(ids_all))
+
+        remaining_sum = len(ids_all) - 1 - pos_ref
+
+        if remaining_sum >= num_views - 1:
+            if remaining_sum == num_views - 1:
+                assert ids_all[-num_views] == id_ref
+                return [pos_ref + i for i in range(num_views)], True
+            max_interval = min(max_interval, 2 * remaining_sum // (num_views - 1))
+            intervals = [
+                rng.choice(range(min_interval, max_interval + 1))
+                for _ in range(num_views - 1)
+            ]
+
+            # if video or collection
+            if rng.random() < video_prob:
+                # if fixed interval or random
+                if rng.random() < fix_interval_prob:
+                    # regular interval
+                    fixed_interval = rng.choice(
+                        range(
+                            1,
+                            min(remaining_sum // (num_views - 1) + 1, max_interval + 1),
+                        )
+                    )
+                    intervals = [fixed_interval for _ in range(num_views - 1)]
+                is_video = True
+            else:
+                is_video = False
+
+            pos = list(itertools.accumulate([pos_ref] + intervals))
+            pos = [p for p in pos if p < len(ids_all)]
+            pos_candidates = [p for p in all_possible_pos if p not in pos]
+            pos = (
+                pos
+                + rng.choice(
+                    pos_candidates, num_views - len(pos), replace=False
+                ).tolist()
+            )
+
+            pos = (
+                sorted(pos)
+                if is_video
+                else self.blockwise_shuffle(pos, rng, block_shuffle)
+            )
+        else:
+            # assert self.allow_repeat
+            uniq_num = remaining_sum
+            new_pos_ref = rng.choice(np.arange(pos_ref + 1))
+            new_remaining_sum = len(ids_all) - 1 - new_pos_ref
+            new_max_interval = min(max_interval, new_remaining_sum // (uniq_num - 1))
+            new_intervals = [
+                rng.choice(range(1, new_max_interval + 1)) for _ in range(uniq_num - 1)
+            ]
+
+            revisit_random = rng.random()
+            video_random = rng.random()
+
+            if rng.random() < fix_interval_prob and video_random < video_prob:
+                # regular interval
+                fixed_interval = rng.choice(range(1, new_max_interval + 1))
+                new_intervals = [fixed_interval for _ in range(uniq_num - 1)]
+            pos = list(itertools.accumulate([new_pos_ref] + new_intervals))
+
+            is_video = False
+            if revisit_random < 0.5 or video_prob == 1.0:  # revisit, video / collection
+                is_video = video_random < video_prob
+                pos = (
+                    self.blockwise_shuffle(pos, rng, block_shuffle)
+                    if not is_video
+                    else pos
+                )
+                num_full_repeat = num_views // uniq_num
+                pos = (
+                    pos * num_full_repeat
+                    + pos[: num_views - len(pos) * num_full_repeat]
+                )
+            elif revisit_random < 0.9:  # random
+                pos = rng.choice(pos, num_views, replace=True)
+            else:  # ordered
+                pos = sorted(rng.choice(pos, num_views, replace=True))
+        assert len(pos) == num_views
+        return pos, is_video
+
+    def get_img_and_ray_masks(self, is_metric, v, rng, p=[0.8, 0.15, 0.05]):
+        # generate img mask and raymap mask
+        if v == 0 or (not is_metric):
+            img_mask = True
+            raymap_mask = False
+        else:
+            rand_val = rng.random()
+            if rand_val < p[0]:
+                img_mask = True
+                raymap_mask = False
+            elif rand_val < p[0] + p[1]:
+                img_mask = False
+                raymap_mask = True
+            else:
+                img_mask = True
+                raymap_mask = True
+        return img_mask, raymap_mask
+
+    def get_stats(self):
+        """Return a short human-readable size summary (used by ``__repr__``)."""
+        return f"{len(self)} groups of views"
+
+    def __repr__(self):
+        """Return a one-line description: class name, stats and main options."""
+        resolutions_str = "[" + ";".join(f"{w}x{h}" for w, h in self._resolutions) + "]"
+        # The ``{expr=}`` fields print ``self.<name>=<value>``; the replace
+        # chain then strips the ``self.`` prefix, the newlines and the
+        # indentation of the multi-line literal.
+        return (
+            f"""{type(self).__name__}({self.get_stats()},
+            {self.num_views=},
+            {self.split=},
+            {self.seed=},
+            resolutions={resolutions_str},
+            {self.transform=})""".replace(
+                "self.", ""
+            )
+            .replace("\n", "")
+            .replace("   ", "")
+        )
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        """Return the list of ``num_views`` view dicts for sample ``idx``.
+
+        Must be overridden by subclasses; ``__getitem__`` calls it with the
+        selected resolution and the dataset's random generator.
+        """
+        raise NotImplementedError()
+
+    def __getitem__(self, idx):
+        # print("Receiving:" , idx)
+        if isinstance(idx, (tuple, list, np.ndarray)):
+            # the idx is specifying the aspect-ratio
+            idx, ar_idx, nview = idx
+        else:
+            assert len(self._resolutions) == 1
+            ar_idx = 0
+            nview = self.num_views
+
+        assert nview >= 1 and nview <= self.num_views
+        # set-up the rng
+        if self.seed:  # reseed for each __getitem__
+            self._rng = np.random.default_rng(seed=self.seed + idx)
+        elif not hasattr(self, "_rng"):
+            seed = torch.randint(0, 2**32, (1,)).item()
+            self._rng = np.random.default_rng(seed=seed)
+
+        if self.aug_crop > 1 and self.seq_aug_crop:
+            self.delta_target_resolution = self._rng.integers(0, self.aug_crop)
+
+        # over-loaded code
+        resolution = self._resolutions[
+            ar_idx
+        ]  # DO NOT CHANGE THIS (compatible with BatchedRandomSampler)
+        views = self._get_views(idx, resolution, self._rng, nview)
+        assert len(views) == nview
+
+        if "camera_pose" not in views[0]:
+            views[0]["camera_pose"] = np.ones((4, 4), dtype=np.float32)
+        first_view_camera_pose = views[0]["camera_pose"]
+        transform = SeqColorJitter() if self.is_seq_color_jitter else self.transform
+
+        for v, view in enumerate(views):
+            assert (
+                "pts3d" not in view
+            ), f"pts3d should not be there, they will be computed afterwards based on intrinsics+depthmap for view {view_name(view)}"
+            view["idx"] = (idx, ar_idx, v)
+
+            # encode the image
+            width, height = view["img"].size
+
+            view["true_shape"] = np.int32((height, width))
+            view["img"] = transform(view["img"])
+            view["sky_mask"] = view["depthmap"] < 0
+
+            assert "camera_intrinsics" in view
+            if "camera_pose" not in view:
+                view["camera_pose"] = np.full((4, 4), np.nan, dtype=np.float32)
+            else:
+                assert np.isfinite(
+                    view["camera_pose"]
+                ).all(), f"NaN in camera pose for view {view_name(view)}"
+
+            ray_map = get_ray_map(
+                first_view_camera_pose,
+                view["camera_pose"],
+                view["camera_intrinsics"],
+                height,
+                width,
+            )
+            view["ray_map"] = ray_map.astype(np.float32)
+
+            assert "pts3d" not in view
+            assert "valid_mask" not in view
+            assert np.isfinite(
+                view["depthmap"]
+            ).all(), f"NaN in depthmap for view {view_name(view)}"
+            pts3d, valid_mask = depthmap_to_absolute_camera_coordinates(**view)
+
+            view["pts3d"] = pts3d
+            view["valid_mask"] = valid_mask & np.isfinite(pts3d).all(axis=-1)
+
+            # check all datatypes
+            for key, val in view.items():
+                res, err_msg = is_good_type(key, val)
+                assert res, f"{err_msg} with {key}={val} for view {view_name(view)}"
+            K = view["camera_intrinsics"]
+
+        if self.n_corres > 0:
+            ref_view = views[0]
+            for view in views:
+                corres1, corres2, valid = extract_correspondences_from_pts3d(
+                    ref_view, view, self.n_corres, self._rng, nneg=self.nneg
+                )
+                view["corres"] = (corres1, corres2)
+                view["valid_corres"] = valid
+
+        # last thing done!
+        for view in views:
+            view["rng"] = int.from_bytes(self._rng.bytes(4), "big")
+        return views
+
+    def _set_resolutions(self, resolutions):
+        assert resolutions is not None, "undefined resolution"
+
+        if not isinstance(resolutions, list):
+            resolutions = [resolutions]
+
+        self._resolutions = []
+        for resolution in resolutions:
+            if isinstance(resolution, int):
+                width = height = resolution
+            else:
+                width, height = resolution
+            assert isinstance(
+                width, int
+            ), f"Bad type for {width=} {type(width)=}, should be int"
+            assert isinstance(
+                height, int
+            ), f"Bad type for {height=} {type(height)=}, should be int"
+            self._resolutions.append((width, height))
+
+    def _crop_resize_if_necessary(
+        self, image, depthmap, intrinsics, resolution, rng=None, info=None
+    ):
+        """This function:
+        - first downsizes the image with LANCZOS inteprolation,
+          which is better than bilinear interpolation in
+        """
+        if not isinstance(image, PIL.Image.Image):
+            image = PIL.Image.fromarray(image)
+
+        # downscale with lanczos interpolation so that image.size == resolution
+        # cropping centered on the principal point
+        W, H = image.size
+        cx, cy = intrinsics[:2, 2].round().astype(int)
+        min_margin_x = min(cx, W - cx)
+        min_margin_y = min(cy, H - cy)
+        assert min_margin_x > W / 5, f"Bad principal point in view={info}"
+        assert min_margin_y > H / 5, f"Bad principal point in view={info}"
+        # the new window will be a rectangle of size (2*min_margin_x, 2*min_margin_y) centered on (cx,cy)
+        l, t = cx - min_margin_x, cy - min_margin_y
+        r, b = cx + min_margin_x, cy + min_margin_y
+        crop_bbox = (l, t, r, b)
+        image, depthmap, intrinsics = cropping.crop_image_depthmap(
+            image, depthmap, intrinsics, crop_bbox
+        )
+
+        # transpose the resolution if necessary
+        W, H = image.size  # new size
+
+        # high-quality Lanczos down-scaling
+        target_resolution = np.array(resolution)
+        if self.aug_crop > 1:
+            target_resolution += (
+                rng.integers(0, self.aug_crop)
+                if not self.seq_aug_crop
+                else self.delta_target_resolution
+            )
+        image, depthmap, intrinsics = cropping.rescale_image_depthmap(
+            image, depthmap, intrinsics, target_resolution
+        )
+
+        # actual cropping (if necessary) with bilinear interpolation
+        intrinsics2 = cropping.camera_matrix_of_crop(
+            intrinsics, image.size, resolution, offset_factor=0.5
+        )
+        crop_bbox = cropping.bbox_from_intrinsics_in_out(
+            intrinsics, intrinsics2, resolution
+        )
+        image, depthmap, intrinsics2 = cropping.crop_image_depthmap(
+            image, depthmap, intrinsics, crop_bbox
+        )
+
+        return image, depthmap, intrinsics2
+
+
+def is_good_type(key, v):
+    """returns (is_good, err_msg)"""
+    if isinstance(v, (str, int, tuple)):
+        return True, None
+    if v.dtype not in (np.float32, torch.float32, bool, np.int32, np.int64, np.uint8):
+        return False, f"bad {v.dtype=}"
+    return True, None
+
+
+def view_name(view, batch_index=None):
+    def sel(x):
+        return x[batch_index] if batch_index not in (None, slice(None)) else x
+
+    db = sel(view["dataset"])
+    label = sel(view["label"])
+    instance = sel(view["instance"])
+    return f"{db}/{label}/{instance}"
+
+
+def transpose_to_landscape(view):
+    height, width = view["true_shape"]
+
+    if width < height:
+        # rectify portrait to landscape
+        assert view["img"].shape == (3, height, width)
+        view["img"] = view["img"].swapaxes(1, 2)
+
+        assert view["valid_mask"].shape == (height, width)
+        view["valid_mask"] = view["valid_mask"].swapaxes(0, 1)
+
+        assert view["depthmap"].shape == (height, width)
+        view["depthmap"] = view["depthmap"].swapaxes(0, 1)
+
+        assert view["pts3d"].shape == (height, width, 3)
+        view["pts3d"] = view["pts3d"].swapaxes(0, 1)
+
+        # transpose x and y pixels
+        view["camera_intrinsics"] = view["camera_intrinsics"][[1, 0, 2]]
+
+        assert view["ray_map"].shape == (height, width, 6)
+        view["ray_map"] = view["ray_map"].swapaxes(0, 1)
+
+        assert view["sky_mask"].shape == (height, width)
+        view["sky_mask"] = view["sky_mask"].swapaxes(0, 1)
+
+        if "corres" in view:
+            # transpose correspondences x and y
+            view["corres"][0] = view["corres"][0][:, [1, 0]]
+            view["corres"][1] = view["corres"][1][:, [1, 0]]
diff --git a/extern/CUT3R/src/dust3r/datasets/base/batched_sampler.py b/extern/CUT3R/src/dust3r/datasets/base/batched_sampler.py
new file mode 100755
index 0000000000000000000000000000000000000000..0fb65a1142c510d193dcae3dd4f30679223646a2
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/base/batched_sampler.py
@@ -0,0 +1,94 @@
+import numpy as np
+import torch
+from accelerate import Accelerator
+import torch.utils
+from torch.utils.data import BatchSampler, Sampler
+import torch.utils.data
+
+
+class CustomRandomSampler(Sampler):
+    """Random sampling under a constraint: each sample in the batch has the same feature,
+    which is chosen randomly from a known pool of 'features' for each batch.
+
+    For instance, the 'feature' could be the image aspect-ratio.
+
+    The index returned is a tuple (sample_idx, feat_idx).
+    This sampler ensures that each series of `batch_size` indices has the same `feat_idx`.
+    """
+
+    def __init__(
+        self,
+        dataset,
+        batch_size,
+        pool_size,
+        min_view_size,
+        max_view_size,
+        world_size,
+        warmup=1,
+        drop_last=True,
+    ):
+        self.batch_size = batch_size
+        self.pool_size = pool_size
+        self.min_view_size = min_view_size
+        self.max_view_size = max_view_size
+        self.drop_last = drop_last
+        self.len_dataset = N = len(dataset)
+        self.total_size = N
+
+        self.epoch = None
+        self.epochf = 0.0
+
+    def __len__(self):
+        return self.total_size
+
+    def set_epoch(self, epoch):
+        self.epoch = epoch
+
+    def __iter__(self):
+        if self.epoch is None:
+            raise ValueError(
+                "Epoch number not set. Please call 'set_epoch(epoch)' before iterating."
+            )
+
+        seed = self.epoch + 788
+        rng = np.random.default_rng(seed=seed)
+        # random indices (will restart from 0 if not drop_last)
+        sample_idxs = np.arange(self.total_size)
+        rng.shuffle(sample_idxs)
+        # random feat_idxs (same across each batch)
+        n_batches = (self.total_size + self.batch_size - 1) // self.batch_size
+        if self.pool_size > 1:
+            p = np.ones(self.pool_size)
+            p[: self.pool_size // 2] *= 2
+            p = p / p.sum()
+            _feat_idxs = rng.choice(self.pool_size, size=n_batches, p=p)
+        else:
+            _feat_idxs = rng.integers(self.pool_size, size=n_batches)
+        _feat_idxs = np.broadcast_to(_feat_idxs[:, None], (n_batches, self.batch_size))
+        _feat_idxs = _feat_idxs.ravel()[: self.total_size]
+        _view_idxs = rng.integers(
+            self.min_view_size, self.max_view_size + 1, size=n_batches
+        )
+        _view_idxs = np.broadcast_to(_view_idxs[:, None], (n_batches, self.batch_size))
+        _view_idxs = _view_idxs.ravel()[: self.total_size]
+
+        idxs = np.c_[sample_idxs, _feat_idxs, _view_idxs]
+        yield from (tuple(idx) for idx in idxs)
+
+
+class BatchedRandomSampler(BatchSampler):
+    """Batch sampler that groups indices from RandomSampler into batches."""
+
+    def __init__(self, sampler: CustomRandomSampler, batch_size, drop_last=True):
+        self.sampler = sampler  # An instance of RandomSampler
+        self.batch_size = batch_size
+        self.drop_last = drop_last
+
+    def set_epoch(self, epoch):
+        self.sampler.set_epoch(epoch)
+
+
+def round_by(total, multiple, up=False):
+    if up:
+        total = total + multiple - 1
+    return (total // multiple) * multiple
diff --git a/extern/CUT3R/src/dust3r/datasets/base/easy_dataset.py b/extern/CUT3R/src/dust3r/datasets/base/easy_dataset.py
new file mode 100755
index 0000000000000000000000000000000000000000..67a57f57e89337727b732db348abafaa9a1a4335
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/base/easy_dataset.py
@@ -0,0 +1,198 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# modified from DUSt3R
+
+import numpy as np
+from dust3r.datasets.base.batched_sampler import (
+    BatchedRandomSampler,
+    CustomRandomSampler,
+)
+import torch
+
+
+class EasyDataset:
+    """Base class for datasets that can be resized and combined with operators.
+    Examples:
+    ---------
+        2 * dataset ==> MulDataset(2, dataset): every element is repeated 2x
+
+        10 @ dataset ==> ResizedDataset(10, dataset): the length becomes 10
+
+        dataset1 + dataset2 ==> CatDataset([dataset1, dataset2]): concatenation
+    """
+
+    def __add__(self, other):
+        return CatDataset([self, other])
+
+    def __rmul__(self, factor):
+        return MulDataset(factor, self)
+
+    def __rmatmul__(self, factor):
+        return ResizedDataset(factor, self)
+
+    def set_epoch(self, epoch):
+        pass  # no per-epoch state in the base class; subclasses may override
+
+    def make_sampler(
+        self, batch_size, shuffle=True, drop_last=True, world_size=1, rank=0, fixed_length=False
+    ):
+        if not shuffle:
+            raise NotImplementedError()  # only shuffled sampling is implemented
+        aspect_ratio_count = len(self._resolutions)
+        view_count = self.num_views
+        sampler = CustomRandomSampler(
+            self,
+            batch_size,
+            aspect_ratio_count,
+            view_count if fixed_length else 4,  # 4 unless fixed_length pins it to num_views
+            view_count,
+            world_size,
+            warmup=1,
+            drop_last=drop_last,
+        )
+        return BatchedRandomSampler(sampler, batch_size, drop_last)
+
+
+class MulDataset(EasyDataset):
+    """Artificially enlarge a dataset by repeating every element `multiplicator` times."""
+
+    multiplicator: int
+
+    def __init__(self, multiplicator, dataset):
+        assert isinstance(multiplicator, int) and multiplicator > 0
+        self.dataset = dataset
+        self.multiplicator = multiplicator
+
+    def __len__(self):
+        return len(self.dataset) * self.multiplicator
+
+    def __repr__(self):
+        return f"{self.multiplicator}*{self.dataset!r}"
+
+    def __getitem__(self, idx):
+        # each run of `multiplicator` consecutive indices maps to one wrapped element
+        if not isinstance(idx, tuple):
+            return self.dataset[idx // self.multiplicator]
+        sample_idx, feat_idx, view_idx = idx
+        return self.dataset[sample_idx // self.multiplicator, feat_idx, view_idx]
+
+    @property
+    def _resolutions(self):
+        return self.dataset._resolutions
+
+    @property
+    def num_views(self):
+        return self.dataset.num_views
+
+
+class ResizedDataset(EasyDataset):
+    """Artificially change the length of a dataset (index mapping built by set_epoch)."""
+
+    new_size: int
+
+    def __init__(self, new_size, dataset):
+        assert isinstance(new_size, int) and new_size > 0
+        self.dataset = dataset
+        self.new_size = new_size
+
+    def __len__(self):
+        return self.new_size
+
+    def __repr__(self):
+        # write the size with "_" between groups of three digits, e.g. 1_000_000
+        digits = str(self.new_size)
+        for group in range((len(digits) - 1) // 3):
+            cut = -(4 * group + 3)
+            digits = f"{digits[:cut]}_{digits[cut:]}"
+        return f"{digits} @ {self.dataset!r}"
+
+    def set_epoch(self, epoch):
+        # the generator is seeded from the epoch alone
+        rng = np.random.default_rng(seed=epoch + 777)
+
+        # shuffle every index of the wrapped dataset
+        perm = rng.permutation(len(self.dataset))
+
+        # repeat the permutation cyclically until it covers the target size,
+        # then cut it down to exactly new_size entries
+        repeats = 1 + (len(self) - 1) // len(self.dataset)
+        self._idxs_mapping = np.tile(perm, repeats)[: self.new_size]
+
+        assert len(self._idxs_mapping) == self.new_size
+
+    def __getitem__(self, idx):
+        assert hasattr(
+            self, "_idxs_mapping"
+        ), "You need to call dataset.set_epoch() to use ResizedDataset.__getitem__()"
+        if not isinstance(idx, tuple):
+            return self.dataset[self._idxs_mapping[idx]]
+        # tuple index: only the first entry is remapped, the other two pass through
+        sample_idx, feat_idx, view_idx = idx
+        return self.dataset[self._idxs_mapping[sample_idx], feat_idx, view_idx]
+
+    @property
+    def _resolutions(self):
+        return self.dataset._resolutions
+
+    @property
+    def num_views(self):
+        return self.dataset.num_views
+
+
+class CatDataset(EasyDataset):
+    """Concatenation of several datasets, indexed as one contiguous range."""
+
+    def __init__(self, datasets):
+        assert all(isinstance(dataset, EasyDataset) for dataset in datasets)
+        self.datasets = datasets
+        # running totals of the lengths: entry i is the end offset of dataset i
+        self._cum_sizes = np.cumsum([len(dataset) for dataset in datasets])
+
+    def __len__(self):
+        return self._cum_sizes[-1]
+
+    def __repr__(self):
+        # strip the uselessly long default transform from every child repr
+        verbose_transform = (
+            ",transform=Compose( ToTensor() Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)))"
+        )
+        parts = [repr(dataset).replace(verbose_transform, "") for dataset in self.datasets]
+        return " + ".join(parts)
+
+    def set_epoch(self, epoch):
+        for child in self.datasets:
+            child.set_epoch(epoch)
+
+    def __getitem__(self, idx):
+        feat_idx = None
+        if isinstance(idx, tuple):
+            idx, feat_idx, view_idx = idx
+
+        if idx < 0 or idx >= len(self):
+            raise IndexError()
+
+        # first dataset whose end offset lies strictly beyond idx
+        db_idx = np.searchsorted(self._cum_sizes, idx, "right")
+        start = self._cum_sizes[db_idx - 1] if db_idx > 0 else 0
+        local_idx = idx - start
+
+        # view_idx is only read when feat_idx was set by the tuple unpacking above
+        if feat_idx is not None and view_idx is not None:
+            local_idx = (local_idx, feat_idx, view_idx)
+        return self.datasets[db_idx][local_idx]
+
+    @property
+    def _resolutions(self):
+        # all children must agree; the first child's value is returned
+        resolutions = self.datasets[0]._resolutions
+        assert all(tuple(d._resolutions) == tuple(resolutions) for d in self.datasets[1:])
+        return resolutions
+
+    @property
+    def num_views(self):
+        # all children must agree; the first child's value is returned
+        num_views = self.datasets[0].num_views
+        assert all(d.num_views == num_views for d in self.datasets[1:])
+        return num_views
diff --git a/extern/CUT3R/src/dust3r/datasets/bedlam.py b/extern/CUT3R/src/dust3r/datasets/bedlam.py
new file mode 100644
index 0000000000000000000000000000000000000000..f680a29fd8b446d30db51d531a939de5abf9e521
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/bedlam.py
@@ -0,0 +1,297 @@
+import os.path as osp
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from tqdm import tqdm
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+
+invalid_seqs = [
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000042",
+    "20221024_10_100_batch01handhair_zoom_suburb_d_seq_000059",
+    "20221024_3-10_100_batch01handhair_static_highSchoolGym_seq_000079",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000978",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000081",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000268",
+    "20221024_3-10_100_batch01handhair_static_highSchoolGym_seq_000089",
+    "20221013_3_250_batch01hand_orbit_bigOffice_seq_000189",
+    "20221024_3-10_100_batch01handhair_static_highSchoolGym_seq_000034",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000889",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000293",
+    "20221019_3-8_250_highbmihand_orbit_stadium_seq_000067",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000904",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000434",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000044",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000013",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000396",
+    "20221024_3-10_100_batch01handhair_static_highSchoolGym_seq_000012",
+    "20221024_3-10_100_batch01handhair_static_highSchoolGym_seq_000082",
+    "20221013_3_250_batch01hand_orbit_bigOffice_seq_000120",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000324",
+    "20221013_3_250_batch01hand_static_bigOffice_seq_000038",
+    "20221012_3-10_500_batch01hand_zoom_highSchoolGym_seq_000486",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000421",
+    "20221013_3_250_batch01hand_orbit_bigOffice_seq_000226",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000012",
+    "20221013_3_250_batch01hand_orbit_bigOffice_seq_000149",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000311",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000080",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000122",
+    "20221012_3-10_500_batch01hand_zoom_highSchoolGym_seq_000079",
+    "20221024_3-10_100_batch01handhair_static_highSchoolGym_seq_000077",
+    "20221014_3_250_batch01hand_orbit_archVizUI3_time15_seq_000095",
+    "20221019_3-8_250_highbmihand_orbit_stadium_seq_000062",
+    "20221013_3_250_batch01hand_static_bigOffice_seq_000015",
+    "20221024_3-10_100_batch01handhair_static_highSchoolGym_seq_000095",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000119",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000297",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000011",
+    "20221013_3_250_batch01hand_orbit_bigOffice_seq_000196",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000316",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000283",
+    "20221019_3-8_250_highbmihand_orbit_stadium_seq_000085",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000287",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000163",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000804",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000842",
+    "20221019_3-8_250_highbmihand_orbit_stadium_seq_000027",
+    "20221013_3_250_batch01hand_orbit_bigOffice_seq_000182",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000982",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000029",
+    "20221019_3-8_250_highbmihand_orbit_stadium_seq_000031",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000025",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000250",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000785",
+    "20221024_10_100_batch01handhair_zoom_suburb_d_seq_000069",
+    "20221013_3_250_batch01hand_orbit_bigOffice_seq_000122",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000246",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000352",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000425",
+    "20221013_3_250_batch01hand_orbit_bigOffice_seq_000192",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000900",
+    "20221024_3-10_100_batch01handhair_static_highSchoolGym_seq_000043",
+    "20221024_3-10_100_batch01handhair_static_highSchoolGym_seq_000063",
+    "20221014_3_250_batch01hand_orbit_archVizUI3_time15_seq_000096",
+    "20221019_3-8_250_highbmihand_orbit_stadium_seq_000091",
+    "20221019_3-8_250_highbmihand_orbit_stadium_seq_000013",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000309",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000114",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000969",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000361",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000267",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000083",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000383",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000890",
+    "20221019_3-8_250_highbmihand_orbit_stadium_seq_000003",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000045",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000317",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000076",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000082",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000907",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000279",
+    "20221019_3-8_250_highbmihand_orbit_stadium_seq_000076",
+    "20221024_3-10_100_batch01handhair_static_highSchoolGym_seq_000004",
+    "20221024_3-10_100_batch01handhair_static_highSchoolGym_seq_000061",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000811",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000800",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000841",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000794",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000308",
+    "20221024_10_100_batch01handhair_zoom_suburb_d_seq_000064",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000284",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000752",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000269",
+    "20221019_3-8_250_highbmihand_orbit_stadium_seq_000036",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000419",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000290",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000322",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000818",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000327",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000326",
+    "20221024_3-10_100_batch01handhair_static_highSchoolGym_seq_000002",
+    "20221024_10_100_batch01handhair_zoom_suburb_d_seq_000060",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000348",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000059",
+    "20221019_3-8_250_highbmihand_orbit_stadium_seq_000016",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000817",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000332",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000094",
+    "20221013_3_250_batch01hand_orbit_bigOffice_seq_000193",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000779",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000177",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000368",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000023",
+    "20221024_3-10_100_batch01handhair_static_highSchoolGym_seq_000024",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000310",
+    "20221014_3_250_batch01hand_orbit_archVizUI3_time15_seq_000086",
+    "20221019_3-8_250_highbmihand_orbit_stadium_seq_000038",
+    "20221024_10_100_batch01handhair_zoom_suburb_d_seq_000071",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000768",
+    "20221024_3-10_100_batch01handhair_static_highSchoolGym_seq_000017",
+    "20221024_3-10_100_batch01handhair_static_highSchoolGym_seq_000053",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000097",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000856",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000827",
+    "20221013_3_250_batch01hand_orbit_bigOffice_seq_000161",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000084",
+    "20221019_3-8_250_highbmihand_orbit_stadium_seq_000106",
+    "20221013_3_250_batch01hand_orbit_bigOffice_seq_000207",
+    "20221019_3-8_250_highbmihand_orbit_stadium_seq_000007",
+    "20221024_3-10_100_batch01handhair_static_highSchoolGym_seq_000013",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000251",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000796",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000105",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000251",
+    "20221019_3-8_250_highbmihand_orbit_stadium_seq_000046",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000334",
+    "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000453",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000373",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000283",
+    "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000249",
+]
+hdri_scenes = [
+    "20221010_3_1000_batch01hand",
+    "20221017_3_1000_batch01hand",
+    "20221018_3-8_250_batch01hand",
+    "20221019_3_250_highbmihand",
+]
+
+
+class BEDLAM_Multi(BaseMultiViewDataset):
+    def __init__(self, *args, ROOT, **kwargs):
+        self.ROOT = ROOT
+        # poses are read from a sibling directory named "<basename of ROOT>_pose"
+        root_parent, root_name = os.path.split(ROOT)
+        self.pose_root = os.path.join(root_parent, f"{root_name}_pose")
+        assert os.path.exists(self.pose_root)
+        self.video = True
+        self.is_metric = True
+        self.max_interval = 4
+        super().__init__(*args, **kwargs)
+        self.loaded_data = self._load_data()  # _load_data has no return statement: this is None
+
+    def _load_data(self):
+        # Scan ROOT once and build flat, parallel per-image lists for all kept scenes.
+        self.scenes = os.listdir(self.ROOT)
+        excluded_names = set(invalid_seqs)
+        hdri_prefixes = tuple(hdri_scenes)
+
+        kept_scenes = []
+        sceneids = []  # per image: index of its scene in kept_scenes
+        images = []  # per image: file name without the ".png" suffix
+        scene_img_list = []  # per scene: global ids of its images
+        start_img_ids = []  # global ids usable as the first frame of a sample
+
+        offset = 0  # global id of the first image of the scene being processed
+        for scene in tqdm(self.scenes):
+            if (
+                scene in excluded_names
+                or scene.startswith(hdri_prefixes)
+                or "closeup" in scene
+            ):
+                continue
+            rgb_dir = osp.join(self.ROOT, scene, "rgb")
+            basenames = sorted(
+                name[:-4] for name in os.listdir(rgb_dir) if name.endswith(".png")
+            )
+            num_imgs = len(basenames)
+            # a scene needs at least `cut_off` frames to contribute a start frame
+            if self.allow_repeat:
+                cut_off = max(self.num_views // 3, 3)
+            else:
+                cut_off = self.num_views
+            if num_imgs < cut_off:
+                print(f"Skipping {scene}")
+                continue
+            img_ids = list(np.arange(num_imgs) + offset)
+
+            start_img_ids.extend(img_ids[: num_imgs - cut_off + 1])
+            sceneids.extend([len(kept_scenes)] * num_imgs)
+            images.extend(basenames)
+            scene_img_list.append(img_ids)
+            kept_scenes.append(scene)
+            offset += num_imgs
+
+        self.scenes = kept_scenes
+        # every kept scene must also exist as an entry under pose_root
+        assert len(set(self.scenes) - set(os.listdir(self.pose_root))) == 0
+        self.sceneids = sceneids
+        self.images = images
+        self.start_img_ids = start_img_ids
+        self.scene_img_list = scene_img_list
+
+    def __len__(self):
+        return len(self.start_img_ids)  # one sample per start frame collected in _load_data
+
+    def get_image_num(self):
+        return len(self.images)  # images of all kept scenes, not only the start frames
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        start_id = self.start_img_ids[idx]  # global image id of the first frame
+        all_image_ids = self.scene_img_list[self.sceneids[start_id]]  # ids of that frame's scene
+        pos, ordered_video = self.get_seq_from_start_id(
+            num_views,
+            start_id,
+            all_image_ids,
+            rng,
+            max_interval=self.max_interval,
+            video_prob=1.0,
+            fix_interval_prob=1.0,
+        )
+        image_idxs = np.array(all_image_ids)[pos]  # `pos` indexes into all_image_ids
+
+        views = []
+        for v, view_idx in enumerate(image_idxs):
+            scene_id = self.sceneids[view_idx]
+            scene_dir = osp.join(self.ROOT, self.scenes[scene_id])
+            rgb_dir = osp.join(scene_dir, "rgb")
+            depth_dir = osp.join(scene_dir, "depth")
+            cam_dir = osp.join(osp.join(self.pose_root, self.scenes[scene_id]), "cam")
+
+            basename = self.images[view_idx]  # shared stem of the .png / .npy / .npz files
+
+            # Load RGB image
+            rgb_image = imread_cv2(osp.join(rgb_dir, basename + ".png"))
+            # Load depthmap
+            depthmap = np.load(osp.join(depth_dir, basename + ".npy"))
+            depthmap[~np.isfinite(depthmap)] = 0  # invalid
+            depthmap[depthmap > 200.0] = 0.0  # values above 200 are zeroed as well
+
+            cam = np.load(osp.join(cam_dir, basename + ".npz"))  # holds "pose" and "intrinsics"
+            camera_pose = cam["pose"]
+            intrinsics = cam["intrinsics"]
+            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
+            )
+
+            # generate img mask and raymap mask
+            img_mask, ray_mask = self.get_img_and_ray_masks(
+                self.is_metric, v, rng, p=[0.85, 0.10, 0.05]
+            )
+
+            views.append(
+                dict(
+                    img=rgb_image,
+                    depthmap=depthmap.astype(np.float32),
+                    camera_pose=camera_pose.astype(np.float32),
+                    camera_intrinsics=intrinsics.astype(np.float32),
+                    dataset="BEDLAM",
+                    label=self.scenes[scene_id] + "_" + basename,
+                    instance=osp.join(rgb_dir, basename + ".png"),
+                    is_metric=self.is_metric,
+                    is_video=ordered_video,
+                    quantile=np.array(1, dtype=np.float32),
+                    img_mask=img_mask,
+                    ray_mask=ray_mask,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=False,
+                    reset=False,
+                )
+            )
+        assert len(views) == num_views  # one view dict per requested view
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/blendedmvs.py b/extern/CUT3R/src/dust3r/datasets/blendedmvs.py
new file mode 100755
index 0000000000000000000000000000000000000000..e9c290bbc1a6c0535e2267dc4e5eb0ecc62b6019
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/blendedmvs.py
@@ -0,0 +1,305 @@
+import os.path as osp
+import numpy as np
+import os
+import sys
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+import h5py
+from tqdm import tqdm
+
+
+class BlendedMVS_Multi(BaseMultiViewDataset):
+    """Dataset of outdoor street scenes, 5 images each time"""
+
+    def __init__(self, *args, ROOT, split=None, **kwargs):
+        self.ROOT = ROOT
+        self.video = False
+        self.is_metric = False
+        super().__init__(*args, **kwargs)
+        # `split` is accepted but unused here and not forwarded to the base class
+        self._load_data()
+
+    def _load_data(self):
+        # Read the per-scene data, then derive the counts and start with empty caches.
+        overlap_path = os.path.join(self.ROOT, "new_overlap.h5")
+        self.data_dict = self.read_h5_file(overlap_path)
+        self.num_imgs = sum(len(entry["basenames"]) for entry in self.data_dict.values())
+        self.num_scenes = len(self.data_dict)
+        self.invalid_scenes = []
+        self.is_reachable_cache = {scene: {} for scene in self.data_dict}
+
+    def read_h5_file(self, h5_file_path):
+        # Per scene group: the image basenames and the adjacency list of its score matrix.
+        self.all_ref_imgs = []
+        scenes = {}
+        with h5py.File(h5_file_path, "r") as h5:
+            for scene_name in tqdm(h5.keys()):
+                group = h5[scene_name]
+                basenames = group["basenames"][:]
+                indices = group["indices"][:]
+                values = group["values"][:]
+                # the score matrix is stored sparsely; rebuild its dense form
+                dense_scores = np.zeros(group.attrs["shape"], dtype=np.float32)
+                dense_scores[indices[0], indices[1]] = values
+                scenes[scene_name] = dict(
+                    basenames=basenames,
+                    score_matrix=self.build_adjacency_list(dense_scores),
+                )
+                self.all_ref_imgs.extend(
+                    (scene_name, img_idx) for img_idx in range(len(basenames))
+                )
+        return scenes
+
+    @staticmethod
+    def build_adjacency_list(S, thresh=0.2):
+        # Keep only scores above `thresh`; each kept edge stores (neighbor, score - thresh).
+        shifted = np.maximum(S - thresh, 0)
+        adjacency_list = [[] for _ in range(len(S))]
+        rows, cols = np.nonzero(shifted)
+        for src, dst in zip(rows, cols):
+            adjacency_list[src].append((dst, shifted[src][dst]))
+        return adjacency_list
+
+    @staticmethod
+    def is_reachable(adjacency_list, start_index, k):
+        # Iterative depth-first search that stops as soon as k distinct nodes were reached.
+        seen = set()
+        pending = [start_index]
+        while pending and len(seen) < k:
+            node = pending.pop()
+            if node in seen:
+                continue
+            seen.add(node)
+            pending.extend(nb[0] for nb in adjacency_list[node] if nb[0] not in seen)
+        return len(seen) >= k
+
+    @staticmethod
+    def random_sequence_no_revisit_with_backtracking(
+        adjacency_list, k, start_index, rng: np.random.Generator
+    ):
+        """Sample a path of ``k`` distinct nodes starting at ``start_index``.
+
+        Depth-first search with backtracking. At each node the neighbors are
+        tried in a random order drawn without replacement, weighted by the edge
+        scores stored in ``adjacency_list``.
+
+        Args:
+            adjacency_list: per node, a list of ``(neighbor_index, weight)`` pairs.
+            k: number of distinct nodes the returned path must contain.
+            start_index: node the path starts from.
+            rng: generator used to order the neighbors.
+
+        Returns:
+            The path as a list of node indices, or ``None`` when no such path exists.
+        """
+        def shuffled_neighbors(node):
+            # Iterator over the neighbors of `node` in weighted random order.
+            neighbors = adjacency_list[node]
+            if not neighbors:
+                # Dead end: nothing to order, and empty weights (sum 0) cannot be normalised.
+                return iter(())
+            idxs = [n[0] for n in neighbors]
+            weights = np.array([n[1] for n in neighbors])
+            order = rng.choice(idxs, size=len(idxs), replace=False, p=weights / weights.sum())
+            return iter(order.tolist())
+
+        path = [start_index]
+        visited = {start_index}
+        neighbor_iterators = [shuffled_neighbors(start_index)]
+
+        while len(path) < k:
+            if not neighbor_iterators:
+                return None  # backtracked past the start node: no possible sequence
+            next_index = next(neighbor_iterators[-1], None)
+            if next_index is None:
+                # no more neighbors to try at this node: drop it and backtrack
+                neighbor_iterators.pop()
+                visited.remove(path.pop())
+            elif next_index not in visited:
+                path.append(next_index)
+                visited.add(next_index)
+                neighbor_iterators.append(shuffled_neighbors(next_index))
+        return path
+
+    @staticmethod
+    def random_sequence_with_optional_repeats(
+        adjacency_list,
+        k,
+        start_index,
+        rng: np.random.Generator,
+        max_k=None,
+        max_attempts=100,
+    ):
+        """Random walk padded to ``max_k`` entries that must touch ``k`` distinct nodes.
+
+        Each step moves to a neighbor of the current node, drawn with probability
+        proportional to its edge weight. Unvisited neighbors are preferred; nodes
+        are revisited only when every neighbor has already been visited.
+
+        Args:
+            adjacency_list: per node, a list of ``(neighbor_index, weight)`` pairs.
+            k: minimum number of distinct nodes the walk has to contain.
+            start_index: first node of the walk.
+            rng: generator used for every random draw.
+            max_k: length the path is extended to; defaults to ``k``.
+            max_attempts: upper bound on the number of walk steps.
+
+        Returns:
+            The walk as a list of node indices, or ``None`` when fewer than ``k``
+            distinct nodes were reached.
+        """
+        target_len = k if max_k is None else max_k
+        path = [start_index]
+        visited = {start_index}
+        current = start_index
+        steps = 0
+
+        while len(path) < target_len and steps < max_attempts:
+            steps += 1
+            neighbors = adjacency_list[current]
+            if not neighbors:
+                break  # no neighbors, the walk cannot proceed further
+
+            # Prefer neighbors not seen so far; fall back to all of them otherwise.
+            fresh = [n for n in neighbors if n[0] not in visited]
+            pool = fresh if fresh else neighbors
+            idxs = [n[0] for n in pool]
+            weights = [n[1] for n in pool]
+            probabilities = np.array(weights) / np.sum(weights)
+            current = rng.choice(idxs, p=probabilities)
+            # only a first visit grows the visited set
+            if fresh:
+                visited.add(current)
+            path.append(current)
+
+        if len(set(path)) < k:
+            return None  # could not reach k unique nodes
+
+        # A walk shorter than target_len is padded with random nodes already on the path.
+        while len(path) < target_len:
+            path.append(rng.choice(path))
+        return path
+
+    def __len__(self):
+        return len(self.all_ref_imgs)  # one entry per (scene, image index) pair
+
+    def get_image_num(self):
+        return self.num_imgs  # summed basename counts of all scenes, set in _load_data
+
+    def get_stats(self):
+        return f"{len(self)} imgs from {self.num_scenes} scenes"  # short summary string
+
+    def generate_sequence(
+        self, scene, adj_list, num_views, start_index, rng, allow_repeat=False
+    ):
+        """Sample a sequence of image indices for `scene` starting at `start_index`.
+
+        Reachability of the start index is memoised in
+        ``self.is_reachable_cache[scene]``; a start index whose sampling fails is
+        recorded there as unreachable. Returns the sampled sequence, or None when
+        none could be generated.
+        """
+        # number of distinct images required: all views, or fewer when repeats are allowed
+        cutoff = max(num_views // 5, 3) if allow_repeat else num_views
+        reachable = self.is_reachable_cache[scene]
+        if start_index not in reachable:
+            reachable[start_index] = self.is_reachable(adj_list, start_index, cutoff)
+        if not reachable[start_index]:
+            print(f"Cannot reach {num_views} unique elements from index {start_index}.")
+            return None
+
+        if allow_repeat:
+            sequence = self.random_sequence_with_optional_repeats(
+                adj_list, cutoff, start_index, rng, max_k=num_views
+            )
+        else:
+            sequence = self.random_sequence_no_revisit_with_backtracking(
+                adj_list, cutoff, start_index, rng
+            )
+        if not sequence:
+            reachable[start_index] = False
+            print("Failed to generate a sequence without revisiting.")
+        return sequence
+
+    def _get_views(self, idx, resolution, rng: np.random.Generator, num_views):
+        scene_info, ref_img_idx = self.all_ref_imgs[idx]  # (scene name, index into its basenames)
+        invalid_seq = True
+        ordered_video = False
+
+        while invalid_seq:  # retry until generate_sequence returns a sequence
+            basenames = self.data_dict[scene_info]["basenames"]
+            if (
+                sum(
+                    [
+                        (1 - int(x))
+                        for x in list(self.is_reachable_cache[scene_info].values())
+                    ]
+                )  # number of start indices cached as unreachable for this scene
+                > len(basenames) - self.num_views  # NOTE(review): self.num_views, not the argument
+            ):
+                self.invalid_scenes.append(scene_info)  # NOTE(review): may append duplicates
+            while scene_info in self.invalid_scenes:  # redraw a reference until its scene is valid
+                idx = rng.integers(low=0, high=len(self.all_ref_imgs))
+                scene_info, ref_img_idx = self.all_ref_imgs[idx]
+                basenames = self.data_dict[scene_info]["basenames"]
+
+            score_matrix = self.data_dict[scene_info]["score_matrix"]  # adjacency list, see read_h5_file
+            imgs_idxs = self.generate_sequence(
+                scene_info, score_matrix, num_views, ref_img_idx, rng, self.allow_repeat
+            )
+
+            if imgs_idxs is None:
+                random_direction = 2 * rng.choice(2) - 1  # -1 or +1
+                for offset in range(1, len(basenames)):
+                    tentative_im_idx = (
+                        ref_img_idx + (random_direction * offset)
+                    ) % len(basenames)  # wraps around the scene's image list
+                    if (
+                        tentative_im_idx not in self.is_reachable_cache[scene_info]
+                        or self.is_reachable_cache[scene_info][tentative_im_idx]
+                    ):  # first index that is untested or cached as reachable
+                        ref_img_idx = tentative_im_idx
+                        break
+            else:
+                invalid_seq = False
+        views = []
+        for view_idx in imgs_idxs:
+            scene_dir = osp.join(self.ROOT, scene_info)
+            impath = basenames[view_idx].decode("utf-8")  # basenames are stored as bytes
+            image = imread_cv2(osp.join(scene_dir, impath + ".jpg"))
+            depthmap = imread_cv2(osp.join(scene_dir, impath + ".exr"))
+            camera_params = np.load(osp.join(scene_dir, impath + ".npz"))
+
+            intrinsics = np.float32(camera_params["intrinsics"])
+            camera_pose = np.eye(4, dtype=np.float32)  # 4x4 pose filled from R and t below
+            camera_pose[:3, :3] = camera_params["R_cam2world"]
+            camera_pose[:3, 3] = camera_params["t_cam2world"]
+
+            image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                image, depthmap, intrinsics, resolution, rng, info=(scene_dir, impath)
+            )
+
+            views.append(
+                dict(
+                    img=image,
+                    depthmap=depthmap,
+                    camera_pose=camera_pose,  # cam2world
+                    camera_intrinsics=intrinsics,
+                    dataset="BlendedMVS",
+                    label=osp.relpath(scene_dir, self.ROOT),
+                    is_metric=self.is_metric,
+                    is_video=ordered_video,
+                    instance=osp.join(scene_dir, impath + ".jpg"),
+                    quantile=np.array(0.97, dtype=np.float32),
+                    img_mask=True,
+                    ray_mask=False,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=False,
+                    reset=False,
+                )
+            )
+        assert len(views) == num_views  # one view dict per requested view
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/co3d.py b/extern/CUT3R/src/dust3r/datasets/co3d.py
new file mode 100755
index 0000000000000000000000000000000000000000..98dcc820fcd70fd496396ef000c22aeb2adee35a
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/co3d.py
@@ -0,0 +1,190 @@
+import os.path as osp
+import json
+import itertools
+from collections import deque
+import sys
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+import cv2
+import numpy as np
+import time
+
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+
+
+class Co3d_Multi(BaseMultiViewDataset):
+    def __init__(self, mask_bg="rand", *args, ROOT, **kwargs):
+        """Index the sequences listed in ``selected_seqs_<split>.json`` under ``ROOT``."""
+        self.ROOT = ROOT
+        super().__init__(*args, **kwargs)
+        assert mask_bg in (True, False, "rand")
+        self.mask_bg = mask_bg
+        self.is_metric = False
+        self.dataset_label = "Co3d_v2"
+        # Flatten {obj: {instance: frames}} into {(obj, instance): frames}.
+        with open(osp.join(self.ROOT, f"selected_seqs_{self.split}.json"), "r") as f:
+            per_object = json.load(f)
+        self.scenes = {
+            (k, k2): v2 for k, v in per_object.items() for k2, v2 in v.items()
+        }
+        self.scene_list = list(self.scenes.keys())
+        # Fewest frames a sequence may have before _get_views rejects it.
+        self.cut_off = (
+            self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
+        )
+        # max(..., 0): a negative stop would keep frames of too-short sequences.
+        self.all_ref_imgs = [
+            (key, value)
+            for key, values in self.scenes.items()
+            for value in values[: max(len(values) - self.cut_off + 1, 0)]
+        ]
+        self.invalidate = {scene: {} for scene in self.scene_list}
+        self.invalid_scenes = {scene: False for scene in self.scene_list}
+
+    def __len__(self):  # one entry per (scene, reference frame) candidate
+        return len(self.all_ref_imgs)
+
+    def _get_metadatapath(self, obj, instance, view_idx):  # "06d": locale-independent
+        return osp.join(self.ROOT, obj, instance, "images", f"frame{view_idx:06d}.npz")
+
+    def _get_impath(self, obj, instance, view_idx):  # "06d": locale-independent
+        return osp.join(self.ROOT, obj, instance, "images", f"frame{view_idx:06d}.jpg")
+
+    def _get_depthpath(self, obj, instance, view_idx):
+        """Return the depth PNG path of frame ``view_idx`` (locale-independent name)."""
+        name = f"frame{view_idx:06d}.jpg.geometric.png"
+        return osp.join(self.ROOT, obj, instance, "depths", name)
+
+    def _get_maskpath(self, obj, instance, view_idx):  # "06d": locale-independent
+        return osp.join(self.ROOT, obj, instance, "masks", f"frame{view_idx:06d}.png")
+
+    def _read_depthmap(self, depthpath, input_metadata):
+        """Read the depth image and rescale it with the stored ``maximum_depth``."""
+        raw = imread_cv2(depthpath, cv2.IMREAD_UNCHANGED)
+        unit_depth = raw.astype(np.float32) / 65535  # pixel values -> [0, 1] for 16-bit data
+        # nan_to_num: a NaN maximum depth becomes 0, which zeroes the whole map
+        return unit_depth * np.nan_to_num(input_metadata["maximum_depth"])
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        """Load ``num_views`` views of one sequence, resampling invalid scenes and frames."""
+        invalid_seq = True
+        scene_info, ref_img_idx = self.all_ref_imgs[idx]
+        while invalid_seq:
+            while self.invalid_scenes[scene_info]:
+                idx = rng.integers(low=0, high=len(self.all_ref_imgs))
+                scene_info, ref_img_idx = self.all_ref_imgs[idx]
+
+            obj, instance = scene_info
+
+            image_pool = self.scenes[obj, instance]
+            if len(image_pool) < self.cut_off:
+                print("Invalid scene!")
+                self.invalid_scenes[scene_info] = True
+                continue
+
+            imgs_idxs, ordered_video = self.get_seq_from_start_id(
+                num_views, ref_img_idx, image_pool, rng
+            )
+
+            if resolution not in self.invalidate[obj, instance]:  # flag invalid images
+                self.invalidate[obj, instance][resolution] = [
+                    False for _ in range(len(image_pool))
+                ]
+            # decide now if we mask the bg
+            mask_bg = (self.mask_bg == True) or (
+                self.mask_bg == "rand" and rng.choice(2, p=[0.9, 0.1])
+            )
+            views = []
+
+            imgs_idxs = deque(imgs_idxs)
+
+            while len(imgs_idxs) > 0:  # some images (few) have zero depth
+                if (
+                    len(image_pool) - sum(self.invalidate[obj, instance][resolution])
+                    < self.cut_off
+                ):
+                    print("Invalid scene!")
+                    invalid_seq = True
+                    self.invalid_scenes[scene_info] = True
+                    break
+
+                im_idx = imgs_idxs.pop()  # pops from the right end of the deque
+                if self.invalidate[obj, instance][resolution][im_idx]:
+                    # search for a valid image
+                    ordered_video = False
+                    random_direction = 2 * rng.choice(2) - 1
+                    for offset in range(1, len(image_pool)):
+                        tentative_im_idx = (im_idx + (random_direction * offset)) % len(
+                            image_pool
+                        )
+                        if not self.invalidate[obj, instance][resolution][
+                            tentative_im_idx
+                        ]:
+                            im_idx = tentative_im_idx
+                            break
+                view_idx = image_pool[im_idx]
+                impath = self._get_impath(obj, instance, view_idx)
+                depthpath = self._get_depthpath(obj, instance, view_idx)
+
+                # load camera params
+                metadata_path = self._get_metadatapath(obj, instance, view_idx)
+                input_metadata = np.load(metadata_path)
+                camera_pose = input_metadata["camera_pose"].astype(np.float32)
+                intrinsics = input_metadata["camera_intrinsics"].astype(np.float32)
+
+                # load image and depth
+                rgb_image = imread_cv2(impath)
+                depthmap = self._read_depthmap(depthpath, input_metadata)
+
+                if mask_bg:
+                    # load object mask
+                    maskpath = self._get_maskpath(obj, instance, view_idx)
+                    maskmap = imread_cv2(maskpath, cv2.IMREAD_UNCHANGED).astype(
+                        np.float32
+                    )
+                    maskmap = (maskmap / 255.0) > 0.1
+
+                    # update the depthmap with mask
+                    depthmap *= maskmap
+                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                    rgb_image, depthmap, intrinsics, resolution, rng=rng, info=impath
+                )
+                num_valid = (depthmap > 0.0).sum()
+                if num_valid == 0:
+                    # problem, invalidate image and retry
+                    self.invalidate[obj, instance][resolution][im_idx] = True
+                    imgs_idxs.append(im_idx)
+                    continue
+
+                # generate img mask and raymap mask
+                img_mask, ray_mask = self.get_img_and_ray_masks(
+                    self.is_metric, len(views), rng
+                )
+
+                views.append(
+                    dict(
+                        img=rgb_image,
+                        depthmap=depthmap,
+                        camera_pose=camera_pose,
+                        camera_intrinsics=intrinsics,
+                        dataset=self.dataset_label,
+                        label=osp.join(obj, instance),
+                        instance=osp.split(impath)[1],
+                        is_metric=self.is_metric,
+                        is_video=ordered_video,
+                        quantile=np.array(0.9, dtype=np.float32),
+                        img_mask=img_mask,
+                        ray_mask=ray_mask,
+                        camera_only=False,
+                        depth_only=False,
+                        single_view=False,
+                        reset=False,
+                    )
+                )
+            # Accept a complete set of views unless they are all the same image; a
+            # single requested view is trivially "all the same" and used to loop forever.
+            same_image = all(v["instance"] == views[0]["instance"] for v in views)
+            if len(views) == num_views and (num_views == 1 or not same_image):
+                invalid_seq = False
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/cop3d.py b/extern/CUT3R/src/dust3r/datasets/cop3d.py
new file mode 100755
index 0000000000000000000000000000000000000000..aa93c7d109f80d70869250b8a44daf59cf202e0f
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/cop3d.py
@@ -0,0 +1,110 @@
+import os.path as osp
+import sys
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+import cv2
+import numpy as np
+
+from dust3r.datasets.co3d import Co3d_Multi
+from dust3r.utils.image import imread_cv2
+
+
+class Cop3D_Multi(Co3d_Multi):
+    def __init__(self, mask_bg="rand", *args, ROOT, **kwargs):  # same arguments as Co3d_Multi
+        super().__init__(mask_bg, *args, ROOT=ROOT, **kwargs)
+        self.dataset_label = "Cop3D"  # replaces the parent's "Co3d_v2" label
+        self.is_metric = False  # the parent __init__ already sets this to False
+
+    def _get_metadatapath(self, obj, instance, view_idx):  # "06d": locale-independent
+        return osp.join(self.ROOT, obj, instance, "images", f"frame{view_idx:06d}.npz")
+
+    def _get_impath(self, obj, instance, view_idx):  # "06d": locale-independent
+        return osp.join(self.ROOT, obj, instance, "images", f"frame{view_idx:06d}.jpg")
+
+    def _get_depthpath(self, obj, instance, view_idx):
+        # no depth: return the RGB path, which _read_depthmap only uses for the resolution
+        return osp.join(self.ROOT, obj, instance, "images", f"frame{view_idx:06d}.jpg")
+
+    def _get_maskpath(self, obj, instance, view_idx):  # "06d": locale-independent
+        return osp.join(self.ROOT, obj, instance, "masks", f"frame{view_idx:06d}.png")
+
+    def _read_depthmap(self, impath, input_metadata):
+        """Return an all-ones placeholder depth map sized like the image at ``impath``."""
+        # no ground-truth depth is read here; shape[:2] also copes with single-channel images
+        img = imread_cv2(impath, cv2.IMREAD_UNCHANGED)
+        return np.ones(img.shape[:2], dtype=np.float32)
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        """Load ``num_views`` camera-only views (placeholder depth) of one sequence."""
+        invalid_seq = True
+        scene_info, ref_img_idx = self.all_ref_imgs[idx]
+        while invalid_seq:
+            while self.invalid_scenes[scene_info]:
+                idx = rng.integers(low=0, high=len(self.all_ref_imgs))
+                scene_info, ref_img_idx = self.all_ref_imgs[idx]
+
+            obj, instance = scene_info
+            # NOTE(review): compares against self.num_views, whereas Co3d_Multi uses self.cut_off - confirm
+            image_pool = self.scenes[obj, instance]
+            if len(image_pool) < self.num_views:
+                print("Invalid scene!")
+                self.invalid_scenes[scene_info] = True
+                continue
+
+            imgs_idxs, ordered_video = self.get_seq_from_start_id(
+                num_views,
+                ref_img_idx,
+                image_pool,
+                rng,
+                max_interval=5,
+                video_prob=1.0,
+                fix_interval_prob=0.9,
+            )
+
+            views = []
+
+            for im_idx in imgs_idxs:
+                view_idx = image_pool[im_idx]
+                impath = self._get_impath(obj, instance, view_idx)
+                depthpath = self._get_depthpath(obj, instance, view_idx)
+
+                # load camera params
+                metadata_path = self._get_metadatapath(obj, instance, view_idx)
+                input_metadata = np.load(metadata_path)
+                camera_pose = input_metadata["camera_pose"].astype(np.float32)
+                intrinsics = input_metadata["camera_intrinsics"].astype(np.float32)
+
+                # load image and depth
+                rgb_image = imread_cv2(impath)
+                depthmap = self._read_depthmap(depthpath, input_metadata)
+
+                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                    rgb_image, depthmap, intrinsics, resolution, rng=rng, info=impath
+                )
+
+                views.append(
+                    dict(
+                        img=rgb_image,
+                        depthmap=depthmap,
+                        camera_pose=camera_pose,
+                        camera_intrinsics=intrinsics,
+                        dataset=self.dataset_label,
+                        label=osp.join(obj, instance),
+                        instance=osp.split(impath)[1],
+                        is_metric=self.is_metric,
+                        is_video=ordered_video,
+                        quantile=np.array(0.96, dtype=np.float32),
+                        img_mask=True,
+                        ray_mask=False,
+                        camera_only=True,
+                        depth_only=False,
+                        single_view=False,
+                        reset=False,
+                    )
+                )
+            # Accept a complete set of views unless they are all the same image; a
+            # single requested view is trivially "all the same" and used to loop forever.
+            same_image = all(v["instance"] == views[0]["instance"] for v in views)
+            if len(views) == num_views and (num_views == 1 or not same_image):
+                invalid_seq = False
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/dl3dv.py b/extern/CUT3R/src/dust3r/datasets/dl3dv.py
new file mode 100644
index 0000000000000000000000000000000000000000..2650d573123b86f10c99bb663ec399372808fe37
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/dl3dv.py
@@ -0,0 +1,166 @@
+import os.path as osp
+import os
+import sys
+import itertools
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+import cv2
+import numpy as np
+
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+
+
+class DL3DV_Multi(BaseMultiViewDataset):
+    def __init__(self, *args, split, ROOT, **kwargs):  # ROOT: dataset directory
+        self.ROOT = ROOT
+        self.video = True
+        self.max_interval = 20  # forwarded to get_seq_from_start_id in _get_views
+        self.is_metric = False
+        super().__init__(*args, **kwargs)  # NOTE(review): `split` is not forwarded - confirm intended
+        # _load_data stores the index on self and returns None, so loaded_data is None.
+        self.loaded_data = self._load_data()
+
+    def _load_data(self):
+        """Scan ``ROOT`` and build the flat frame index used by ``_get_views``.
+
+        Sets on the instance (nothing is returned):
+          * ``scenes``: kept ``<scene>/<subscene>`` relative paths
+          * ``images``: RGB file names of all kept scenes, concatenated
+          * ``sceneids``: for every entry of ``images``, its index in ``scenes``
+          * ``scene_img_list``: per kept scene, the global ids of its frames
+          * ``start_img_ids``: global ids that may be the first frame of a sample
+        """
+        root = self.ROOT
+        self.all_scenes = sorted(
+            [f for f in os.listdir(root) if os.path.isdir(osp.join(root, f))]
+        )
+
+        # keep only the non-empty sub-directories of every top-level scene
+        subscenes = []
+        for top in self.all_scenes:
+            for f in os.listdir(osp.join(root, top)):
+                sub_dir = osp.join(root, top, f)
+                if os.path.isdir(sub_dir) and len(os.listdir(sub_dir)) > 0:
+                    subscenes.append(osp.join(top, f))
+
+        # fewest frames a scene needs in order to be kept
+        cut_off = (
+            self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
+        )
+
+        scenes = []
+        sceneids = []
+        images = []
+        scene_img_list = []
+        start_img_ids = []
+        offset = 0  # global id of the next kept scene's first frame
+        for scene in subscenes:
+            scene_dir = osp.join(root, scene, "dense")
+            rgb_paths = sorted(
+                f for f in os.listdir(osp.join(scene_dir, "rgb")) if f.endswith(".png")
+            )
+            assert len(rgb_paths) > 0, f"{scene_dir} is empty."
+            num_imgs = len(rgb_paths)
+            if num_imgs < cut_off:
+                print(f"Skipping {scene}")
+                continue
+
+            img_ids = list(np.arange(num_imgs) + offset)
+            sceneids.extend([len(scenes)] * num_imgs)  # index `scene` gets in `scenes`
+            scenes.append(scene)
+            scene_img_list.append(img_ids)
+            images.extend(rgb_paths)
+            # a start frame must leave at least cut_off frames up to the scene's end
+            start_img_ids.extend(img_ids[: num_imgs - cut_off + 1])
+            offset += num_imgs
+
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.images = images
+        self.start_img_ids = start_img_ids
+        self.scene_img_list = scene_img_list
+
+    def __len__(self):  # number of frames that may start a sample
+        return len(self.start_img_ids)
+
+    def get_image_num(self):  # total number of indexed frames over all kept scenes
+        return len(self.images)
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        """Load the views of the sample that starts at ``start_img_ids[idx]``.
+
+        Depth is cleaned before cropping: non-finite and outlier pixels become 0,
+        sky pixels -1, and positive depths above their 98th percentile become 0.
+        """
+        start_id = self.start_img_ids[idx]
+        all_image_ids = self.scene_img_list[self.sceneids[start_id]]
+        # positions, inside the scene's id list, of the frames forming the sample
+        pos, ordered_video = self.get_seq_from_start_id(
+            num_views,
+            start_id,
+            all_image_ids,
+            rng,
+            max_interval=self.max_interval,
+            block_shuffle=25,
+        )
+
+        views = []
+        for view_idx in np.array(all_image_ids)[pos]:
+            scene_name = self.scenes[self.sceneids[view_idx]]
+            scene_dir = osp.join(self.ROOT, scene_name, "dense")
+            rgb_path = self.images[view_idx]
+            stem = rgb_path[:-4]  # file name without its last four characters
+
+            # the per-frame files sit in sibling folders of the scene directory
+            rgb_image = imread_cv2(
+                osp.join(scene_dir, "rgb", rgb_path), cv2.IMREAD_COLOR
+            )
+            depthmap = np.load(osp.join(scene_dir, "depth", stem + ".npy"))
+            depthmap = depthmap.astype(np.float32)
+            depthmap[~np.isfinite(depthmap)] = 0  # invalid
+            cam_file = np.load(osp.join(scene_dir, "cam", stem + ".npz"))
+            sky_img = cv2.imread(
+                osp.join(scene_dir, "sky_mask", rgb_path), cv2.IMREAD_UNCHANGED
+            )
+            sky_mask = sky_img >= 127
+            outlier_mask = cv2.imread(
+                osp.join(scene_dir, "outlier_mask", rgb_path), cv2.IMREAD_UNCHANGED
+            )
+            # masks: sky pixels -> -1, then outlier pixels -> 0
+            depthmap[sky_mask] = -1.0
+            depthmap[outlier_mask >= 127] = 0.0
+            depthmap = np.nan_to_num(depthmap, nan=0, posinf=0, neginf=0)
+
+            # zero out positive depths above their 98th percentile
+            positive = depthmap[depthmap > 0]
+            threshold = np.percentile(positive, 98) if positive.size > 0 else 0
+            depthmap[depthmap > threshold] = 0.0
+
+            intrinsics = cam_file["intrinsic"].astype(np.float32)
+            camera_pose = cam_file["pose"].astype(np.float32)
+            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
+            )
+
+            view = dict(
+                img=rgb_image,
+                depthmap=depthmap.astype(np.float32),
+                camera_pose=camera_pose.astype(np.float32),
+                camera_intrinsics=intrinsics.astype(np.float32),
+                dataset="dl3dv",
+                label=scene_name + "_" + rgb_path,
+                instance=osp.join(scene_dir, "rgb", rgb_path),
+                is_metric=self.is_metric,
+                is_video=ordered_video,
+                quantile=np.array(0.9, dtype=np.float32),
+                img_mask=True,
+                ray_mask=False,
+                camera_only=False,
+                depth_only=False,
+                single_view=False,
+                reset=False,
+            )
+            views.append(view)
+
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/dynamic_replica.py b/extern/CUT3R/src/dust3r/datasets/dynamic_replica.py
new file mode 100755
index 0000000000000000000000000000000000000000..1d816e58be6518e1274fa84fa8c6a7cae73741ca
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/dynamic_replica.py
@@ -0,0 +1,137 @@
+import os.path as osp
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from tqdm import tqdm
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+
+
+class DynamicReplica(BaseMultiViewDataset):
+    def __init__(self, *args, ROOT, **kwargs):  # ROOT: dataset directory, one sub-folder per split
+        self.ROOT = ROOT
+        self.video = True
+        self.is_metric = True
+        self.max_interval = 16  # forwarded to get_seq_from_start_id in _get_views
+        super().__init__(*args, **kwargs)
+        # _load_data fills the index attributes on self and returns None.
+        self.loaded_data = self._load_data(self.split)
+
+    def _load_data(self, split):
+        """Index every scene under ``ROOT/<split>`` into flat per-frame lists.
+
+        Scenes are visited in sorted order so that sample indices do not depend
+        on the arbitrary order returned by ``os.listdir``.
+        """
+        scene_names = sorted(os.listdir(os.path.join(self.ROOT, split)))
+        # fewest frames a scene needs in order to be kept
+        cut_off = (
+            self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
+        )
+        offset = 0  # global id of the next kept scene's first frame
+        scenes = []
+        sceneids = []
+        scene_img_list = []
+        images = []
+        start_img_ids = []
+        for scene in tqdm(scene_names):
+            rgb_dir = osp.join(self.ROOT, split, scene, "left", "rgb")
+            # frame names are ordered by their numeric value, not as text
+            basenames = sorted(
+                [f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".png")],
+                key=float,
+            )
+            num_imgs = len(basenames)
+            if num_imgs < cut_off:
+                print(f"Skipping {scene}")
+                continue
+
+            img_ids = list(np.arange(num_imgs) + offset)
+            # a start frame must leave at least cut_off frames up to the scene's end
+            start_img_ids.extend(img_ids[: num_imgs - cut_off + 1])
+            sceneids.extend([len(scenes)] * num_imgs)
+            images.extend(basenames)
+            scenes.append(scene)
+            scene_img_list.append(img_ids)
+            offset += num_imgs
+
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.images = images
+        self.start_img_ids = start_img_ids
+        self.scene_img_list = scene_img_list
+
+    def __len__(self):  # number of frames that may start a sample
+        return len(self.start_img_ids)
+
+    def get_image_num(self):  # total number of indexed frames over all kept scenes
+        return len(self.images)
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        """Load the views of the sample that starts at ``start_img_ids[idx]``.
+
+        Each view carries the RGB image, depth (non-finite values set to 0),
+        camera pose/intrinsics and the sampled image/ray masks.
+        """
+        start_id = self.start_img_ids[idx]
+        all_image_ids = self.scene_img_list[self.sceneids[start_id]]
+        # positions, inside the scene's id list, of the frames forming the sample
+        pos, ordered_video = self.get_seq_from_start_id(
+            num_views,
+            start_id,
+            all_image_ids,
+            rng,
+            max_interval=self.max_interval,
+            video_prob=1.0,
+            fix_interval_prob=1.0,
+        )
+
+        views = []
+        for v, view_idx in enumerate(np.array(all_image_ids)[pos]):
+            scene_name = self.scenes[self.sceneids[view_idx]]
+            scene_dir = osp.join(self.ROOT, self.split, scene_name, "left")
+            basename = self.images[view_idx]
+
+            # RGB, depth and camera files share the frame's base name
+            rgb_image = imread_cv2(osp.join(scene_dir, "rgb", basename + ".png"))
+            depthmap = np.load(osp.join(scene_dir, "depth", basename + ".npy"))
+            depthmap[~np.isfinite(depthmap)] = 0  # invalid
+            cam = np.load(osp.join(scene_dir, "cam", basename + ".npz"))
+            camera_pose = cam["pose"]
+            intrinsics = cam["intrinsics"]
+
+            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
+            )
+
+            # generate img mask and raymap mask
+            img_mask, ray_mask = self.get_img_and_ray_masks(
+                self.is_metric, v, rng, p=[0.85, 0.10, 0.05]
+            )
+
+            view = dict(
+                img=rgb_image,
+                depthmap=depthmap.astype(np.float32),
+                camera_pose=camera_pose.astype(np.float32),
+                camera_intrinsics=intrinsics.astype(np.float32),
+                dataset="dynamic_replica",
+                label=scene_name + "_" + basename,
+                instance=f"{str(idx)}_{str(view_idx)}",
+                is_metric=self.is_metric,
+                is_video=ordered_video,
+                quantile=np.array(1.0, dtype=np.float32),
+                img_mask=img_mask,
+                ray_mask=ray_mask,
+                camera_only=False,
+                depth_only=False,
+                single_view=False,
+                reset=False,
+            )
+            views.append(view)
+
+        assert len(views) == num_views
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/eden.py b/extern/CUT3R/src/dust3r/datasets/eden.py
new file mode 100644
index 0000000000000000000000000000000000000000..00af2fffc73535f436557929b1b0220737907b2b
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/eden.py
@@ -0,0 +1,94 @@
+import os.path as osp
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from tqdm import tqdm
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+
+
+class EDEN_Multi(BaseMultiViewDataset):
+    def __init__(self, *args, ROOT, **kwargs):  # ROOT: directory with one folder per scene
+        self.ROOT = ROOT
+        self.video = True
+        self.is_metric = True
+        super().__init__(*args, **kwargs)
+        self.loaded_data = self._load_data()  # _load_data returns None; it sets self.img_names
+
+    def _load_data(self):
+        """Collect ``(scene, basename)`` for every ``rgb/*.png`` under ``ROOT``."""
+        img_names = []
+        # sorted: os.listdir order is arbitrary, which made the index
+        # (and the seeded permutation in _get_views) machine-dependent
+        for scene in sorted(os.listdir(self.ROOT)):
+            rgb_dir = osp.join(self.ROOT, scene, "rgb")
+            basenames = sorted(
+                [f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".png")]
+            )
+            img_names.extend([(scene, basename) for basename in basenames])
+        self.img_names = img_names
+
+    def __len__(self):  # one sample per indexed (scene, image) pair
+        return len(self.img_names)
+
+    def get_image_num(self):  # same count as __len__: every image is indexed once
+        return len(self.img_names)
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        """Return ``num_views`` views from a seeded shuffle of all images, skipping unreadable ones."""
+        new_seed = rng.integers(0, 2**32) + idx
+        new_rng = np.random.default_rng(new_seed)
+        img_names = new_rng.permutation(self.img_names)
+        views = []
+        i = 0
+        while len(views) < num_views:
+            # Load RGB image
+            scene, img_name = img_names[i]
+            try:
+                rgb_image = imread_cv2(
+                    osp.join(self.ROOT, scene, "rgb", f"{img_name}.png")
+                )
+                depthmap = np.load(
+                    osp.join(self.ROOT, scene, "depth", f"{img_name}.npy")
+                )
+                depthmap = np.nan_to_num(depthmap, nan=0, posinf=0, neginf=0)
+
+                intrinsics = np.load(
+                    osp.join(self.ROOT, scene, "cam", f"{img_name}.npz")
+                )["intrinsics"]
+                # camera pose is not provided, placeholder
+                camera_pose = np.eye(4)
+            except Exception:  # best-effort skip, but let KeyboardInterrupt/SystemExit through
+                i += 1
+                continue
+
+            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=img_name
+            )
+
+            views.append(
+                dict(
+                    img=rgb_image,
+                    depthmap=depthmap.astype(np.float32),
+                    camera_pose=camera_pose.astype(np.float32),
+                    camera_intrinsics=intrinsics.astype(np.float32),
+                    dataset="EDEN",
+                    label=img_name,
+                    instance=osp.join(self.ROOT, scene, "rgb", f"{img_name}.png"),
+                    is_metric=self.is_metric,
+                    is_video=False,
+                    quantile=np.array(1.0, dtype=np.float32),
+                    img_mask=True,
+                    ray_mask=False,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=True,
+                    reset=True,
+                )
+            )
+            i += 1
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/hoi4d.py b/extern/CUT3R/src/dust3r/datasets/hoi4d.py
new file mode 100644
index 0000000000000000000000000000000000000000..b602df5d4dd1493d02377039379fd2ffb3b08ba2
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/hoi4d.py
@@ -0,0 +1,84 @@
+import os.path as osp
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+sys.path.append(osp.join(osp.dirname(__file__), '..','..'))
+from tqdm import tqdm
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+
+
+class HOI4D_Multi(BaseMultiViewDataset):
+    def __init__(self, *args, ROOT, **kwargs):  # ROOT: directory with one folder per scene
+        self.ROOT = ROOT
+        self.video = True
+        self.is_metric = True
+        super().__init__(*args, **kwargs)
+        self.loaded_data = self._load_data()  # _load_data returns None; it sets self.img_names
+
+    def _load_data(self):
+        """Collect ``(scene, basename)`` for every ``rgb/*.png`` under ``ROOT``."""
+        img_names = []
+        # sorted: os.listdir order is arbitrary, which made the index
+        # (and the seeded draw in _get_views) machine-dependent
+        for scene in sorted(os.listdir(self.ROOT)):
+            rgb_dir = osp.join(self.ROOT, scene, 'rgb')
+            basenames = sorted([f[:-4] for f in os.listdir(rgb_dir) if f.endswith('.png')])
+            img_names.extend([(scene, basename) for basename in basenames])
+        self.img_names = img_names
+         
+    def __len__(self):  # one sample per indexed (scene, image) pair
+        return len(self.img_names)
+
+    def get_image_num(self):  # same count as __len__: every image is indexed once
+        return len(self.img_names)
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        """Draw ``num_views`` distinct images; redraw the whole set if any fails to load."""
+        new_seed = rng.integers(0, 2**32) + idx
+        new_rng = np.random.default_rng(new_seed)
+        invalid_seq = True
+        while invalid_seq:
+            img_names = new_rng.choice(self.img_names, num_views, replace=False)
+            views = []
+            for v, img_name in enumerate(img_names):
+                # Load RGB image
+                scene, img_name = img_name
+                try:
+                    rgb_image = imread_cv2(osp.join(self.ROOT, scene, "rgb", f"{img_name}.png"))
+                    depthmap = np.load(osp.join(self.ROOT, scene, "depth", f"{img_name}.npy"))
+                    depthmap = np.nan_to_num(depthmap, nan=0, posinf=0, neginf=0)
+
+                    intrinsics = np.load(osp.join(self.ROOT, scene, "cam", f"{img_name}.npz"))["intrinsics"]
+                except Exception:  # best-effort, but let KeyboardInterrupt/SystemExit through
+                    print(f"Error loading {scene} {img_name}, skipping")
+                    break
+                # camera pose is not provided, placeholder
+                camera_pose = np.eye(4)
+
+                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                    rgb_image, depthmap, intrinsics, resolution, rng=rng, info=img_name)
+
+                views.append(dict(
+                    img=rgb_image,
+                    depthmap=depthmap.astype(np.float32),
+                    camera_pose=camera_pose.astype(np.float32),
+                    camera_intrinsics=intrinsics.astype(np.float32),
+                    dataset='HOI4D',
+                    label=img_name,
+                    instance=osp.join(self.ROOT, scene, "rgb", f"{img_name}.png"),
+                    is_metric=self.is_metric,
+                    is_video=False,
+                    quantile=np.array(0.99, dtype=np.float32),
+                    img_mask=True,
+                    ray_mask=False,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=True,
+                    reset=True,
+                ))
+            if len(views) == num_views:
+                invalid_seq = False
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/hypersim.py b/extern/CUT3R/src/dust3r/datasets/hypersim.py
new file mode 100755
index 0000000000000000000000000000000000000000..c194df6db72525f2f164297dd4198a27085ce95c
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/hypersim.py
@@ -0,0 +1,148 @@
+import os.path as osp
+import os
+import sys
+import itertools
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+import cv2
+import numpy as np
+
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+
+
+class HyperSim_Multi(BaseMultiViewDataset):
+    """Multi-view sampler over pre-processed HyperSim sub-scenes.
+
+    Frames are the ``.png`` files found in ``ROOT/{scene}/{subscene}/``; the
+    depth and camera file of a frame are located by replacing ``rgb.png`` in
+    its file name with ``depth.npy`` and ``cam.npz``.
+    """
+
+    def __init__(self, *args, split, ROOT, **kwargs):
+        # ``split`` has to be passed by keyword, but it is neither stored nor
+        # forwarded to the base class.
+        self.ROOT = ROOT
+        self.video = True
+        self.is_metric = True
+        self.max_interval = 4
+        super().__init__(*args, **kwargs)
+
+        # _load_data() only fills attributes, so this attribute holds None.
+        self.loaded_data = self._load_data()
+
+    def _load_data(self):
+        """Scan ``ROOT`` and build the flat frame and sequence-start index."""
+        root = self.ROOT
+        self.all_scenes = sorted(
+            name for name in os.listdir(root) if os.path.isdir(osp.join(root, name))
+        )
+
+        # Gather "{scene}/{subscene}" for every non-empty sub-directory.
+        subscenes = []
+        for scene in self.all_scenes:
+            scene_root = osp.join(root, scene)
+            for sub in os.listdir(scene_root):
+                sub_dir = osp.join(scene_root, sub)
+                if os.path.isdir(sub_dir) and len(os.listdir(sub_dir)) > 0:
+                    subscenes.append(osp.join(scene, sub))
+
+        scenes, sceneids, images = [], [], []
+        start_img_ids, scene_img_list = [], []
+        offset = 0  # global id of the first frame of the current sub-scene
+        for scene in subscenes:
+            scene_dir = osp.join(root, scene)
+            rgb_paths = sorted(f for f in os.listdir(scene_dir) if f.endswith(".png"))
+            assert len(rgb_paths) > 0, f"{scene_dir} is empty."
+            num_imgs = len(rgb_paths)
+            # Fewest frames a sub-scene needs in order to be kept.
+            if self.allow_repeat:
+                cut_off = max(self.num_views // 3, 3)
+            else:
+                cut_off = self.num_views
+            if num_imgs < cut_off:
+                print(f"Skipping {scene}")
+                continue
+
+            img_ids = list(np.arange(num_imgs) + offset)
+            # len(scenes) is the position this sub-scene gets in self.scenes.
+            sceneids.extend([len(scenes)] * num_imgs)
+            scenes.append(scene)
+            scene_img_list.append(img_ids)
+            images.extend(rgb_paths)
+            # The last cut_off - 1 frames cannot start a sequence.
+            start_img_ids.extend(img_ids[: num_imgs - cut_off + 1])
+            offset += num_imgs
+
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.images = images
+        self.scene_img_list = scene_img_list
+        self.start_img_ids = start_img_ids
+
+    def __len__(self):
+        # Each start frame is exposed ten times; _get_views divides by ten.
+        return 10 * len(self.start_img_ids)
+
+    def get_image_num(self):
+        """Return the number of indexed frames."""
+        return len(self.images)
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        """Load ``num_views`` frames of one sub-scene as a list of view dicts."""
+        idx = idx // 10
+        start_id = self.start_img_ids[idx]
+        all_image_ids = self.scene_img_list[self.sceneids[start_id]]
+        pos, ordered_video = self.get_seq_from_start_id(
+            num_views,
+            start_id,
+            all_image_ids,
+            rng,
+            max_interval=self.max_interval,
+            block_shuffle=16,
+        )
+
+        views = []
+        for v, view_idx in enumerate(np.array(all_image_ids)[pos]):
+            scene_name = self.scenes[self.sceneids[view_idx]]
+            scene_dir = osp.join(self.ROOT, scene_name)
+            rgb_path = self.images[view_idx]
+            depth_name = rgb_path.replace("rgb.png", "depth.npy")
+            cam_name = rgb_path.replace("rgb.png", "cam.npz")
+
+            rgb_image = imread_cv2(osp.join(scene_dir, rgb_path), cv2.IMREAD_COLOR)
+            depthmap = np.load(osp.join(scene_dir, depth_name)).astype(np.float32)
+            depthmap[~np.isfinite(depthmap)] = 0  # invalid
+            cam_file = np.load(osp.join(scene_dir, cam_name))
+            intrinsics = cam_file["intrinsics"].astype(np.float32)
+            camera_pose = cam_file["pose"].astype(np.float32)
+
+            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
+            )
+            # generate img mask and raymap mask
+            img_mask, ray_mask = self.get_img_and_ray_masks(
+                self.is_metric, v, rng, p=[0.75, 0.2, 0.05]
+            )
+
+            view = {
+                "img": rgb_image,
+                "depthmap": depthmap.astype(np.float32),
+                "camera_pose": camera_pose.astype(np.float32),
+                "camera_intrinsics": intrinsics.astype(np.float32),
+                "dataset": "hypersim",
+                "label": scene_name + "_" + rgb_path,
+                "instance": f"{str(idx)}_{str(view_idx)}",
+                "is_metric": self.is_metric,
+                "is_video": ordered_video,
+                "quantile": np.array(1.0, dtype=np.float32),
+                "img_mask": img_mask,
+                "ray_mask": ray_mask,
+                "camera_only": False,
+                "depth_only": False,
+                "single_view": False,
+                "reset": False,
+            }
+            views.append(view)
+        assert len(views) == num_views
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/irs.py b/extern/CUT3R/src/dust3r/datasets/irs.py
new file mode 100644
index 0000000000000000000000000000000000000000..52baa76d6f6a952dc5fa69aeab6b45239cc6b549
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/irs.py
@@ -0,0 +1,91 @@
+import os.path as osp
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from tqdm import tqdm
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+
+
+class IRS(BaseMultiViewDataset):
+    """Independently drawn images (RGB, depth, intrinsics) from the IRS dataset.
+
+    Files are read from ``ROOT/{scene}/{rgb,depth,cam}/{name}.{png,npy,npz}``.
+    """
+
+    def __init__(self, *args, ROOT, **kwargs):
+        self.ROOT = ROOT
+        self.video = False
+        self.is_metric = True
+        super().__init__(*args, **kwargs)
+        # _load_data() returns None; the index itself is self.img_names.
+        self.loaded_data = self._load_data()
+
+    def _load_data(self):
+        """Build ``self.img_names``: one ``(scene, basename)`` pair per PNG."""
+        pairs = []
+        for scene in os.listdir(self.ROOT):
+            rgb_dir = osp.join(self.ROOT, scene, "rgb")
+            stems = sorted(f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".png"))
+            pairs.extend((scene, stem) for stem in stems)
+        self.img_names = pairs
+
+    def __len__(self):
+        return len(self.img_names)
+
+    def get_image_num(self):
+        """Return the number of indexed images."""
+        return len(self.img_names)
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        """Draw ``num_views`` images from the index, without replacement.
+
+        The draw uses a generator seeded from ``rng`` plus ``idx``; ``idx`` does
+        not otherwise select an image.  No camera pose is available, so every
+        view carries an identity pose.
+        """
+        new_seed = rng.integers(0, 2**32) + idx
+        picker = np.random.default_rng(new_seed)
+        chosen = picker.choice(self.img_names, num_views, replace=False)
+
+        views = []
+        for scene, img_name in chosen:
+            sample_dir = osp.join(self.ROOT, scene)
+            rgb_image = imread_cv2(osp.join(sample_dir, "rgb", f"{img_name}.png"))
+            depthmap = np.load(osp.join(sample_dir, "depth", f"{img_name}.npy"))
+            depthmap[depthmap > 200] = 0.0  # zero out depths above 200
+            depthmap = np.nan_to_num(depthmap, nan=0, posinf=0, neginf=0)
+            cam = np.load(osp.join(sample_dir, "cam", f"{img_name}.npz"))
+            intrinsics = cam["intrinsics"]
+            # camera pose is not provided, placeholder
+            camera_pose = np.eye(4)
+
+            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=img_name
+            )
+
+            view = {
+                "img": rgb_image,
+                "depthmap": depthmap.astype(np.float32),
+                "camera_pose": camera_pose.astype(np.float32),
+                "camera_intrinsics": intrinsics.astype(np.float32),
+                "dataset": "irs",
+                "label": img_name,
+                "instance": f"{str(idx)}_{img_name}",
+                "is_metric": self.is_metric,
+                "is_video": False,
+                "quantile": np.array(1.0, dtype=np.float32),
+                "img_mask": True,
+                "ray_mask": False,
+                "camera_only": False,
+                "depth_only": False,
+                "single_view": True,
+                "reset": True,
+            }
+            views.append(view)
+        assert len(views) == num_views
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/mapfree.py b/extern/CUT3R/src/dust3r/datasets/mapfree.py
new file mode 100644
index 0000000000000000000000000000000000000000..58eef2f61642deeca4e7accb84429f3d471a5bd9
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/mapfree.py
@@ -0,0 +1,267 @@
+import os.path as osp
+import numpy as np
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+import pickle
+import h5py
+from tqdm import tqdm
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+
+
+class MapFree_Multi(BaseMultiViewDataset):
+    """Multi-view dataset over pre-processed MapFree scenes.
+
+    Every scene directory under ``ROOT`` holds ``dense{k}`` video sequences
+    (``rgb/frame_{n}.jpg`` with matching ``depth``, ``cam`` and ``sky_mask``
+    files) and a ``metadata.pkl`` mapping a reference frame to other frames
+    with a score.  One sample is one such image collection; its views come
+    either from the reference frame's video or from the collection itself.
+    """
+
+    def __init__(self, ROOT, *args, **kwargs):
+        self.ROOT = ROOT
+        self.video = True
+        self.is_metric = True
+        self.max_interval = 30
+        super().__init__(*args, **kwargs)
+
+        self._load_data()
+
+    def imgid2path(self, img_id, scene):
+        """Return the RGB path of ``img_id = (sequence id, frame id)``."""
+        first_seq_id, first_frame_id = img_id
+        return os.path.join(
+            self.ROOT,
+            scene,
+            f"dense{first_seq_id}",
+            "rgb",
+            f"frame_{first_frame_id:05d}.jpg",
+        )
+
+    def path2imgid(self, subscene, filename):
+        """Parse ``("dense{k}", "frame_{n}.jpg")`` into ``[k, n]``."""
+        first_seq_id = int(subscene[5:])  # drop the "dense" prefix
+        first_frame_id = int(filename[6:-4])  # drop "frame_" and ".jpg"
+        return [first_seq_id, first_frame_id]
+
+    def _load_data(self):
+        """Load the scene index from the HDF5 cache, or build and cache it.
+
+        Sets ``scenes``, ``sceneids``, ``scope``, ``video_flags``, ``groups``,
+        ``id_ranges`` and ``images``.  ``groups`` stacks all image collections
+        as rows ``(sequence id, frame id, score)``; for collection ``i``,
+        ``scope[i]`` is its first row in ``groups`` and ``id_ranges[i]`` the
+        ``(start, end)`` range in ``images`` of its reference frame's video.
+        """
+        cache_file = f"{self.ROOT}/cached_metadata_50_col_only.h5"
+        if os.path.exists(cache_file):
+            print(f"Loading cached metadata from {cache_file}")
+            with h5py.File(cache_file, "r") as hf:
+                self.scenes = [name.decode("utf-8") for name in hf["scenes"][:]]
+                self.sceneids = hf["sceneids"][:]
+                self.scope = hf["scope"][:]
+                self.video_flags = hf["video_flags"][:]
+                self.groups = hf["groups"][:]
+                self.id_ranges = hf["id_ranges"][:]
+                self.images = hf["images"][:]
+            return
+
+        root = self.ROOT
+        scene_dirs = sorted(
+            d for d in os.listdir(root) if os.path.isdir(os.path.join(root, d))
+        )
+        scenes, sceneids, groups, scope = [], [], [], []
+        images, id_ranges, is_video = [], [], []
+        start = 0  # row in the stacked groups where the next collection begins
+        offset = 0  # position in images of the next frame to be indexed
+
+        for j, scene in enumerate(tqdm(scene_dirs)):
+            scenes.append(scene)
+            scene_dir = os.path.join(root, scene)
+
+            # video sequences
+            subscenes = sorted(
+                d for d in os.listdir(scene_dir) if d.startswith("dense")
+            )
+            id_range_subscenes = []
+            for subscene in subscenes:
+                rgb_dir = os.path.join(scene_dir, subscene, "rgb")
+                rgb_paths = sorted(
+                    d for d in os.listdir(rgb_dir) if d.endswith(".jpg")
+                )
+                assert (
+                    len(rgb_paths) > 0
+                ), f"{os.path.join(self.ROOT, scene, subscene)} is empty."
+                num_imgs = len(rgb_paths)
+                images.extend(self.path2imgid(subscene, p) for p in rgb_paths)
+                id_range_subscenes.append((offset, offset + num_imgs))
+                offset += num_imgs
+
+            # image collections
+            # NOTE: unpickling can run arbitrary code, so metadata.pkl must
+            # come from a trusted source.  ``with`` closes the file right away.
+            with open(os.path.join(scene_dir, "metadata.pkl"), "rb") as f:
+                metadata = pickle.load(f)
+            img_groups = []
+            for ref_img, other_imgs in metadata.items():
+                # Collections too small to supply num_views images are dropped.
+                if len(other_imgs) + 1 < self.num_views:
+                    continue
+                # The reference frame goes first, with a score of 1.
+                group = [(*ref_img, 1)]
+                group.extend((*other[0], other[1]) for other in other_imgs)
+                img_groups.append(np.array(group))
+                # ref_img[0] is the sequence id, used here as a list position.
+                id_ranges.append(id_range_subscenes[ref_img[0]])
+                scope.append(start)
+                start += len(group)
+
+            num_groups = len(img_groups)
+            sceneids.extend([j] * num_groups)
+            groups.extend(img_groups)
+            is_video.extend([False] * num_groups)
+
+        self.scenes = np.array(scenes)
+        self.sceneids = np.array(sceneids)
+        self.scope = np.array(scope)
+        self.video_flags = np.array(is_video)
+        self.groups = np.concatenate(groups, 0)
+        self.id_ranges = np.array(id_ranges)
+        self.images = np.array(images)
+
+        array_names = (
+            "sceneids", "scope", "video_flags", "groups", "id_ranges", "images"
+        )
+        with h5py.File(cache_file, "w") as h5f:
+            h5f.create_dataset(
+                "scenes",
+                data=self.scenes.astype(object),
+                dtype=h5py.string_dtype(encoding="utf-8"),
+                compression="lzf",
+                chunks=True,
+            )
+            for name in array_names:
+                h5f.create_dataset(
+                    name, data=getattr(self, name), compression="lzf", chunks=True
+                )
+
+    def __len__(self):
+        return len(self.scope)
+
+    def get_image_num(self):
+        """Return the number of indexed video frames."""
+        return len(self.images)
+
+    def get_stats(self):
+        return f"{len(self)} groups of views"
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        """Build ``num_views`` views for image collection ``idx``.
+
+        With probability 0.6 the views are frames of the reference frame's
+        video, chosen by ``get_seq_from_start_id``; otherwise they are drawn
+        from the collection with the stored scores as sampling weights.
+        """
+        scene = self.scenes[self.sceneids[idx]]
+        if rng.random() < 0.6:
+            ids = np.arange(self.id_ranges[idx][0], self.id_ranges[idx][1])
+            cut_off = num_views if not self.allow_repeat else max(num_views // 3, 3)
+            start_ids = ids[: len(ids) - cut_off + 1]
+            start_id = rng.choice(start_ids)
+            pos, ordered_video = self.get_seq_from_start_id(
+                num_views,
+                start_id,
+                ids.tolist(),
+                rng,
+                max_interval=self.max_interval,
+                video_prob=0.8,
+                fix_interval_prob=0.5,
+                block_shuffle=16,
+            )
+            image_idxs = self.images[ids[pos]]
+        else:
+            ordered_video = False
+            # Rows of this collection; a ``None`` end runs to the last row.
+            seq_start_index = self.scope[idx]
+            seq_end_index = self.scope[idx + 1] if idx < len(self.scope) - 1 else None
+            collection = self.groups[seq_start_index:seq_end_index]
+            image_idxs, overlap_scores = collection[:, :2], collection[:, 2]
+            # Sample with replacement when repeats are allowed or when fewer
+            # positively scored images exist than views are requested.
+            num_positive = len(overlap_scores[overlap_scores > 0])
+            replace = bool(self.allow_repeat or num_positive < num_views)
+            image_idxs = rng.choice(
+                image_idxs,
+                num_views,
+                replace=replace,
+                p=overlap_scores / np.sum(overlap_scores),
+            )
+            image_idxs = image_idxs.astype(np.int64)
+
+        views = []
+        for v, view_idx in enumerate(image_idxs):
+            img_path = self.imgid2path(view_idx, scene)
+            # Sibling files are addressed by path component.  Replacing the
+            # substring "rgb" in the whole path would also rewrite a ROOT or
+            # scene name that happens to contain it.
+            rgb_dir, file_name = os.path.split(img_path)
+            seq_dir = os.path.dirname(rgb_dir)
+            stem = os.path.splitext(file_name)[0]
+            depth_path = os.path.join(seq_dir, "depth", stem + ".npy")
+            cam_path = os.path.join(seq_dir, "cam", stem + ".npz")
+            sky_mask_path = os.path.join(seq_dir, "sky_mask", file_name)
+
+            image = imread_cv2(img_path)
+            depthmap = np.load(depth_path)
+            camera_params = np.load(cam_path)
+            sky_mask = cv2.imread(sky_mask_path, cv2.IMREAD_UNCHANGED) >= 127
+
+            intrinsics = camera_params["intrinsic"].astype(np.float32)
+            camera_pose = camera_params["pose"].astype(np.float32)
+
+            # Sky-mask pixels get -1; depths above 400, non-finite values and
+            # anything above the 98th percentile of positive depths become 0.
+            depthmap[sky_mask] = -1.0
+            depthmap[depthmap > 400.0] = 0.0
+            depthmap = np.nan_to_num(depthmap, nan=0, posinf=0, neginf=0)
+            positive = depthmap[depthmap > 0]
+            threshold = np.percentile(positive, 98) if positive.size > 0 else 0
+            depthmap[depthmap > threshold] = 0.0
+
+            image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                image, depthmap, intrinsics, resolution, rng, info=(img_path)
+            )
+            # generate img mask and raymap mask
+            img_mask, ray_mask = self.get_img_and_ray_masks(
+                self.is_metric, v, rng, p=[0.75, 0.2, 0.05]
+            )
+
+            views.append(
+                dict(
+                    img=image,
+                    depthmap=depthmap,
+                    camera_pose=camera_pose,  # cam2world
+                    camera_intrinsics=intrinsics,
+                    dataset="MapFree",
+                    label=img_path,
+                    is_metric=self.is_metric,
+                    instance=img_path,
+                    is_video=ordered_video,
+                    quantile=np.array(0.96, dtype=np.float32),
+                    img_mask=img_mask,
+                    ray_mask=ray_mask,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=False,
+                    reset=False,
+                )
+            )
+        assert len(views) == num_views
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/megadepth.py b/extern/CUT3R/src/dust3r/datasets/megadepth.py
new file mode 100755
index 0000000000000000000000000000000000000000..321500f9260513f81c009fa8155be0612a5a4ba7
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/megadepth.py
@@ -0,0 +1,124 @@
+import os.path as osp
+import numpy as np
+import itertools
+import os
+import sys
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+
+
+class MegaDepth_Multi(BaseMultiViewDataset):
+    """Multi-view samples drawn from pre-computed MegaDepth image sets.
+
+    ``ROOT/megadepth_sets_64.npz`` supplies ``scenes``, ``images`` and ``sets``;
+    a row of ``sets`` is a scene index followed by indices into ``images``.
+    """
+
+    def __init__(self, *args, ROOT, **kwargs):
+        self.ROOT = ROOT
+        super().__init__(*args, **kwargs)
+        self._load_data(self.split)
+        self.is_metric = False
+        # Scenes starting with 0015 or 0022 make up "val"; "train" is the rest.
+        if self.split is None:
+            pass
+        elif self.split == "train":
+            self.select_scene(("0015", "0022"), opposite=True)
+        elif self.split == "val":
+            self.select_scene(("0015", "0022"))
+        else:
+            raise ValueError(f"bad {self.split=}")
+
+    def _load_data(self, split):
+        """Read the set file; ``split`` is accepted but not used here."""
+        set_file = osp.join(self.ROOT, "megadepth_sets_64.npz")
+        with np.load(set_file, allow_pickle=True) as data:
+            self.all_scenes = data["scenes"]
+            self.all_images = data["images"]
+            self.sets = data["sets"]
+
+    def __len__(self):
+        return len(self.sets)
+
+    def get_image_num(self):
+        """Return the number of images listed in the set file."""
+        return len(self.all_images)
+
+    def get_stats(self):
+        return f"{len(self)} groups from {len(self.all_scenes)} scenes"
+
+    def select_scene(self, scene, *instances, opposite=False):
+        """Keep only the sets whose scene name starts with ``scene``.
+
+        ``scene`` is one prefix or an iterable of prefixes; ``opposite=True``
+        keeps the complement instead.
+
+        Raises:
+            NotImplementedError: if any ``instances`` are passed.
+        """
+        scenes = (scene,) if isinstance(scene, str) else tuple(scene)
+        scene_id = [s.startswith(scenes) for s in self.all_scenes]
+        assert any(scene_id), "no scene found"
+        # np.isin replaces np.in1d, which is deprecated since NumPy 2.0; for
+        # the 1-D column tested here both return the same mask.
+        valid = np.isin(self.sets[:, 0], np.nonzero(scene_id)[0])
+        if instances:
+            raise NotImplementedError("selecting instances not implemented")
+        if opposite:
+            valid = ~valid
+        assert valid.any()
+        self.sets = self.sets[valid]
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        """Load ``num_views`` images sampled from set ``idx``.
+
+        Images are drawn with replacement only when ``allow_repeat`` is set.
+
+        Raises:
+            OSError: if an image, depth map or camera file cannot be loaded.
+        """
+        scene_id = self.sets[idx][0]
+        image_idxs = self.sets[idx][1:65]  # columns 1..64: image indices
+        replace = bool(self.allow_repeat)
+        image_idxs = rng.choice(image_idxs, num_views, replace=replace)
+        scene, subscene = self.all_scenes[scene_id].split()
+        seq_path = osp.join(self.ROOT, scene, subscene)
+        views = []
+        for im_id in image_idxs:
+            img = self.all_images[im_id]
+            try:
+                image = imread_cv2(osp.join(seq_path, img + ".jpg"))
+                depthmap = imread_cv2(osp.join(seq_path, img + ".exr"))
+                camera_params = np.load(osp.join(seq_path, img + ".npz"))
+            except Exception as e:
+                # Chain the cause so that the original traceback is kept.
+                raise OSError(f"cannot load {img}, got exception {e}") from e
+            intrinsics = np.float32(camera_params["intrinsics"])
+            camera_pose = np.float32(camera_params["cam2world"])
+            image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                image, depthmap, intrinsics, resolution, rng, info=(seq_path, img)
+            )
+            views.append(
+                dict(
+                    img=image,
+                    depthmap=depthmap,
+                    camera_pose=camera_pose,  # cam2world
+                    camera_intrinsics=intrinsics,
+                    dataset="MegaDepth",
+                    label=osp.relpath(seq_path, self.ROOT),
+                    is_metric=self.is_metric,
+                    instance=img,
+                    is_video=False,
+                    quantile=np.array(0.96, dtype=np.float32),
+                    img_mask=True,
+                    ray_mask=False,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=False,
+                    reset=False,
+                )
+            )
+        assert len(views) == num_views
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/mp3d.py b/extern/CUT3R/src/dust3r/datasets/mp3d.py
new file mode 100755
index 0000000000000000000000000000000000000000..f88d39e1c56907c4cad9e105ad8ffa505aa362d1
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/mp3d.py
@@ -0,0 +1,152 @@
+import os.path as osp
+import os
+import sys
+import itertools
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+import cv2
+import numpy as np
+
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+
+
+class MP3D_Multi(BaseMultiViewDataset):
+    """Multi-view samples from pre-processed MP3D scenes.
+
+    Each ``ROOT/{scene}/`` holds ``rgb/*.png``, ``depth/*.npy``, ``cam/*.npz``
+    and an ``overlap.npy`` whose rows are read as
+    ``(image index, other image index, overlap score)``.
+    """
+
+    def __init__(self, *args, split, ROOT, **kwargs):
+        # ``split`` has to be passed by keyword, but it is neither stored nor
+        # forwarded to the base class.
+        self.ROOT = ROOT
+        self.video = False
+        self.is_metric = True
+        super().__init__(*args, **kwargs)
+
+        # _load_data() only fills attributes, so this attribute holds None.
+        self.loaded_data = self._load_data()
+
+    def _load_data(self):
+        """Index the images and load the overlap table of every scene."""
+        scenes = os.listdir(self.ROOT)
+        offset = 0  # global index of the first image of the current scene
+        overlaps = {scene: [] for scene in scenes}
+        scene_img_list = {scene: [] for scene in scenes}
+        images = []
+
+        for scene in scenes:
+            scene_dir = osp.join(self.ROOT, scene)
+            rgb_dir = osp.join(scene_dir, "rgb")
+            basenames = sorted(
+                [f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".png")]
+            )
+            overlaps[scene] = np.load(osp.join(scene_dir, "overlap.npy"))
+            num_imgs = len(basenames)
+
+            # (scene, index within the scene, file stem) per image
+            images.extend(
+                [(scene, i, basename) for i, basename in enumerate(basenames)]
+            )
+            # maps an index within the scene to the global index in images
+            scene_img_list[scene] = np.arange(num_imgs) + offset
+            offset += num_imgs
+
+        self.scenes = scenes
+        self.scene_img_list = scene_img_list
+        self.images = images
+        self.overlaps = overlaps
+
+    def __len__(self):
+        return len(self.images)
+
+    def get_image_num(self):
+        """Return the number of indexed images."""
+        return len(self.images)
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        """Load a reference image and ``num_views - 1`` overlapping images.
+
+        If image ``idx`` has too few usable overlaps, another reference image
+        is drawn at random until one qualifies.  NOTE: this never terminates
+        when no image in the dataset qualifies.
+        """
+        num_unique = num_views if not self.allow_repeat else max(num_views // 3, 3)
+        # Run the body at least once: with a plain ``while`` on the overlap
+        # count, ``num_views == 1`` skipped the loop and left ``scene``,
+        # ``img_idx`` and ``overlap_sel`` unbound.
+        while True:
+            scene, img_idx, _ = self.images[idx]
+            overlap = self.overlaps[scene]
+            overlap_sel = overlap[overlap[:, 0] == img_idx]
+            # keep partners with a score strictly between 0.01 and 1
+            scores = overlap_sel[:, 2]
+            overlap_sel = overlap_sel[(scores > 0.01) & (scores < 1)]
+            if len(overlap_sel) >= num_unique - 1:
+                break
+            idx = rng.choice(len(self.images))
+
+        ref_id = self.scene_img_list[scene][img_idx]
+        if num_views > 1:
+            ids = self.scene_img_list[scene][overlap_sel[:, 1].astype(np.int64)]
+            image_idxs = rng.choice(
+                ids,
+                num_views - 1,
+                replace=bool(self.allow_repeat),
+                p=overlap_sel[:, 2] / np.sum(overlap_sel[:, 2]),
+            )
+            image_idxs = np.concatenate([[ref_id], image_idxs])
+        else:
+            # Nothing to sample, and there may be no overlaps to weight by.
+            image_idxs = np.array([ref_id])
+
+        ordered_video = False
+        views = []
+        for v, view_idx in enumerate(image_idxs):
+            scene, _, basename = self.images[view_idx]
+            scene_dir = osp.join(self.ROOT, scene)
+            rgb_path = osp.join(scene_dir, "rgb", basename + ".png")
+            depth_path = osp.join(scene_dir, "depth", basename + ".npy")
+            cam_path = osp.join(scene_dir, "cam", basename + ".npz")
+
+            rgb_image = imread_cv2(rgb_path, cv2.IMREAD_COLOR)
+            depthmap = np.load(depth_path).astype(np.float32)
+            depthmap[~np.isfinite(depthmap)] = 0  # invalid
+            cam_file = np.load(cam_path)
+            intrinsics = cam_file["intrinsics"]
+            camera_pose = cam_file["pose"]
+
+            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
+            )
+
+            # generate img mask and raymap mask
+            img_mask, ray_mask = self.get_img_and_ray_masks(
+                self.is_metric, v, rng, p=[0.85, 0.1, 0.05]
+            )
+
+            views.append(
+                dict(
+                    img=rgb_image,
+                    depthmap=depthmap.astype(np.float32),
+                    camera_pose=camera_pose.astype(np.float32),
+                    camera_intrinsics=intrinsics.astype(np.float32),
+                    dataset="mp3d",
+                    label=scene + "_" + rgb_path,
+                    instance=f"{str(idx)}_{str(view_idx)}",
+                    is_metric=self.is_metric,
+                    is_video=ordered_video,
+                    quantile=np.array(0.99, dtype=np.float32),
+                    img_mask=img_mask,
+                    ray_mask=ray_mask,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=False,
+                    reset=False,
+                )
+            )
+        assert len(views) == num_views
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/mvimgnet.py b/extern/CUT3R/src/dust3r/datasets/mvimgnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..9563f7f5dcd6120b460486b46415ad0e57c214c8
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/mvimgnet.py
@@ -0,0 +1,169 @@
+import os.path as osp
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from tqdm import tqdm
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+
+
+class MVImgNet_Multi(BaseMultiViewDataset):
+    """Video-style multi-view samples from pre-processed MVImgNet scenes.
+
+    Each ``ROOT/{scene}/`` holds ``rgb/*.jpg`` frames and ``cam/*.npz`` files.
+    No depth is loaded: every view gets an all-ones depth map and is flagged
+    ``camera_only``.
+    """
+
+    def __init__(self, *args, ROOT, **kwargs):
+        self.ROOT = ROOT
+        self.video = True
+        self.is_metric = False
+        self.max_interval = 32
+        super().__init__(*args, **kwargs)
+
+        # _load_data() only fills attributes, so this attribute holds None.
+        self.loaded_data = self._load_data()
+
+    def _load_data(self):
+        """Index the frames of every scene that has enough of them."""
+        self.scenes = os.listdir(self.ROOT)
+
+        offset = 0  # global id of the first frame of the current scene
+        scenes = []
+        sceneids = []
+        scene_img_list = []
+        images = []
+        start_img_ids = []
+
+        j = 0  # position of the current scene among the kept scenes
+        for scene in tqdm(self.scenes):
+            scene_dir = osp.join(self.ROOT, scene)
+            rgb_dir = osp.join(scene_dir, "rgb")
+            basenames = sorted(
+                [f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".jpg")]
+            )
+
+            num_imgs = len(basenames)
+            cut_off = (
+                self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
+            )
+
+            if num_imgs < cut_off:
+                print(f"Skipping {scene}")
+                continue
+
+            img_ids = list(np.arange(num_imgs) + offset)
+            # The last cut_off - 1 frames cannot start a sequence.
+            start_img_ids_ = img_ids[: num_imgs - cut_off + 1]
+
+            start_img_ids.extend([(scene, img_id) for img_id in start_img_ids_])
+            sceneids.extend([j] * num_imgs)
+            images.extend(basenames)
+            scenes.append(scene)
+            scene_img_list.append(img_ids)
+
+            # offset groups
+            offset += num_imgs
+            j += 1
+
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.images = images
+        self.start_img_ids = start_img_ids
+        self.scene_img_list = scene_img_list
+
+        # Scenes are flagged here once one of their files fails to load.
+        self.invalid_scenes = {scene: False for scene in self.scenes}
+
+    def __len__(self):
+        return len(self.start_img_ids)
+
+    def get_image_num(self):
+        """Return the number of indexed frames."""
+        return len(self.images)
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        """Load ``num_views`` frames of one scene, retrying on load failures.
+
+        A scene whose files cannot be loaded is marked invalid and another
+        start frame is drawn at random.  NOTE: this never terminates once
+        every scene has been marked invalid.
+        """
+        invalid_seq = True
+        scene, start_id = self.start_img_ids[idx]
+
+        while invalid_seq:
+            while self.invalid_scenes[scene]:
+                idx = rng.integers(low=0, high=len(self.start_img_ids))
+                scene, start_id = self.start_img_ids[idx]
+
+            all_image_ids = self.scene_img_list[self.sceneids[start_id]]
+            pos, ordered_video = self.get_seq_from_start_id(
+                num_views, start_id, all_image_ids, rng, max_interval=self.max_interval
+            )
+            image_idxs = np.array(all_image_ids)[pos]
+
+            views = []
+            for view_idx in image_idxs:
+                scene_id = self.sceneids[view_idx]
+                scene_dir = osp.join(self.ROOT, self.scenes[scene_id])
+                rgb_dir = osp.join(scene_dir, "rgb")
+                cam_dir = osp.join(scene_dir, "cam")
+
+                basename = self.images[view_idx]
+
+                try:
+                    # Load RGB image
+                    rgb_image = imread_cv2(osp.join(rgb_dir, basename + ".jpg"))
+                    # Load depthmap, no depth, set to all ones
+                    depthmap = np.ones_like(rgb_image[..., 0], dtype=np.float32)
+                    cam = np.load(osp.join(cam_dir, basename + ".npz"))
+                    camera_pose = cam["pose"]
+                    # NOTE(review): fx and fy both come from entry [0, 0], cx
+                    # from [1, 1] and cy from [0, 2] of the stored matrix, so
+                    # it is presumably not a standard 3x3 K -- confirm against
+                    # the preprocessing script.
+                    intrinsics = np.eye(3)
+                    intrinsics[0, 0] = cam["intrinsics"][0, 0]
+                    intrinsics[1, 1] = cam["intrinsics"][0, 0]
+                    intrinsics[0, 2] = cam["intrinsics"][1, 1]
+                    intrinsics[1, 2] = cam["intrinsics"][0, 2]
+                except Exception:
+                    # A bare ``except`` would also swallow KeyboardInterrupt and
+                    # SystemExit and make this retry loop hard to stop.
+                    print(f"Error loading {scene} {basename}, skipping")
+                    self.invalid_scenes[scene] = True
+                    break
+
+                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                    rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
+                )
+
+                views.append(
+                    dict(
+                        img=rgb_image,
+                        depthmap=depthmap.astype(np.float32),
+                        camera_pose=camera_pose.astype(np.float32),
+                        camera_intrinsics=intrinsics.astype(np.float32),
+                        dataset="MVImgnet",
+                        label=self.scenes[scene_id] + "_" + basename,
+                        instance=f"{str(idx)}_{str(view_idx)}",
+                        is_metric=self.is_metric,
+                        is_video=ordered_video,
+                        quantile=np.array(0.98, dtype=np.float32),
+                        img_mask=True,
+                        ray_mask=False,
+                        camera_only=True,
+                        depth_only=False,
+                        single_view=False,
+                        reset=False,
+                    )
+                )
+            if len(views) == num_views:
+                invalid_seq = False
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/mvs_synth.py b/extern/CUT3R/src/dust3r/datasets/mvs_synth.py
new file mode 100644
index 0000000000000000000000000000000000000000..5492801a1bfadd28bae329c52c9cfd1da4e9c779
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/mvs_synth.py
@@ -0,0 +1,143 @@
+import os.path as osp
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from tqdm import tqdm
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+
+
+class MVS_Synth_Multi(BaseMultiViewDataset):
+    """Multi-view video dataset backed by per-scene folders under ``ROOT``.
+
+    Files are read from ``ROOT/<scene>/rgb/<name>.jpg``,
+    ``ROOT/<scene>/depth/<name>.npy`` and ``ROOT/<scene>/cam/<name>.npz``
+    (the archive provides the ``pose`` and ``intrinsics`` arrays).
+    """
+
+    def __init__(self, *args, ROOT, **kwargs):
+        """Store the dataset settings and index every scene found in *ROOT*.
+
+        Args:
+            ROOT: directory holding one sub-directory per scene.
+            *args, **kwargs: forwarded unchanged to the base class.
+        """
+        self.ROOT = ROOT
+        self.video = True
+        self.is_metric = False
+        self.max_interval = 4
+        super().__init__(*args, **kwargs)
+        # _load_data() populates the index attributes and returns None.
+        self.loaded_data = self._load_data()
+
+    def _load_data(self):
+        """Build the flat image index and the list of valid start frames.
+
+        Image ids are global: each retained scene owns a contiguous id range,
+        and ``self.sceneids[image_id]`` maps an id back to its scene index.
+        Scenes with fewer than ``cut_off`` frames are skipped.
+        """
+        # NOTE(review): os.listdir() returns entries in arbitrary order, so the
+        # id assignment depends on the filesystem.
+        self.scenes = os.listdir(self.ROOT)
+
+        # Minimum number of frames a scene must offer; identical for all scenes.
+        cut_off = (
+            self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
+        )
+
+        offset = 0
+        scenes = []
+        sceneids = []
+        scene_img_list = []
+        images = []
+        start_img_ids = []
+
+        for scene in tqdm(self.scenes):
+            rgb_dir = osp.join(self.ROOT, scene, "rgb")
+            basenames = sorted(
+                f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".jpg")
+            )
+            num_imgs = len(basenames)
+            if num_imgs < cut_off:
+                print(f"Skipping {scene}")
+                continue
+
+            img_ids = list(np.arange(num_imgs) + offset)
+            # A start frame must leave at least cut_off frames from itself on.
+            start_img_ids.extend(img_ids[: num_imgs - cut_off + 1])
+            # len(scenes) is the index this scene receives once appended.
+            sceneids.extend([len(scenes)] * num_imgs)
+            images.extend(basenames)
+            scenes.append(scene)
+            scene_img_list.append(img_ids)
+
+            # offset groups
+            offset += num_imgs
+
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.images = images
+        self.start_img_ids = start_img_ids
+        self.scene_img_list = scene_img_list
+
+    def __len__(self):
+        """Return the number of valid start frames."""
+        return len(self.start_img_ids)
+
+    def get_image_num(self):
+        """Return the number of indexed images over all retained scenes."""
+        return len(self.images)
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        """Load ``num_views`` views of the sequence starting at sample *idx*.
+
+        Args:
+            idx: index into ``self.start_img_ids``.
+            resolution: passed through to ``_crop_resize_if_necessary``.
+            rng: random generator handed to the sampling helpers.
+            num_views: number of views to return.
+
+        Returns:
+            A list of ``num_views`` dicts, one per view.
+
+        Raises:
+            AssertionError: if the number of loaded views differs from
+                ``num_views``.
+        """
+        start_id = self.start_img_ids[idx]
+        all_image_ids = self.scene_img_list[self.sceneids[start_id]]
+        pos, ordered_video = self.get_seq_from_start_id(
+            num_views,
+            start_id,
+            all_image_ids,
+            rng,
+            max_interval=self.max_interval,
+            video_prob=1.0,
+            fix_interval_prob=1.0,
+        )
+        image_idxs = np.array(all_image_ids)[pos]
+
+        views = []
+        for v, view_idx in enumerate(image_idxs):
+            scene_id = self.sceneids[view_idx]
+            scene_dir = osp.join(self.ROOT, self.scenes[scene_id])
+            rgb_dir = osp.join(scene_dir, "rgb")
+            depth_dir = osp.join(scene_dir, "depth")
+            cam_dir = osp.join(scene_dir, "cam")
+
+            basename = self.images[view_idx]
+
+            # Load RGB image
+            rgb_image = imread_cv2(osp.join(rgb_dir, basename + ".jpg"))
+            # Load depthmap
+            depthmap = np.load(osp.join(depth_dir, basename + ".npy"))
+            depthmap[~np.isfinite(depthmap)] = 0  # invalid
+            # Drop everything above the 98th percentile of the positive depths.
+            positive_depths = depthmap[depthmap > 0]
+            threshold = (
+                np.percentile(positive_depths, 98) if positive_depths.size > 0 else 0
+            )
+            depthmap[depthmap > threshold] = 0.0
+            depthmap[depthmap > 1000] = 0.0
+
+            # Read both arrays inside the context manager so the archive's
+            # file handle is closed right away instead of lingering per view.
+            with np.load(osp.join(cam_dir, basename + ".npz")) as cam:
+                camera_pose = cam["pose"]
+                intrinsics = cam["intrinsics"]
+            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
+            )
+
+            # generate img mask and raymap mask
+            img_mask, ray_mask = self.get_img_and_ray_masks(
+                self.is_metric, v, rng, p=[0.8, 0.15, 0.05]
+            )
+
+            views.append(
+                dict(
+                    img=rgb_image,
+                    depthmap=depthmap.astype(np.float32),
+                    camera_pose=camera_pose.astype(np.float32),
+                    camera_intrinsics=intrinsics.astype(np.float32),
+                    dataset="MVS_Synth",
+                    label=self.scenes[scene_id] + "_" + basename,
+                    instance=osp.join(rgb_dir, basename + ".jpg"),
+                    is_metric=self.is_metric,
+                    is_video=ordered_video,
+                    quantile=np.array(1.0, dtype=np.float32),
+                    img_mask=img_mask,
+                    ray_mask=ray_mask,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=False,
+                    reset=False,
+                )
+            )
+        assert len(views) == num_views
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/omniobject3d.py b/extern/CUT3R/src/dust3r/datasets/omniobject3d.py
new file mode 100755
index 0000000000000000000000000000000000000000..1cce8c52ece21476798fe3b310bf78e83125456a
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/omniobject3d.py
@@ -0,0 +1,146 @@
+import os.path as osp
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+import json
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from tqdm import tqdm
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+import re
+
+
+def extract_number(filename):
+    """Return the first run of digits in *filename* as an int, or 0 if none."""
+    found = re.search(r"\d+", filename)
+    return int(found.group()) if found else 0
+
+
+class OmniObject3D_Multi(BaseMultiViewDataset):
+    """Multi-view dataset backed by per-object folders under ``ROOT``.
+
+    Files are read from ``ROOT/<scene>/rgb/<name>.png``,
+    ``ROOT/<scene>/depth/<name>.npy`` and ``ROOT/<scene>/cam/<name>.npz``
+    (``pose`` and ``intrinsics`` arrays). ``ROOT/scale.json`` maps each scene
+    name to the divisor applied to its depth and camera translation.
+    """
+
+    def __init__(self, *args, ROOT, **kwargs):
+        """Store the dataset settings and index every scene found in *ROOT*.
+
+        Args:
+            ROOT: directory holding one sub-directory per scene and
+                ``scale.json``.
+            *args, **kwargs: forwarded unchanged to the base class.
+        """
+        self.ROOT = ROOT
+        self.video = False
+        self.is_metric = False  # True
+        super().__init__(*args, **kwargs)
+
+        # _load_data() populates the index attributes and returns None.
+        self.loaded_data = self._load_data()
+
+    def _load_data(self):
+        """Build the flat image index and the list of valid start frames.
+
+        Image ids are global: each retained scene owns a contiguous id range,
+        and ``self.sceneids[image_id]`` maps an id back to its scene index.
+        ``self.start_img_ids`` holds ``(scene_name, image_id)`` pairs.
+        """
+        self.scenes = [
+            d
+            for d in os.listdir(self.ROOT)
+            if os.path.isdir(os.path.join(self.ROOT, d))
+        ]
+        with open(os.path.join(self.ROOT, "scale.json"), "r") as f:
+            self.scales = json.load(f)
+
+        # Minimum number of frames a scene must offer; identical for all scenes.
+        cut_off = (
+            self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
+        )
+
+        offset = 0
+        scenes = []
+        sceneids = []
+        scene_img_list = []
+        images = []
+        start_img_ids = []
+
+        for scene in tqdm(self.scenes):
+            rgb_dir = osp.join(self.ROOT, scene, "rgb")
+            # Order frames by the first number in their name, not lexically.
+            basenames = sorted(
+                [f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".png")],
+                key=extract_number,
+            )
+
+            num_imgs = len(basenames)
+            if num_imgs < cut_off:
+                print(f"Skipping {scene}")
+                continue
+
+            img_ids = list(np.arange(num_imgs) + offset)
+            # A start frame must leave at least cut_off frames from itself on.
+            start_img_ids.extend(
+                (scene, img_id) for img_id in img_ids[: num_imgs - cut_off + 1]
+            )
+            # len(scenes) is the index this scene receives once appended.
+            sceneids.extend([len(scenes)] * num_imgs)
+            images.extend(basenames)
+            scenes.append(scene)
+            scene_img_list.append(img_ids)
+
+            # offset groups
+            offset += num_imgs
+
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.images = images
+        self.start_img_ids = start_img_ids
+        self.scene_img_list = scene_img_list
+
+    def __len__(self):
+        """Return the number of valid start frames."""
+        return len(self.start_img_ids)
+
+    def get_image_num(self):
+        """Return the number of indexed images over all retained scenes."""
+        return len(self.images)
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        """Load ``num_views`` views of the sequence starting at sample *idx*.
+
+        Args:
+            idx: index into ``self.start_img_ids``.
+            resolution: passed through to ``_crop_resize_if_necessary``.
+            rng: random generator handed to the sampling helpers.
+            num_views: number of views to return.
+
+        Returns:
+            A list of ``num_views`` dicts, one per view.
+
+        Raises:
+            AssertionError: if the number of loaded views differs from
+                ``num_views``.
+        """
+        _, start_id = self.start_img_ids[idx]
+        all_image_ids = self.scene_img_list[self.sceneids[start_id]]
+        pos, ordered_video = self.get_seq_from_start_id(
+            num_views, start_id, all_image_ids, rng, max_interval=100, video_prob=0.0
+        )
+        image_idxs = np.array(all_image_ids)[pos]
+
+        views = []
+        for v, view_idx in enumerate(image_idxs):
+            scene_id = self.sceneids[view_idx]
+            scene_dir = osp.join(self.ROOT, self.scenes[scene_id])
+            rgb_dir = osp.join(scene_dir, "rgb")
+            depth_dir = osp.join(scene_dir, "depth")
+            cam_dir = osp.join(scene_dir, "cam")
+
+            basename = self.images[view_idx]
+
+            # Load RGB image
+            rgb_image = imread_cv2(osp.join(rgb_dir, basename + ".png"))
+            depthmap = np.load(osp.join(depth_dir, basename + ".npy"))
+            # Read both arrays inside the context manager so the archive's
+            # file handle is closed right away instead of lingering per view.
+            with np.load(osp.join(cam_dir, basename + ".npz")) as cam:
+                camera_pose = cam["pose"]
+                intrinsics = cam["intrinsics"]
+            # Depth and camera translation share the same per-scene rescaling.
+            scale = self.scales[self.scenes[scene_id]]
+            depthmap = depthmap / scale / 1000.0
+            camera_pose[:3, 3] = camera_pose[:3, 3] / scale / 1000.0
+
+            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
+            )
+
+            img_mask, ray_mask = self.get_img_and_ray_masks(
+                self.is_metric, v, rng, p=[0.8, 0.15, 0.05]
+            )
+
+            views.append(
+                dict(
+                    img=rgb_image,
+                    depthmap=depthmap.astype(np.float32),
+                    camera_pose=camera_pose.astype(np.float32),
+                    camera_intrinsics=intrinsics.astype(np.float32),
+                    dataset="OmniObject3D",
+                    label=self.scenes[scene_id] + "_" + basename,
+                    instance=f"{idx}_{view_idx}",
+                    is_metric=self.is_metric,
+                    is_video=ordered_video,
+                    quantile=np.array(1.0, dtype=np.float32),
+                    img_mask=img_mask,
+                    ray_mask=ray_mask,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=False,
+                    reset=False,
+                )
+            )
+        assert len(views) == num_views
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/pointodyssey.py b/extern/CUT3R/src/dust3r/datasets/pointodyssey.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ced302f1bdaed09fc2294fd6c3a7dd8e248f964
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/pointodyssey.py
@@ -0,0 +1,178 @@
+import os.path as osp
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from tqdm import tqdm
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+
+
+class PointOdyssey_Multi(BaseMultiViewDataset):
+    """Multi-view video dataset backed by ``ROOT/<split>/<scene>`` folders.
+
+    Files are read from ``<scene>/rgb/<name>.jpg``, ``<scene>/depth/<name>.npy``
+    and ``<scene>/cam/<name>.npz`` (``pose`` and ``intrinsics`` arrays). Only
+    scenes named in ``self.scenes_to_use`` are indexed.
+    """
+
+    def __init__(self, *args, ROOT, **kwargs):
+        """Store the dataset settings and index the allowed scenes.
+
+        Args:
+            ROOT: directory holding one sub-directory per split.
+            *args, **kwargs: forwarded unchanged to the base class.
+
+        Raises:
+            AssertionError: if ``self.split`` is not ``train``, ``test`` or
+                ``val``.
+        """
+        self.ROOT = ROOT
+        self.video = True
+        self.is_metric = True
+        self.max_interval = 4
+        super().__init__(*args, **kwargs)
+        assert self.split in ["train", "test", "val"]
+        self.scenes_to_use = [
+            # 'cab_h_bench_3rd', 'cab_h_bench_ego1', 'cab_h_bench_ego2',
+            "cnb_dlab_0215_3rd",
+            "cnb_dlab_0215_ego1",
+            "cnb_dlab_0225_3rd",
+            "cnb_dlab_0225_ego1",
+            "dancing",
+            "dancingroom0_3rd",
+            "footlab_3rd",
+            "footlab_ego1",
+            "footlab_ego2",
+            "girl",
+            "girl_egocentric",
+            "human_egocentric",
+            "human_in_scene",
+            "human_in_scene1",
+            "kg",
+            "kg_ego1",
+            "kg_ego2",
+            "kitchen_gfloor",
+            "kitchen_gfloor_ego1",
+            "kitchen_gfloor_ego2",
+            "scene_carb_h_tables",
+            "scene_carb_h_tables_ego1",
+            "scene_carb_h_tables_ego2",
+            "scene_j716_3rd",
+            "scene_j716_ego1",
+            "scene_j716_ego2",
+            "scene_recording_20210910_S05_S06_0_3rd",
+            "scene_recording_20210910_S05_S06_0_ego2",
+            "scene1_0129",
+            "scene1_0129_ego",
+            "seminar_h52_3rd",
+            "seminar_h52_ego1",
+            "seminar_h52_ego2",
+        ]
+        # _load_data() populates the index attributes and returns None.
+        self.loaded_data = self._load_data(self.split)
+
+    def _load_data(self, split):
+        """Build the flat image index for *split* and the valid start frames.
+
+        Image ids are global: each retained scene owns a contiguous id range,
+        and ``self.sceneids[image_id]`` maps an id back to its scene index.
+        Scenes outside ``self.scenes_to_use`` or with fewer than ``cut_off``
+        frames are skipped.
+        """
+        root = os.path.join(self.ROOT, split)
+        self.scenes = []
+
+        # Minimum number of frames a scene must offer; identical for all scenes.
+        cut_off = (
+            self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
+        )
+
+        offset = 0
+        scenes = []
+        sceneids = []
+        scene_img_list = []
+        images = []
+        start_img_ids = []
+
+        for scene in tqdm(os.listdir(root)):
+            if scene not in self.scenes_to_use:
+                continue
+            rgb_dir = osp.join(root, scene, "rgb")
+            basenames = sorted(
+                f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".jpg")
+            )
+            num_imgs = len(basenames)
+            # Check the length first so no start ids are sliced for a scene
+            # that is going to be dropped anyway.
+            if num_imgs < cut_off:
+                print(f"Skipping {scene}")
+                continue
+
+            img_ids = list(np.arange(num_imgs) + offset)
+            # A start frame must leave at least cut_off frames from itself on.
+            start_img_ids.extend(img_ids[: num_imgs - cut_off + 1])
+            # len(scenes) is the index this scene receives once appended.
+            sceneids.extend([len(scenes)] * num_imgs)
+            images.extend(basenames)
+            scenes.append(scene)
+            scene_img_list.append(img_ids)
+
+            # offset groups
+            offset += num_imgs
+
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.images = images
+        self.start_img_ids = start_img_ids
+        self.scene_img_list = scene_img_list
+
+    def __len__(self):
+        """Return the number of valid start frames."""
+        return len(self.start_img_ids)
+
+    def get_image_num(self):
+        """Return the number of indexed images over all retained scenes."""
+        return len(self.images)
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        """Load ``num_views`` views of the sequence starting at sample *idx*.
+
+        Args:
+            idx: index into ``self.start_img_ids``.
+            resolution: passed through to ``_crop_resize_if_necessary``.
+            rng: random generator handed to the sampling helpers.
+            num_views: number of views to return.
+
+        Returns:
+            A list of ``num_views`` dicts, one per view.
+
+        Raises:
+            AssertionError: if the number of loaded views differs from
+                ``num_views``.
+        """
+        start_id = self.start_img_ids[idx]
+        all_image_ids = self.scene_img_list[self.sceneids[start_id]]
+        pos, ordered_video = self.get_seq_from_start_id(
+            num_views,
+            start_id,
+            all_image_ids,
+            rng,
+            max_interval=self.max_interval,
+            video_prob=1.0,
+            fix_interval_prob=1.0,
+        )
+        image_idxs = np.array(all_image_ids)[pos]
+
+        views = []
+        for v, view_idx in enumerate(image_idxs):
+            scene_id = self.sceneids[view_idx]
+            scene_dir = osp.join(self.ROOT, self.split, self.scenes[scene_id])
+            rgb_dir = osp.join(scene_dir, "rgb")
+            depth_dir = osp.join(scene_dir, "depth")
+            cam_dir = osp.join(scene_dir, "cam")
+
+            basename = self.images[view_idx]
+
+            # Load RGB image
+            rgb_image = imread_cv2(osp.join(rgb_dir, basename + ".jpg"))
+            # Load depthmap
+            depthmap = np.load(osp.join(depth_dir, basename + ".npy"))
+            depthmap[~np.isfinite(depthmap)] = 0  # invalid
+            depthmap[depthmap > 1000] = 0.0
+
+            # Read both arrays inside the context manager so the archive's
+            # file handle is closed right away instead of lingering per view.
+            with np.load(osp.join(cam_dir, basename + ".npz")) as cam:
+                camera_pose = cam["pose"]
+                intrinsics = cam["intrinsics"]
+            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
+            )
+
+            # generate img mask and raymap mask
+            img_mask, ray_mask = self.get_img_and_ray_masks(
+                self.is_metric, v, rng, p=[0.9, 0.05, 0.05]
+            )
+
+            views.append(
+                dict(
+                    img=rgb_image,
+                    depthmap=depthmap.astype(np.float32),
+                    camera_pose=camera_pose.astype(np.float32),
+                    camera_intrinsics=intrinsics.astype(np.float32),
+                    dataset="PointOdyssey",
+                    label=self.scenes[scene_id] + "_" + basename,
+                    instance=osp.join(rgb_dir, basename + ".jpg"),
+                    is_metric=self.is_metric,
+                    is_video=ordered_video,
+                    quantile=np.array(1.0, dtype=np.float32),
+                    img_mask=img_mask,
+                    ray_mask=ray_mask,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=False,
+                    reset=False,
+                )
+            )
+        assert len(views) == num_views
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/realestate10k.py b/extern/CUT3R/src/dust3r/datasets/realestate10k.py
new file mode 100644
index 0000000000000000000000000000000000000000..34526946529905640be4ee49d0530b950bafdb04
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/realestate10k.py
@@ -0,0 +1,139 @@
+import os.path as osp
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from tqdm import tqdm
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+
+
+class RE10K_Multi(BaseMultiViewDataset):
+    """Camera-only multi-view video dataset backed by ``ROOT/<scene>`` folders.
+
+    Files are read from ``<scene>/rgb/<name>.png`` and ``<scene>/cam/<name>.npz``
+    (``pose`` and ``intrinsics`` arrays). No depth is loaded; every view gets
+    an all-ones depth map. Scenes that fail to load are remembered in
+    ``self.invalid_scenes`` and avoided afterwards.
+    """
+
+    def __init__(self, *args, ROOT, **kwargs):
+        """Store the dataset settings and index every scene found in *ROOT*.
+
+        Args:
+            ROOT: directory holding one sub-directory per scene.
+            *args, **kwargs: forwarded unchanged to the base class.
+        """
+        self.ROOT = ROOT
+        self.video = True
+        self.is_metric = False
+        self.max_interval = 128
+        super().__init__(*args, **kwargs)
+        # _load_data() populates the index attributes and returns None.
+        self.loaded_data = self._load_data()
+
+    def _load_data(self):
+        """Build the flat image index and the list of valid start frames.
+
+        Image ids are global: each retained scene owns a contiguous id range,
+        and ``self.sceneids[image_id]`` maps an id back to its scene index.
+        ``self.start_img_ids`` holds ``(scene_name, image_id)`` pairs.
+        """
+        # NOTE(review): os.listdir() returns entries in arbitrary order, so the
+        # id assignment depends on the filesystem.
+        self.scenes = os.listdir(self.ROOT)
+
+        # Minimum number of frames a scene must offer; identical for all scenes.
+        cut_off = (
+            self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
+        )
+
+        offset = 0
+        scenes = []
+        sceneids = []
+        scene_img_list = []
+        images = []
+        start_img_ids = []
+
+        for scene in tqdm(self.scenes):
+            rgb_dir = osp.join(self.ROOT, scene, "rgb")
+            # Frame names are integers; sort numerically rather than lexically.
+            basenames = sorted(
+                [f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".png")],
+                key=int,
+            )
+
+            num_imgs = len(basenames)
+            if num_imgs < cut_off:
+                print(f"Skipping {scene}")
+                continue
+
+            img_ids = list(np.arange(num_imgs) + offset)
+            # A start frame must leave at least cut_off frames from itself on.
+            start_img_ids.extend(
+                (scene, img_id) for img_id in img_ids[: num_imgs - cut_off + 1]
+            )
+            # len(scenes) is the index this scene receives once appended.
+            sceneids.extend([len(scenes)] * num_imgs)
+            images.extend(basenames)
+            scenes.append(scene)
+            scene_img_list.append(img_ids)
+
+            # offset groups
+            offset += num_imgs
+
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.images = images
+        self.start_img_ids = start_img_ids
+        self.scene_img_list = scene_img_list
+
+        self.invalid_scenes = {scene: False for scene in self.scenes}
+
+    def __len__(self):
+        """Return the number of valid start frames."""
+        return len(self.start_img_ids)
+
+    def get_image_num(self):
+        """Return the number of indexed images over all retained scenes."""
+        return len(self.images)
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        """Load ``num_views`` views, resampling the start when a scene fails.
+
+        If any frame of the chosen sequence cannot be loaded, the scene is
+        marked invalid and a new start frame is drawn with *rng* until a
+        complete sequence is obtained.
+
+        Args:
+            idx: index into ``self.start_img_ids`` (replaced on resampling).
+            resolution: passed through to ``_crop_resize_if_necessary``.
+            rng: random generator used for sampling and resampling.
+            num_views: number of views to return.
+
+        Returns:
+            A list of ``num_views`` dicts, one per view.
+
+        Raises:
+            RuntimeError: if every scene has been marked invalid, since
+                resampling could then never terminate.
+        """
+        invalid_seq = True
+        scene, start_id = self.start_img_ids[idx]
+
+        while invalid_seq:
+            if self.invalid_scenes[scene]:
+                # Without this guard the resampling loop below would spin
+                # forever once no loadable scene is left.
+                if all(self.invalid_scenes.values()):
+                    raise RuntimeError(
+                        "realestate10k: every scene failed to load, "
+                        "cannot sample a valid sequence"
+                    )
+                while self.invalid_scenes[scene]:
+                    idx = rng.integers(low=0, high=len(self.start_img_ids))
+                    scene, start_id = self.start_img_ids[idx]
+
+            all_image_ids = self.scene_img_list[self.sceneids[start_id]]
+            pos, ordered_video = self.get_seq_from_start_id(
+                num_views, start_id, all_image_ids, rng, max_interval=self.max_interval
+            )
+            image_idxs = np.array(all_image_ids)[pos]
+
+            views = []
+            for view_idx in image_idxs:
+                scene_id = self.sceneids[view_idx]
+                scene_dir = osp.join(self.ROOT, self.scenes[scene_id])
+                rgb_dir = osp.join(scene_dir, "rgb")
+                cam_dir = osp.join(scene_dir, "cam")
+
+                basename = self.images[view_idx]
+
+                try:
+                    # Load RGB image
+                    rgb_image = imread_cv2(osp.join(rgb_dir, basename + ".png"))
+                    # Load depthmap, no depth, set to all ones
+                    depthmap = np.ones_like(rgb_image[..., 0], dtype=np.float32)
+                    # The context manager closes the archive's file handle.
+                    with np.load(osp.join(cam_dir, basename + ".npz")) as cam:
+                        intrinsics = cam["intrinsics"]
+                        camera_pose = cam["pose"]
+                except Exception:
+                    # Deliberately broad best-effort skip, but no longer a bare
+                    # ``except`` so KeyboardInterrupt/SystemExit still propagate.
+                    print(f"Error loading {scene} {basename}, skipping")
+                    self.invalid_scenes[scene] = True
+                    break
+
+                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                    rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
+                )
+
+                views.append(
+                    dict(
+                        img=rgb_image,
+                        depthmap=depthmap.astype(np.float32),
+                        camera_pose=camera_pose.astype(np.float32),
+                        camera_intrinsics=intrinsics.astype(np.float32),
+                        dataset="realestate10k",
+                        label=self.scenes[scene_id] + "_" + basename,
+                        instance=f"{idx}_{view_idx}",
+                        is_metric=self.is_metric,
+                        is_video=ordered_video,
+                        quantile=np.array(0.98, dtype=np.float32),
+                        img_mask=True,
+                        ray_mask=False,
+                        camera_only=True,
+                        depth_only=False,
+                        single_view=False,
+                        reset=False,
+                    )
+                )
+            if len(views) == num_views:
+                invalid_seq = False
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/scannet.py b/extern/CUT3R/src/dust3r/datasets/scannet.py
new file mode 100755
index 0000000000000000000000000000000000000000..a4eb2fd3799a0bda6f1d3de6f0d73dee79b12d82
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/scannet.py
@@ -0,0 +1,148 @@
+import os.path as osp
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from tqdm import tqdm
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+
+
+class ScanNet_Multi(BaseMultiViewDataset):
+    """Multi-view video dataset backed by ScanNet scene folders under ``ROOT``.
+
+    Scenes live in ``ROOT/scans_train`` for the ``train`` split and in
+    ``ROOT/scans_test`` otherwise. Per scene, frame names come from
+    ``new_scene_metadata.npz`` (``images`` array) and files are read from
+    ``color/<name>.jpg``, ``depth/<name>.png`` and ``cam/<name>.npz``
+    (``pose`` and ``intrinsics`` arrays).
+    """
+
+    def __init__(self, *args, ROOT, **kwargs):
+        """Store the dataset settings and index the scenes of ``self.split``.
+
+        Args:
+            ROOT: directory holding ``scans_train`` and ``scans_test``.
+            *args, **kwargs: forwarded unchanged to the base class.
+        """
+        self.ROOT = ROOT
+        self.video = True
+        self.is_metric = True
+        self.max_interval = 30
+        super().__init__(*args, **kwargs)
+
+        # _load_data() populates the index attributes and returns None.
+        self.loaded_data = self._load_data(self.split)
+
+    def _load_data(self, split):
+        """Build the flat image index for *split* and the valid start frames.
+
+        Image ids are global: each retained scene owns a contiguous id range,
+        and ``self.sceneids[image_id]`` maps an id back to its scene index.
+        Scenes with fewer than ``cut_off`` frames are skipped.
+        """
+        self.scene_root = osp.join(
+            self.ROOT, "scans_train" if split == "train" else "scans_test"
+        )
+        self.scenes = [
+            scene for scene in os.listdir(self.scene_root) if scene.startswith("scene")
+        ]
+
+        # Minimum number of frames a scene must offer; identical for all scenes.
+        cut_off = (
+            self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
+        )
+
+        offset = 0
+        scenes = []
+        sceneids = []
+        scene_img_list = []
+        images = []
+        start_img_ids = []
+
+        for scene in tqdm(self.scenes):
+            scene_dir = osp.join(self.scene_root, scene)
+            # SECURITY NOTE: allow_pickle=True unpickles objects stored in the
+            # archive; only point ROOT at metadata files you trust.
+            with np.load(
+                osp.join(scene_dir, "new_scene_metadata.npz"), allow_pickle=True
+            ) as data:
+                basenames = data["images"]
+
+            num_imgs = len(basenames)
+            # Check the length first so no start ids are sliced for a scene
+            # that is going to be dropped anyway.
+            if num_imgs < cut_off:
+                print(f"Skipping {scene}")
+                continue
+
+            img_ids = list(np.arange(num_imgs) + offset)
+            # A start frame must leave at least cut_off frames from itself on.
+            start_img_ids.extend(img_ids[: num_imgs - cut_off + 1])
+            # len(scenes) is the index this scene receives once appended.
+            sceneids.extend([len(scenes)] * num_imgs)
+            images.extend(basenames)
+            scenes.append(scene)
+            scene_img_list.append(img_ids)
+
+            # offset groups
+            offset += num_imgs
+
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.images = images
+        self.start_img_ids = start_img_ids
+        self.scene_img_list = scene_img_list
+
+    def __len__(self):
+        """Return the number of valid start frames."""
+        return len(self.start_img_ids)
+
+    def get_image_num(self):
+        """Return the number of indexed images over all retained scenes."""
+        return len(self.images)
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        """Load ``num_views`` views of the sequence starting at sample *idx*.
+
+        Args:
+            idx: index into ``self.start_img_ids``.
+            resolution: passed through to ``_crop_resize_if_necessary``.
+            rng: random generator handed to the sampling helpers.
+            num_views: number of views to return.
+
+        Returns:
+            A list of ``num_views`` dicts, one per view.
+
+        Raises:
+            AssertionError: if the number of loaded views differs from
+                ``num_views``.
+        """
+        start_id = self.start_img_ids[idx]
+        all_image_ids = self.scene_img_list[self.sceneids[start_id]]
+        pos, ordered_video = self.get_seq_from_start_id(
+            num_views,
+            start_id,
+            all_image_ids,
+            rng,
+            max_interval=self.max_interval,
+            video_prob=0.6,
+            fix_interval_prob=0.6,
+            block_shuffle=16,
+        )
+        image_idxs = np.array(all_image_ids)[pos]
+
+        views = []
+        for v, view_idx in enumerate(image_idxs):
+            scene_id = self.sceneids[view_idx]
+            scene_dir = osp.join(self.scene_root, self.scenes[scene_id])
+            rgb_dir = osp.join(scene_dir, "color")
+            depth_dir = osp.join(scene_dir, "depth")
+            cam_dir = osp.join(scene_dir, "cam")
+
+            basename = self.images[view_idx]
+
+            # Load RGB image
+            rgb_image = imread_cv2(osp.join(rgb_dir, basename + ".jpg"))
+            # Load depthmap
+            depthmap = imread_cv2(
+                osp.join(depth_dir, basename + ".png"), cv2.IMREAD_UNCHANGED
+            )
+            # Stored values are divided by 1000 (presumably millimetres to
+            # metres -- TODO confirm against the preprocessing script).
+            depthmap = depthmap.astype(np.float32) / 1000
+            depthmap[~np.isfinite(depthmap)] = 0  # invalid
+
+            # Read both arrays inside the context manager so the archive's
+            # file handle is closed right away instead of lingering per view.
+            with np.load(osp.join(cam_dir, basename + ".npz")) as cam:
+                camera_pose = cam["pose"]
+                intrinsics = cam["intrinsics"]
+            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
+            )
+
+            # generate img mask and raymap mask
+            img_mask, ray_mask = self.get_img_and_ray_masks(
+                self.is_metric, v, rng, p=[0.75, 0.2, 0.05]
+            )
+
+            views.append(
+                dict(
+                    img=rgb_image,
+                    depthmap=depthmap.astype(np.float32),
+                    camera_pose=camera_pose.astype(np.float32),
+                    camera_intrinsics=intrinsics.astype(np.float32),
+                    dataset="ScanNet",
+                    label=self.scenes[scene_id] + "_" + basename,
+                    instance=f"{idx}_{view_idx}",
+                    is_metric=self.is_metric,
+                    is_video=ordered_video,
+                    quantile=np.array(0.98, dtype=np.float32),
+                    img_mask=img_mask,
+                    ray_mask=ray_mask,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=False,
+                    reset=False,
+                )
+            )
+        assert len(views) == num_views
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/scannetpp.py b/extern/CUT3R/src/dust3r/datasets/scannetpp.py
new file mode 100755
index 0000000000000000000000000000000000000000..5ef363ef49638f3b4599da865d545d32462f34e4
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/scannetpp.py
@@ -0,0 +1,191 @@
+import os.path as osp
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+
+
+class ScanNetpp_Multi(BaseMultiViewDataset):
+    """Multi-view dataset backed by ScanNet++ scene folders under ``ROOT``.
+
+    ``ROOT/all_metadata.npz`` lists the scenes; each scene folder provides
+    ``new_scene_metadata.npz`` (``images``, ``intrinsics``, ``trajectories``
+    and ``image_collection``) plus ``images/<name>.jpg`` and
+    ``depth/<name>.png``. Frames whose name starts with ``DSC`` and frames
+    whose name starts with ``frame`` are tracked as two separate id ranges.
+    """
+
+    def __init__(self, *args, ROOT, **kwargs):
+        """Store the dataset settings and index every scene listed in *ROOT*.
+
+        Args:
+            ROOT: directory holding ``all_metadata.npz`` and the scene folders.
+            *args, **kwargs: forwarded unchanged to the base class.
+
+        Raises:
+            AssertionError: if ``self.split`` is not ``train``.
+        """
+        self.ROOT = ROOT
+        self.video = True
+        self.is_metric = True
+        self.max_interval = 3
+        super().__init__(*args, **kwargs)
+        assert self.split == "train"
+        # _load_data() populates the index attributes and returns None.
+        self.loaded_data = self._load_data()
+
+    def _load_data(self):
+        """Build the image index, the view groups and their video id ranges.
+
+        Image ids are global: each retained scene owns a contiguous id range.
+        ``self.groups[k]`` is a sorted list of image ids and
+        ``self.id_ranges[k]`` is the id list used for video-style sampling of
+        the same group. Scenes without any usable group are skipped.
+        """
+        with np.load(osp.join(self.ROOT, "all_metadata.npz")) as data:
+            self.scenes = data["scenes"]
+        offset = 0
+        scenes = []
+        sceneids = []
+        images = []
+        intrinsics = []
+        trajectories = []
+        groups = []
+        id_ranges = []
+        self.image_num = 0
+        for scene in self.scenes:
+            scene_dir = osp.join(self.ROOT, scene)
+            # SECURITY NOTE: allow_pickle=True unpickles objects stored in the
+            # archive; only point ROOT at metadata files you trust.
+            with np.load(
+                osp.join(scene_dir, "new_scene_metadata.npz"), allow_pickle=True
+            ) as data:
+                imgs = data["images"]
+                num_imgs = len(imgs)
+                # Counted before any skip, so skipped scenes are included.
+                self.image_num += num_imgs
+                intrins = data["intrinsics"]
+                traj = data["trajectories"]
+                # A set makes the per-image "is it on disk" test O(1); the
+                # last four characters (the extension) are stripped.
+                imgs_on_disk = {
+                    fname[:-4] for fname in os.listdir(osp.join(scene_dir, "images"))
+                }
+
+                dslr_ids = [
+                    i + offset
+                    for i in range(num_imgs)
+                    if imgs[i].startswith("DSC") and imgs[i] in imgs_on_disk
+                ]
+                iphone_ids = [
+                    i + offset
+                    for i in range(num_imgs)
+                    if imgs[i].startswith("frame") and imgs[i] in imgs_on_disk
+                ]
+
+                # The ordering check is only meaningful when both kinds exist;
+                # max()/min() of an empty list would raise ValueError.
+                if dslr_ids and iphone_ids:
+                    assert max(dslr_ids) < min(iphone_ids)
+                assert "image_collection" in data
+
+                img_groups = []
+                img_id_ranges = []
+
+                for ref_id, group in data["image_collection"].item().items():
+                    if len(group) + 1 < self.num_views:
+                        continue
+                    group.insert(0, (ref_id, 1.0))
+                    # NOTE(review): only the ids are kept and they are sorted
+                    # by id, so the second tuple element does not influence
+                    # the stored order.
+                    img_groups.append(
+                        sorted(int(member[0] + offset) for member in group)
+                    )
+
+                    # NOTE(review): a "frame" reference is paired with the DSC
+                    # id range and vice versa -- confirm this is intended.
+                    if imgs[ref_id].startswith("frame"):
+                        img_id_ranges.append(dslr_ids)
+                    else:
+                        img_id_ranges.append(iphone_ids)
+
+                if len(img_groups) == 0:
+                    print(f"Skipping {scene}")
+                    continue
+                # len(scenes) is the index this scene receives once appended.
+                sceneids.extend([len(scenes)] * num_imgs)
+                scenes.append(scene)
+                images.extend(imgs)
+                intrinsics.append(intrins)
+                trajectories.append(traj)
+
+                # offset groups
+                groups.extend(img_groups)
+                id_ranges.extend(img_id_ranges)
+                offset += num_imgs
+
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.images = images
+        self.intrinsics = np.concatenate(intrinsics, axis=0)
+        self.trajectories = np.concatenate(trajectories, axis=0)
+        self.id_ranges = id_ranges
+        self.groups = groups
+
+    def __len__(self):
+        """Return ten samples per group; ``_get_views`` maps back with ``// 10``."""
+        return len(self.groups) * 10
+
+    def get_image_num(self):
+        """Return the number of images counted while reading the metadata."""
+        return self.image_num
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        """Load ``num_views`` views for sample *idx*.
+
+        With a random draw below 0.7 and a non-empty list of start frames, the
+        views are sampled from the group's video id range; otherwise they are
+        drawn from the group itself, kept sorted unless the draw exceeds 0.75.
+
+        Args:
+            idx: sample index; ``idx // 10`` selects the group.
+            resolution: passed through to ``_crop_resize_if_necessary``.
+            rng: random generator used for all sampling decisions.
+            num_views: number of views to return.
+
+        Returns:
+            A list of ``num_views`` dicts, one per view.
+
+        Raises:
+            AssertionError: if the number of loaded views differs from
+                ``num_views``.
+        """
+        idx = idx // 10
+        image_idxs = self.groups[idx]
+        rand_val = rng.random()
+
+        image_idxs_video = self.id_ranges[idx]
+        cut_off = num_views if not self.allow_repeat else max(num_views // 3, 3)
+        start_image_idxs = image_idxs_video[: len(image_idxs_video) - cut_off + 1]
+
+        if rand_val < 0.7 and len(start_image_idxs) > 0:
+            start_id = rng.choice(start_image_idxs)
+            pos, ordered_video = self.get_seq_from_start_id(
+                num_views,
+                start_id,
+                image_idxs_video,
+                rng,
+                max_interval=self.max_interval,
+                video_prob=0.8,
+                fix_interval_prob=0.5,
+                block_shuffle=16,
+            )
+            image_idxs = np.array(image_idxs_video)[pos]
+
+        else:
+            ordered_video = True
+            # ordered video with varying intervals
+            num_candidates = len(image_idxs)
+            max_id = min(num_candidates, int(num_views * (2 + 2 * rng.random())))
+            image_idxs = sorted(rng.permutation(image_idxs[:max_id])[:num_views])
+            if rand_val > 0.75:
+                ordered_video = False
+                image_idxs = rng.permutation(image_idxs)
+
+        views = []
+        for v, view_idx in enumerate(image_idxs):
+            scene_id = self.sceneids[view_idx]
+            scene_dir = osp.join(self.ROOT, self.scenes[scene_id])
+
+            intrinsics = self.intrinsics[view_idx]
+            camera_pose = self.trajectories[view_idx]
+            basename = self.images[view_idx]
+
+            # Load RGB image
+            rgb_image = imread_cv2(osp.join(scene_dir, "images", basename + ".jpg"))
+            # Load depthmap
+            depthmap = imread_cv2(
+                osp.join(scene_dir, "depth", basename + ".png"), cv2.IMREAD_UNCHANGED
+            )
+            # Stored values are divided by 1000 (presumably millimetres to
+            # metres -- TODO confirm against the preprocessing script).
+            depthmap = depthmap.astype(np.float32) / 1000
+            depthmap[~np.isfinite(depthmap)] = 0  # invalid
+
+            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
+            )
+
+            # generate img mask and raymap mask
+            img_mask, ray_mask = self.get_img_and_ray_masks(
+                self.is_metric, v, rng, p=[0.75, 0.2, 0.05]
+            )
+
+            views.append(
+                dict(
+                    img=rgb_image,
+                    depthmap=depthmap.astype(np.float32),
+                    camera_pose=camera_pose.astype(np.float32),
+                    camera_intrinsics=intrinsics.astype(np.float32),
+                    dataset="ScanNet++",
+                    label=self.scenes[scene_id] + "_" + basename,
+                    instance=f"{idx}_{view_idx}",
+                    is_metric=self.is_metric,
+                    is_video=ordered_video,
+                    quantile=np.array(0.99, dtype=np.float32),
+                    img_mask=img_mask,
+                    ray_mask=ray_mask,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=False,
+                    reset=False,
+                )
+            )
+        assert len(views) == num_views
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/smartportraits.py b/extern/CUT3R/src/dust3r/datasets/smartportraits.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5955aecd651f2bf1f6a666b0869b5d97816cf5f
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/smartportraits.py
@@ -0,0 +1,85 @@
+import os.path as osp
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from tqdm import tqdm
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+
+
+class SmartPortraits_Multi(BaseMultiViewDataset):
+    """SmartPortraits frames served as groups of unrelated single views.
+
+    Files are read from ``ROOT/<scene>/rgb/<name>.png``,
+    ``ROOT/<scene>/depth/<name>.npy`` and ``ROOT/<scene>/cam/<name>.npz``
+    (key ``intrinsics``). No camera pose is loaded: every view gets an
+    identity placeholder and is emitted with ``single_view=True`` and
+    ``reset=True``.
+    """
+
+    def __init__(self, *args, ROOT, **kwargs):
+        """Store the dataset root, initialise the base class, index the images."""
+        self.ROOT = ROOT
+        self.video = True
+        self.is_metric = True
+        super().__init__(*args, **kwargs)
+        # _load_data() fills self.img_names and returns None; the attribute is
+        # kept only so existing readers of ``loaded_data`` keep working.
+        self.loaded_data = self._load_data()
+
+    def _load_data(self):
+        """Index every ``(scene, basename)`` pair that has an RGB ``.png``."""
+        # os.listdir() returns entries in arbitrary order and may include stray
+        # files (e.g. .DS_Store): sort for a machine-independent index and keep
+        # directories only so listing "<entry>/rgb" cannot fail on a plain file.
+        scenes = sorted(
+            d for d in os.listdir(self.ROOT) if osp.isdir(osp.join(self.ROOT, d))
+        )
+        img_names = []
+        for scene in scenes:
+            rgb_dir = osp.join(self.ROOT, scene, "rgb")
+            basenames = sorted(
+                f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".png")
+            )
+            img_names.extend((scene, basename) for basename in basenames)
+
+        self.img_names = img_names
+
+    def __len__(self):
+        """Number of indexed images (one dataset item per image)."""
+        return len(self.img_names)
+
+    def get_image_num(self):
+        """Total number of indexed images."""
+        return len(self.img_names)
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        """Return ``num_views`` distinct, randomly drawn single-image views.
+
+        The images are drawn without replacement by a private generator seeded
+        from ``rng`` and ``idx``; ``rng`` itself is still the generator handed
+        to the crop/resize step.
+        """
+        new_seed = rng.integers(0, 2**32) + idx
+        new_rng = np.random.default_rng(new_seed)
+        # img_names holds (scene, basename) pairs; choice() samples whole rows.
+        picks = new_rng.choice(self.img_names, num_views, replace=False)
+
+        views = []
+        for scene, img_name in picks:
+            rgb_path = osp.join(self.ROOT, scene, "rgb", f"{img_name}.png")
+            rgb_image = imread_cv2(rgb_path)
+            depthmap = np.load(osp.join(self.ROOT, scene, "depth", f"{img_name}.npy"))
+            # Non-finite depth is treated as invalid (0).
+            depthmap = np.nan_to_num(depthmap, nan=0, posinf=0, neginf=0)
+
+            # Read the intrinsics inside a context manager so the .npz archive
+            # is closed right away instead of waiting for garbage collection.
+            with np.load(osp.join(self.ROOT, scene, "cam", f"{img_name}.npz")) as cam:
+                intrinsics = cam["intrinsics"]
+            # camera pose is not provided, placeholder
+            camera_pose = np.eye(4)
+
+            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=img_name
+            )
+
+            views.append(
+                dict(
+                    img=rgb_image,
+                    depthmap=depthmap.astype(np.float32),
+                    camera_pose=camera_pose.astype(np.float32),
+                    camera_intrinsics=intrinsics.astype(np.float32),
+                    dataset="SmartPortraits",
+                    label=img_name,
+                    instance=rgb_path,
+                    is_metric=self.is_metric,
+                    is_video=False,
+                    quantile=np.array(0.98, dtype=np.float32),
+                    img_mask=True,
+                    ray_mask=False,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=True,
+                    reset=True,
+                )
+            )
+        assert len(views) == num_views
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/spring.py b/extern/CUT3R/src/dust3r/datasets/spring.py
new file mode 100755
index 0000000000000000000000000000000000000000..39bc760a36f56be0e5020e5adacd6eb913aaca6d
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/spring.py
@@ -0,0 +1,137 @@
+import os.path as osp
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from tqdm import tqdm
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+
+
+class Spring(BaseMultiViewDataset):
+    """Spring dataset served as multi-view sequences taken from one scene.
+
+    Files are read from ``ROOT/<scene>/rgb/<name>.png``,
+    ``ROOT/<scene>/depth/<name>.npy`` and ``ROOT/<scene>/cam/<name>.npz``
+    (keys ``pose`` and ``intrinsics``).
+    """
+
+    def __init__(self, *args, ROOT, **kwargs):
+        """Store the dataset root, initialise the base class, index the scenes."""
+        self.ROOT = ROOT
+        self.video = True
+        self.is_metric = True
+        self.max_interval = 16
+        super().__init__(*args, **kwargs)
+
+        # _load_data() populates the index attributes and returns None; the
+        # attribute is kept only so existing readers keep working.
+        self.loaded_data = self._load_data()
+
+    def _load_data(self):
+        """Build the flat image index and the list of valid sequence starts.
+
+        Populates ``scenes`` (kept scene names), ``sceneids`` (image id ->
+        scene index), ``images`` (image id -> basename), ``scene_img_list``
+        (scene index -> its image ids) and ``start_img_ids``. Scenes with
+        fewer than ``cut_off`` frames are reported and skipped.
+        """
+        # os.listdir() returns entries in arbitrary order and may include stray
+        # files (e.g. .DS_Store): sort for a machine-independent index and keep
+        # directories only so listing "<entry>/rgb" cannot fail on a plain file.
+        scene_names = sorted(
+            d for d in os.listdir(self.ROOT) if osp.isdir(osp.join(self.ROOT, d))
+        )
+        # Minimum number of frames a scene needs; identical for every scene.
+        cut_off = (
+            self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
+        )
+
+        offset = 0
+        scenes = []
+        sceneids = []
+        scene_img_list = []
+        images = []
+        start_img_ids = []
+
+        for scene in tqdm(scene_names):
+            rgb_dir = osp.join(self.ROOT, scene, "rgb")
+            basenames = sorted(
+                f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".png")
+            )
+            num_imgs = len(basenames)
+            if num_imgs < cut_off:
+                print(f"Skipping {scene}")
+                continue
+
+            img_ids = list(np.arange(num_imgs) + offset)
+            # Only the first (num_imgs - cut_off + 1) frames may start a sequence.
+            start_img_ids.extend(img_ids[: num_imgs - cut_off + 1])
+            # len(scenes) is the index this scene receives once appended below.
+            sceneids.extend([len(scenes)] * num_imgs)
+            images.extend(basenames)
+            scenes.append(scene)
+            scene_img_list.append(img_ids)
+
+            # offset groups
+            offset += num_imgs
+
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.images = images
+        self.start_img_ids = start_img_ids
+        self.scene_img_list = scene_img_list
+
+    def __len__(self):
+        """Number of valid sequence start frames."""
+        return len(self.start_img_ids)
+
+    def get_image_num(self):
+        """Total number of indexed images across kept scenes."""
+        return len(self.images)
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        """Load ``num_views`` frames from the scene containing start ``idx``.
+
+        Frame positions come from the base-class ``get_seq_from_start_id``,
+        which also reports whether the result is an ordered video.
+        """
+        start_id = self.start_img_ids[idx]
+        all_image_ids = self.scene_img_list[self.sceneids[start_id]]
+        pos, ordered_video = self.get_seq_from_start_id(
+            num_views,
+            start_id,
+            all_image_ids,
+            rng,
+            max_interval=self.max_interval,
+            video_prob=1.0,
+            fix_interval_prob=1.0,
+        )
+        image_idxs = np.array(all_image_ids)[pos]
+
+        views = []
+        for v, view_idx in enumerate(image_idxs):
+            scene_id = self.sceneids[view_idx]
+            scene_dir = osp.join(self.ROOT, self.scenes[scene_id])
+            basename = self.images[view_idx]
+
+            # Load RGB image
+            rgb_image = imread_cv2(osp.join(scene_dir, "rgb", basename + ".png"))
+            # Load depthmap
+            depthmap = np.load(osp.join(scene_dir, "depth", basename + ".npy"))
+            depthmap[~np.isfinite(depthmap)] = 0  # invalid
+
+            # Read both arrays inside a context manager so the .npz archive is
+            # closed right away instead of waiting for garbage collection.
+            with np.load(osp.join(scene_dir, "cam", basename + ".npz")) as cam:
+                camera_pose = cam["pose"]
+                intrinsics = cam["intrinsics"]
+            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
+            )
+
+            # generate img mask and raymap mask
+            img_mask, ray_mask = self.get_img_and_ray_masks(
+                self.is_metric, v, rng, p=[0.85, 0.10, 0.05]
+            )
+
+            views.append(
+                dict(
+                    img=rgb_image,
+                    depthmap=depthmap.astype(np.float32),
+                    camera_pose=camera_pose.astype(np.float32),
+                    camera_intrinsics=intrinsics.astype(np.float32),
+                    dataset="spring",
+                    label=self.scenes[scene_id] + "_" + basename,
+                    instance=f"{str(idx)}_{str(view_idx)}",
+                    is_metric=self.is_metric,
+                    is_video=ordered_video,
+                    quantile=np.array(1.0, dtype=np.float32),
+                    img_mask=img_mask,
+                    ray_mask=ray_mask,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=False,
+                    reset=False,
+                )
+            )
+        assert len(views) == num_views
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/synscapes.py b/extern/CUT3R/src/dust3r/datasets/synscapes.py
new file mode 100644
index 0000000000000000000000000000000000000000..92f4fc8506558ec16f50b71d2feacc07ea2f3a18
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/synscapes.py
@@ -0,0 +1,85 @@
+import os.path as osp
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from tqdm import tqdm
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+
+
+class SynScapes(BaseMultiViewDataset):
+    """SynScapes images served as groups of unrelated single views.
+
+    Files are read from ``ROOT/rgb/<n>.png``, ``ROOT/depth/<n>.npy``,
+    ``ROOT/sky_mask/<n>.png`` and ``ROOT/cam/<n>.npz`` (key ``intrinsics``),
+    where ``<n>`` is an integer basename. No camera pose is loaded; every view
+    gets an identity placeholder.
+    """
+
+    def __init__(self, *args, ROOT, **kwargs):
+        """Store the dataset root, initialise the base class, index the images."""
+        self.ROOT = ROOT
+        self.video = False
+        self.is_metric = True
+        super().__init__(*args, **kwargs)
+        # _load_data() fills self.img_names and returns None; the attribute is
+        # kept only so existing readers of ``loaded_data`` keep working.
+        self.loaded_data = self._load_data()
+
+    def _load_data(self):
+        """Index the basenames of ``ROOT/rgb/*.png`` in ascending numeric order."""
+        rgb_dir = osp.join(self.ROOT, "rgb")
+        self.img_names = sorted(
+            (f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".png")),
+            key=int,
+        )
+
+    def __len__(self):
+        """Number of indexed images (one dataset item per image)."""
+        return len(self.img_names)
+
+    def get_image_num(self):
+        """Total number of indexed images."""
+        return len(self.img_names)
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        """Return ``num_views`` distinct, randomly drawn single-image views.
+
+        The images are drawn without replacement by a private generator seeded
+        from ``rng`` and ``idx``; ``rng`` itself is still the generator handed
+        to the crop/resize step.
+        """
+        new_seed = rng.integers(0, 2**32) + idx
+        new_rng = np.random.default_rng(new_seed)
+        img_names = new_rng.choice(self.img_names, num_views, replace=False)
+
+        views = []
+        for img_name in img_names:
+            # Load RGB image
+            rgb_image = imread_cv2(osp.join(self.ROOT, "rgb", f"{img_name}.png"))
+            depthmap = np.load(osp.join(self.ROOT, "depth", f"{img_name}.npy"))
+            sky_img = imread_cv2(osp.join(self.ROOT, "sky_mask", f"{img_name}.png"))
+            # Sky pixels (first mask channel >= 127) are tagged with -1.
+            depthmap[sky_img[..., 0] >= 127] = -1.0
+            # Non-finite depth and depth beyond 200 are treated as invalid (0).
+            depthmap = np.nan_to_num(depthmap, nan=0, posinf=0, neginf=0)
+            depthmap[depthmap > 200] = 0.0
+
+            # Read the intrinsics inside a context manager so the .npz archive
+            # is closed right away instead of waiting for garbage collection.
+            with np.load(osp.join(self.ROOT, "cam", f"{img_name}.npz")) as cam:
+                intrinsics = cam["intrinsics"]
+            # camera pose is not provided, placeholder
+            camera_pose = np.eye(4)
+
+            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=img_name
+            )
+
+            views.append(
+                dict(
+                    img=rgb_image,
+                    depthmap=depthmap.astype(np.float32),
+                    camera_pose=camera_pose.astype(np.float32),
+                    camera_intrinsics=intrinsics.astype(np.float32),
+                    dataset="synscapes",
+                    label=img_name,
+                    instance=f"{str(idx)}_{img_name}",
+                    is_metric=self.is_metric,
+                    is_video=False,
+                    quantile=np.array(1.0, dtype=np.float32),
+                    img_mask=True,
+                    ray_mask=False,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=True,
+                    reset=True,
+                )
+            )
+        assert len(views) == num_views
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/tartanair.py b/extern/CUT3R/src/dust3r/datasets/tartanair.py
new file mode 100644
index 0000000000000000000000000000000000000000..760d0e9d6921bb31354fbe505821b550d301f83a
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/tartanair.py
@@ -0,0 +1,164 @@
+import os.path as osp
+import numpy as np
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+
+
+class TartanAir_Multi(BaseMultiViewDataset):
+    """TartanAir dataset served as multi-view sequences.
+
+    Sequences live in ``ROOT/<scene>/{Easy,Hard}/<seq>/`` and each frame is
+    stored as ``<name>_rgb.png``, ``<name>_depth.npy`` and ``<name>_cam.npz``
+    (keys ``camera_intrinsics`` and ``camera_pose``).
+    """
+
+    def __init__(self, ROOT, *args, **kwargs):
+        """Store the dataset root, initialise the base class, index sequences."""
+        self.ROOT = ROOT
+        self.video = True
+        self.is_metric = True
+        self.max_interval = 20
+        super().__init__(*args, **kwargs)
+        # loading all
+        assert self.split is None
+        self._load_data()
+
+    def _load_data(self):
+        """Build the flat image index and the list of valid sequence starts.
+
+        Populates ``scenes`` (sequence directories), ``sceneids`` (image id ->
+        sequence index), ``images`` (image id -> basename), ``scene_img_list``
+        (sequence index -> its image ids) and ``start_img_ids``. Sequences
+        with fewer than ``cut_off`` frames are reported and skipped.
+        """
+        scene_dirs = sorted(
+            d
+            for d in os.listdir(self.ROOT)
+            if os.path.isdir(os.path.join(self.ROOT, d))
+        )
+        # Frames are identified by their RGB file; filter on the full suffix so
+        # the stripped basename is correct even if other .png files are present.
+        rgb_suffix = "_rgb.png"
+        # Minimum number of frames a sequence needs; identical for all of them.
+        cut_off = (
+            self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
+        )
+
+        offset = 0
+        scenes = []
+        sceneids = []
+        images = []
+        scene_img_list = []
+        start_img_ids = []
+
+        for scene in scene_dirs:
+            for mode in ["Easy", "Hard"]:
+                mode_dir = os.path.join(self.ROOT, scene, mode)
+                seq_dirs = sorted(
+                    os.path.join(mode_dir, d)
+                    for d in os.listdir(mode_dir)
+                    if os.path.isdir(os.path.join(mode_dir, d))
+                )
+                for seq_dir in seq_dirs:
+                    basenames = sorted(
+                        f[: -len(rgb_suffix)]
+                        for f in os.listdir(seq_dir)
+                        if f.endswith(rgb_suffix)
+                    )
+                    num_imgs = len(basenames)
+
+                    if num_imgs < cut_off:
+                        # Report the sequence that is actually being dropped.
+                        print(f"Skipping {seq_dir}")
+                        continue
+                    img_ids = list(np.arange(num_imgs) + offset)
+
+                    # len(scenes) is the index this sequence gets once appended.
+                    sceneids.extend([len(scenes)] * num_imgs)
+                    scenes.append(seq_dir)
+                    scene_img_list.append(img_ids)
+                    images.extend(basenames)
+                    # Only the first (num_imgs - cut_off + 1) frames may start
+                    # a sequence.
+                    start_img_ids.extend(img_ids[: num_imgs - cut_off + 1])
+                    offset += num_imgs
+
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.images = images
+        self.start_img_ids = start_img_ids
+        self.scene_img_list = scene_img_list
+
+    def __len__(self):
+        """Number of valid sequence start frames."""
+        return len(self.start_img_ids)
+
+    def get_image_num(self):
+        """Total number of indexed images across kept sequences."""
+        return len(self.images)
+
+    def get_stats(self):
+        """One-line summary of the dataset size."""
+        return f"{len(self)} groups of views"
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        """Load ``num_views`` frames from the sequence containing start ``idx``.
+
+        Frame positions come from the base-class ``get_seq_from_start_id``,
+        which also reports whether the result is an ordered video.
+        """
+        start_id = self.start_img_ids[idx]
+        scene_id = self.sceneids[start_id]
+        all_image_ids = self.scene_img_list[scene_id]
+        pos, ordered_video = self.get_seq_from_start_id(
+            num_views,
+            start_id,
+            all_image_ids,
+            rng,
+            max_interval=self.max_interval,
+            video_prob=0.8,
+            fix_interval_prob=0.8,
+            block_shuffle=16,
+        )
+        image_idxs = np.array(all_image_ids)[pos]
+
+        views = []
+
+        for v, view_idx in enumerate(image_idxs):
+            scene_id = self.sceneids[view_idx]
+            scene_dir = self.scenes[scene_id]
+            basename = self.images[view_idx]
+
+            img = basename + "_rgb.png"
+            image = imread_cv2(osp.join(scene_dir, img))
+            depthmap = np.load(osp.join(scene_dir, basename + "_depth.npy"))
+            # Read both arrays inside a context manager so the .npz archive is
+            # closed right away instead of waiting for garbage collection.
+            with np.load(osp.join(scene_dir, basename + "_cam.npz")) as camera_params:
+                intrinsics = camera_params["camera_intrinsics"]
+                camera_pose = camera_params["camera_pose"]
+
+            sky_mask = depthmap >= 1000
+            depthmap[sky_mask] = -1.0  # sky
+            depthmap = np.nan_to_num(depthmap, nan=0, posinf=0, neginf=0)
+            # Invalidate everything beyond the 98th percentile of valid depth.
+            valid_depth = depthmap[depthmap > 0]
+            threshold = np.percentile(valid_depth, 98) if valid_depth.size > 0 else 0
+            depthmap[depthmap > threshold] = 0.0
+
+            image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                image, depthmap, intrinsics, resolution, rng, info=(scene_dir, img)
+            )
+
+            # generate img mask and raymap mask
+            img_mask, ray_mask = self.get_img_and_ray_masks(
+                self.is_metric, v, rng, p=[0.75, 0.2, 0.05]
+            )
+
+            views.append(
+                dict(
+                    img=image,
+                    depthmap=depthmap,
+                    camera_pose=camera_pose,  # cam2world
+                    camera_intrinsics=intrinsics,
+                    dataset="TartanAir",
+                    label=scene_dir,
+                    is_metric=self.is_metric,
+                    instance=scene_dir + "_" + img,
+                    is_video=ordered_video,
+                    quantile=np.array(1.0, dtype=np.float32),
+                    img_mask=img_mask,
+                    ray_mask=ray_mask,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=False,
+                    reset=False,
+                )
+            )
+        assert len(views) == num_views
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/threedkb.py b/extern/CUT3R/src/dust3r/datasets/threedkb.py
new file mode 100644
index 0000000000000000000000000000000000000000..face09abd00f76cd62e7654b1b673e9d1d3394b7
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/threedkb.py
@@ -0,0 +1,111 @@
+import os.path as osp
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from tqdm import tqdm
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+
+
+class ThreeDKenBurns(BaseMultiViewDataset):
+    """3D Ken Burns images served as groups of unrelated single views.
+
+    Files are read from ``ROOT/<scene>/rgb/<name>.png``,
+    ``ROOT/<scene>/depth/<name>.exr`` and ``ROOT/<scene>/cam/<name>.npz``
+    (key ``intrinsics``). No camera pose is loaded; every view gets an
+    identity placeholder.
+    """
+
+    def __init__(self, *args, ROOT, **kwargs):
+        """Store the dataset root, initialise the base class, index the images."""
+        self.ROOT = ROOT
+        self.video = False
+        self.is_metric = False
+        super().__init__(*args, **kwargs)
+        # _load_data() populates the index attributes and returns None; the
+        # attribute is kept only so existing readers keep working.
+        self.loaded_data = self._load_data()
+
+    def _load_data(self):
+        """Build the flat image index over all scenes.
+
+        Populates ``scenes`` (scene names), ``sceneids`` (image id -> scene
+        index), ``images`` (image id -> basename) and ``img_ids``.
+        """
+        # os.listdir() returns entries in arbitrary order and may include stray
+        # files (e.g. .DS_Store): sort for a machine-independent index and keep
+        # directories only so listing "<entry>/rgb" cannot fail on a plain file.
+        scene_names = sorted(
+            d for d in os.listdir(self.ROOT) if osp.isdir(osp.join(self.ROOT, d))
+        )
+
+        offset = 0
+        scenes = []
+        sceneids = []
+        images = []
+        img_ids = []
+
+        for j, scene in enumerate(tqdm(scene_names)):
+            rgb_dir = osp.join(self.ROOT, scene, "rgb")
+            basenames = sorted(
+                f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".png")
+            )
+            num_imgs = len(basenames)
+
+            img_ids.extend(list(np.arange(num_imgs) + offset))
+            sceneids.extend([j] * num_imgs)
+            images.extend(basenames)
+            scenes.append(scene)
+
+            # offset groups
+            offset += num_imgs
+
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.images = images
+        self.img_ids = img_ids
+
+    def __len__(self):
+        """Number of indexed images (one dataset item per image)."""
+        return len(self.img_ids)
+
+    def get_image_num(self):
+        """Total number of indexed images."""
+        return len(self.images)
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        """Return ``num_views`` distinct, randomly drawn single-image views.
+
+        The images are drawn without replacement by a private generator seeded
+        from ``rng`` and ``idx``; ``rng`` itself is still the generator handed
+        to the crop/resize step.
+        """
+        new_seed = rng.integers(0, 2**32) + idx
+        new_rng = np.random.default_rng(new_seed)
+        image_idxs = new_rng.choice(self.img_ids, num_views, replace=False)
+
+        views = []
+        for view_idx in image_idxs:
+            scene_id = self.sceneids[view_idx]
+            scene_dir = osp.join(self.ROOT, self.scenes[scene_id])
+            basename = self.images[view_idx]
+
+            # Load RGB image
+            rgb_image = imread_cv2(osp.join(scene_dir, "rgb", basename + ".png"))
+            depthmap = imread_cv2(osp.join(scene_dir, "depth", basename + ".exr"))
+            # Raw values above 20000 are treated as invalid, then scaled by 1/1000.
+            depthmap[depthmap > 20000] = 0.0
+            depthmap = depthmap / 1000.0
+            # Read the intrinsics inside a context manager so the .npz archive
+            # is closed right away instead of waiting for garbage collection.
+            with np.load(osp.join(scene_dir, "cam", basename + ".npz")) as cam:
+                intrinsics = cam["intrinsics"]
+            # No pose is loaded for this dataset; use an identity placeholder.
+            camera_pose = np.eye(4)
+
+            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
+            )
+
+            views.append(
+                dict(
+                    img=rgb_image,
+                    depthmap=depthmap.astype(np.float32),
+                    camera_pose=camera_pose.astype(np.float32),
+                    camera_intrinsics=intrinsics.astype(np.float32),
+                    dataset="3DKenBurns",
+                    label=self.scenes[scene_id] + "_" + basename,
+                    instance=f"{str(idx)}_{str(view_idx)}",
+                    is_metric=self.is_metric,
+                    is_video=False,
+                    quantile=np.array(1.0, dtype=np.float32),
+                    img_mask=True,
+                    ray_mask=False,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=True,
+                    reset=True,
+                )
+            )
+        assert len(views) == num_views
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/uasol.py b/extern/CUT3R/src/dust3r/datasets/uasol.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91b43bdd6a27691ac5016b22c183ac300d219a9
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/uasol.py
@@ -0,0 +1,148 @@
+import os.path as osp
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from tqdm import tqdm
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+
+import re
+
+
+def extract_number(filename):
+    """Return the first run of decimal digits in *filename* as an int, or 0."""
+    found = re.search(r"\d+", filename)
+    return int(found.group()) if found else 0
+
+
+class UASOL_Multi(BaseMultiViewDataset):
+    """UASOL dataset served as multi-view sequences taken from one scene.
+
+    Files are read from ``ROOT/<scene>/rgb/<name>.png``,
+    ``ROOT/<scene>/depth/<name>.npy`` and ``ROOT/<scene>/cam/<name>.npz``
+    (keys ``pose`` and ``intrinsics``). Frames are ordered by the first
+    integer found in their basename.
+    """
+
+    def __init__(self, *args, ROOT, **kwargs):
+        """Store the dataset root, initialise the base class, index the scenes."""
+        self.ROOT = ROOT
+        self.video = True
+        self.is_metric = True
+        self.max_interval = 40
+        super().__init__(*args, **kwargs)
+        # _load_data() populates the index attributes and returns None; the
+        # attribute is kept only so existing readers keep working.
+        self.loaded_data = self._load_data()
+
+    def _load_data(self):
+        """Build the flat image index and the list of valid sequence starts.
+
+        Populates ``scenes`` (kept scene names), ``sceneids`` (image id ->
+        scene index), ``images`` (image id -> basename), ``scene_img_list``
+        (scene index -> its image ids) and ``start_img_ids``. Scenes with
+        fewer than ``cut_off`` frames are reported and skipped.
+        """
+        # os.listdir() returns entries in arbitrary order and may include stray
+        # files (e.g. .DS_Store): sort for a machine-independent index and keep
+        # directories only so listing "<entry>/rgb" cannot fail on a plain file.
+        scene_names = sorted(
+            d for d in os.listdir(self.ROOT) if osp.isdir(osp.join(self.ROOT, d))
+        )
+        # Minimum number of frames a scene needs; identical for every scene.
+        cut_off = (
+            self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
+        )
+
+        offset = 0
+        scenes = []
+        sceneids = []
+        scene_img_list = []
+        images = []
+        start_img_ids = []
+
+        for scene in tqdm(scene_names):
+            rgb_dir = osp.join(self.ROOT, scene, "rgb")
+            basenames = sorted(
+                (f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".png")),
+                key=extract_number,
+            )
+            num_imgs = len(basenames)
+            if num_imgs < cut_off:
+                print(f"Skipping {scene}")
+                continue
+
+            img_ids = list(np.arange(num_imgs) + offset)
+            # Only the first (num_imgs - cut_off + 1) frames may start a sequence.
+            start_img_ids.extend(img_ids[: num_imgs - cut_off + 1])
+            # len(scenes) is the index this scene receives once appended below.
+            sceneids.extend([len(scenes)] * num_imgs)
+            images.extend(basenames)
+            scenes.append(scene)
+            scene_img_list.append(img_ids)
+
+            # offset groups
+            offset += num_imgs
+
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.images = images
+        self.start_img_ids = start_img_ids
+        self.scene_img_list = scene_img_list
+
+    def __len__(self):
+        """Number of valid sequence start frames."""
+        return len(self.start_img_ids)
+
+    def get_image_num(self):
+        """Total number of indexed images across kept scenes."""
+        return len(self.images)
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        """Load ``num_views`` frames from the scene containing start ``idx``.
+
+        Frame positions come from the base-class ``get_seq_from_start_id``,
+        which also reports whether the result is an ordered video.
+        """
+        start_id = self.start_img_ids[idx]
+        all_image_ids = self.scene_img_list[self.sceneids[start_id]]
+        pos, ordered_video = self.get_seq_from_start_id(
+            num_views,
+            start_id,
+            all_image_ids,
+            rng,
+            max_interval=self.max_interval,
+            video_prob=0.75,
+            fix_interval_prob=0.75,
+        )
+        image_idxs = np.array(all_image_ids)[pos]
+
+        views = []
+        for v, view_idx in enumerate(image_idxs):
+            scene_id = self.sceneids[view_idx]
+            scene_dir = osp.join(self.ROOT, self.scenes[scene_id])
+            rgb_dir = osp.join(scene_dir, "rgb")
+            basename = self.images[view_idx]
+
+            # Load RGB image
+            rgb_image = imread_cv2(osp.join(rgb_dir, basename + ".png"))
+            # Load depthmap
+            depthmap = np.load(osp.join(scene_dir, "depth", basename + ".npy"))
+            depthmap[~np.isfinite(depthmap)] = 0  # invalid
+            depthmap[depthmap >= 20] = 0  # invalid
+
+            # Read both arrays inside a context manager so the .npz archive is
+            # closed right away instead of waiting for garbage collection.
+            with np.load(osp.join(scene_dir, "cam", basename + ".npz")) as cam:
+                camera_pose = cam["pose"]
+                intrinsics = cam["intrinsics"]
+            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
+            )
+
+            # generate img mask and raymap mask
+            img_mask, ray_mask = self.get_img_and_ray_masks(
+                self.is_metric, v, rng, p=[0.75, 0.2, 0.05]
+            )
+
+            views.append(
+                dict(
+                    img=rgb_image,
+                    depthmap=depthmap.astype(np.float32),
+                    camera_pose=camera_pose.astype(np.float32),
+                    camera_intrinsics=intrinsics.astype(np.float32),
+                    dataset="UASOL",
+                    label=self.scenes[scene_id] + "_" + basename,
+                    instance=osp.join(rgb_dir, basename + ".png"),
+                    is_metric=self.is_metric,
+                    is_video=ordered_video,
+                    quantile=np.array(0.9, dtype=np.float32),
+                    img_mask=img_mask,
+                    ray_mask=ray_mask,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=False,
+                    reset=False,
+                )
+            )
+        assert len(views) == num_views
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/unreal4k.py b/extern/CUT3R/src/dust3r/datasets/unreal4k.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d9092928daacf527c99e1958bbee85ef9110035
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/unreal4k.py
@@ -0,0 +1,159 @@
+import os.path as osp
+import numpy as np
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+
+# 4x4 permutation matrix: left-multiplying a 4x4 matrix by it swaps that
+# matrix's first two rows. UnReal4K_Multi._get_views applies it to every
+# loaded cam2world pose (``R_conv @ camera_pose``).
+R_conv = np.array([[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]).astype(
+    np.float32
+)
+
+
+class UnReal4K_Multi(BaseMultiViewDataset):
+    """UnReal4K dataset served as multi-view sequences.
+
+    Sequences live in ``ROOT/<scene>/{0,1}/`` and each frame is stored as
+    ``<name>_rgb.png``, ``<name>_depth.npy`` and ``<name>.npz`` (keys
+    ``intrinsics`` and ``cam2world``). The dataset length is ten times the
+    number of valid start frames; ``_get_views`` maps an item index back to
+    its start frame with ``idx // 10``.
+    """
+
+    def __init__(self, ROOT, *args, **kwargs):
+        """Store the dataset root, initialise the base class, index sequences."""
+        self.ROOT = ROOT
+        self.max_interval = 2
+        self.is_metric = True
+        super().__init__(*args, **kwargs)
+        # loading all
+        assert self.split is None
+        self._load_data()
+
+    def _load_data(self):
+        """Build the flat image index and the list of valid sequence starts.
+
+        Populates ``scenes`` (sequence directories), ``sceneids`` (image id ->
+        sequence index), ``images`` (image id -> basename), ``scene_img_list``
+        (sequence index -> its image ids) and ``start_img_ids``. Sequences
+        with fewer than ``cut_off`` frames are reported and skipped.
+        """
+        scene_dirs = sorted(
+            d
+            for d in os.listdir(self.ROOT)
+            if os.path.isdir(os.path.join(self.ROOT, d))
+        )
+        # Frames are identified by their RGB file; filter on the full suffix so
+        # the stripped basename is correct even if other .png files are present.
+        rgb_suffix = "_rgb.png"
+        # Minimum number of frames a sequence needs; identical for all of them.
+        cut_off = (
+            self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
+        )
+
+        offset = 0
+        scenes = []
+        sceneids = []
+        images = []
+        start_img_ids = []
+        scene_img_list = []
+
+        seq_dirs = sorted(
+            os.path.join(self.ROOT, scene, mode)
+            for scene in scene_dirs
+            for mode in ["0", "1"]
+        )
+        for seq_dir in seq_dirs:
+            basenames = sorted(
+                f[: -len(rgb_suffix)]
+                for f in os.listdir(seq_dir)
+                if f.endswith(rgb_suffix)
+            )
+            num_imgs = len(basenames)
+            if num_imgs < cut_off:
+                print(f"Skipping {seq_dir}")
+                continue
+
+            img_ids = list(np.arange(num_imgs) + offset)
+            # Only the first (num_imgs - cut_off + 1) frames may start a sequence.
+            start_img_ids.extend(img_ids[: num_imgs - cut_off + 1])
+            # len(scenes) is the index this sequence receives once appended.
+            sceneids.extend([len(scenes)] * num_imgs)
+            images.extend(basenames)
+            scenes.append(seq_dir)
+            scene_img_list.append(img_ids)
+
+            # offset groups
+            offset += num_imgs
+
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.images = images
+        self.start_img_ids = start_img_ids
+        self.scene_img_list = scene_img_list
+
+    def __len__(self):
+        """Ten dataset items per valid sequence start frame."""
+        return len(self.start_img_ids) * 10
+
+    def get_image_num(self):
+        """Total number of indexed images across kept sequences."""
+        return len(self.images)
+
+    def get_stats(self):
+        """One-line summary counting distinct start frames (length // 10)."""
+        return f"{len(self)//10} groups of views"
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        """Load ``num_views`` frames from the sequence selected by ``idx // 10``.
+
+        Frame positions come from the base-class ``get_seq_from_start_id``,
+        which also reports whether the result is an ordered video.
+        """
+        # Ten consecutive item indices share one start frame (see __len__).
+        idx = idx // 10
+        start_id = self.start_img_ids[idx]
+        all_image_ids = self.scene_img_list[self.sceneids[start_id]]
+        pos, ordered_video = self.get_seq_from_start_id(
+            num_views, start_id, all_image_ids, rng, max_interval=self.max_interval
+        )
+        image_idxs = np.array(all_image_ids)[pos]
+
+        views = []
+
+        for v, view_idx in enumerate(image_idxs):
+            scene_id = self.sceneids[view_idx]
+            scene_dir = self.scenes[scene_id]
+            basename = self.images[view_idx]
+
+            img = basename + "_rgb.png"
+            image = imread_cv2(osp.join(scene_dir, img))
+            depthmap = np.load(osp.join(scene_dir, basename + "_depth.npy"))
+            # Read both arrays inside a context manager so the .npz archive is
+            # closed right away instead of waiting for garbage collection.
+            with np.load(osp.join(scene_dir, basename + ".npz")) as camera_params:
+                intrinsics = camera_params["intrinsics"].astype(np.float32)
+                camera_pose = camera_params["cam2world"].astype(np.float32)
+
+            # Left-multiply by the module-level R_conv matrix.
+            camera_pose = R_conv @ camera_pose
+
+            sky_mask = depthmap >= 1000
+            depthmap[sky_mask] = -1.0  # sky
+            # Invalidate everything beyond the 98th percentile of valid depth.
+            valid_depth = depthmap[depthmap > 0]
+            threshold = np.percentile(valid_depth, 98) if valid_depth.size > 0 else 0
+            depthmap[depthmap > threshold] = 0.0
+            image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                image, depthmap, intrinsics, resolution, rng, info=(scene_dir, img)
+            )
+
+            # generate img mask and raymap mask
+            img_mask, ray_mask = self.get_img_and_ray_masks(
+                self.is_metric, v, rng, p=[0.75, 0.2, 0.05]
+            )
+
+            views.append(
+                dict(
+                    img=image,
+                    depthmap=depthmap,
+                    camera_pose=camera_pose,  # cam2world
+                    camera_intrinsics=intrinsics,
+                    dataset="UnReal4K",
+                    label=scene_dir,
+                    is_metric=self.is_metric,
+                    instance=scene_dir + "_" + img,
+                    is_video=ordered_video,
+                    quantile=np.array(1.0, dtype=np.float32),
+                    img_mask=img_mask,
+                    ray_mask=ray_mask,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=False,
+                    reset=False,
+                )
+            )
+        assert len(views) == num_views
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/urbansyn.py b/extern/CUT3R/src/dust3r/datasets/urbansyn.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3654a1200fffc1ae1c23483c752e06452f91310
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/urbansyn.py
@@ -0,0 +1,82 @@
+import os.path as osp
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from tqdm import tqdm
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+
+
+class UrbanSyn(BaseMultiViewDataset):
+    def __init__(self, *args, ROOT, **kwargs):
+        self.ROOT = ROOT
+        self.video = False
+        self.is_metric = True
+        super().__init__(*args, **kwargs)
+        self.loaded_data = self._load_data()
+
+    def _load_data(self):
+        rgb_dir = osp.join(self.ROOT, "rgb")
+        basenames = sorted([f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".png")])
+        self.img_names = basenames
+
+    def __len__(self):
+        return len(self.img_names)
+
+    def get_image_num(self):
+        return len(self.img_names)
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        new_seed = rng.integers(0, 2**32) + idx
+        new_rng = np.random.default_rng(new_seed)
+        img_names = new_rng.choice(self.img_names, num_views, replace=False)
+
+        views = []
+        for img_name in img_names:
+            # Load RGB image
+            rgb_image = imread_cv2(osp.join(self.ROOT, "rgb", f"{img_name}.png"))
+            depthmap = np.load(osp.join(self.ROOT, "depth", f"{img_name}.npy"))
+            sky_mask = (
+                imread_cv2(osp.join(self.ROOT, "sky_mask", f"{img_name}.png"))[..., 0]
+                >= 127
+            )
+            depthmap[sky_mask] = -1.0
+            depthmap = np.nan_to_num(depthmap, nan=0, posinf=0, neginf=0)
+            depthmap[depthmap > 200] = 0.0
+
+            intrinsics = np.load(osp.join(self.ROOT, "cam", f"{img_name}.npz"))[
+                "intrinsics"
+            ]
+            # camera pose is not provided, placeholder
+            camera_pose = np.eye(4)
+
+            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=img_name
+            )
+
+            views.append(
+                dict(
+                    img=rgb_image,
+                    depthmap=depthmap.astype(np.float32),
+                    camera_pose=camera_pose.astype(np.float32),
+                    camera_intrinsics=intrinsics.astype(np.float32),
+                    dataset="urbansyn",
+                    label=img_name,
+                    instance=f"{str(idx)}_{img_name}",
+                    is_metric=self.is_metric,
+                    is_video=False,
+                    quantile=np.array(1.0, dtype=np.float32),
+                    img_mask=True,
+                    ray_mask=False,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=True,
+                    reset=True,
+                )
+            )
+        assert len(views) == num_views
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/utils/__init__.py b/extern/CUT3R/src/dust3r/datasets/utils/__init__.py
new file mode 100755
index 0000000000000000000000000000000000000000..a32692113d830ddc4af4e6ed608f222fbe062e6e
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/utils/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
diff --git a/extern/CUT3R/src/dust3r/datasets/utils/corr.py b/extern/CUT3R/src/dust3r/datasets/utils/corr.py
new file mode 100755
index 0000000000000000000000000000000000000000..a0413d4cc035f21acd9b02fb2bccebe36ab57736
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/utils/corr.py
@@ -0,0 +1,129 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# modified from DUSt3R
+
+import numpy as np
+from dust3r.utils.device import to_numpy
+from dust3r.utils.geometry import inv, geotrf
+
+
+def reproject_view(pts3d, view2):
+    """Reproject `pts3d` into `view2`, using its intrinsics and inverted pose."""
+    target_shape = view2["pts3d"].shape[:2]
+    K, world2cam = view2["camera_intrinsics"], inv(view2["camera_pose"])
+    return reproject(pts3d, K, world2cam, target_shape)
+
+
+def reproject(pts3d, K, world2cam, shape):
+    """Project an (H, W, 3) point map with `K @ world2cam[:3]` onto a `shape` grid.
+
+    Returns the source size (H, W) and the flattened pixel index of every point.
+    """
+    src_h, src_w, n_coords = pts3d.shape
+    assert n_coords == 3
+    with np.errstate(divide="ignore", invalid="ignore"):  # silence projection warnings
+        pixels = geotrf(K @ world2cam[:3], pts3d, norm=1, ncol=2)
+    return (src_h, src_w), ravel_xy(pixels, shape)
+
+
+def ravel_xy(pos, shape):
+    """Round (x, y) positions to pixels, clamp to the image, flatten as x + W*y."""
+    height, width = shape
+    with np.errstate(invalid="ignore"):  # silence invalid-value warnings
+        cols, rows = pos.reshape(-1, 2).round().astype(np.int32).T
+    cols = np.clip(cols, 0, width - 1)
+    rows = np.clip(rows, 0, height - 1)
+    return cols + width * rows
+
+
+def unravel_xy(pos, shape):
+    """Convert flat x + W*y indices back to an (N, 2) array of (x, y) coordinates."""
+    return np.stack(np.unravel_index(pos, shape)[::-1], axis=-1)
+
+
+def reciprocal_1d(corres_1_to_2, corres_2_to_1, ret_recip=False):
+    """Return (pos1, pos2) mutual matches; with ret_recip, prepend the boolean mask."""
+    round_trip = corres_2_to_1[corres_1_to_2]
+    is_reciprocal1 = round_trip == np.arange(len(corres_1_to_2))
+    pos1 = np.flatnonzero(is_reciprocal1)
+    pos2 = corres_1_to_2[pos1]
+    return (is_reciprocal1, pos1, pos2) if ret_recip else (pos1, pos2)
+
+
+def extract_correspondences_from_pts3d(
+    view1, view2, target_n_corres, rng=np.random, ret_xy=True, nneg=0
+):
+    view1, view2 = to_numpy((view1, view2))
+    # project pixels from image1 --> 3d points --> image2 pixels
+    shape1, corres1_to_2 = reproject_view(view1["pts3d"], view2)
+    shape2, corres2_to_1 = reproject_view(view2["pts3d"], view1)
+
+    # compute reciprocal correspondences:
+    # pos1 == valid pixels (correspondences) in image1
+    is_reciprocal1, pos1, pos2 = reciprocal_1d(
+        corres1_to_2, corres2_to_1, ret_recip=True
+    )
+    is_reciprocal2 = corres1_to_2[corres2_to_1] == np.arange(len(corres2_to_1))
+
+    if target_n_corres is None:
+        if ret_xy:
+            pos1 = unravel_xy(pos1, shape1)
+            pos2 = unravel_xy(pos2, shape2)
+        return pos1, pos2
+
+    available_negatives = min((~is_reciprocal1).sum(), (~is_reciprocal2).sum())
+    target_n_positives = int(target_n_corres * (1 - nneg))
+    n_positives = min(len(pos1), target_n_positives)
+    n_negatives = min(target_n_corres - n_positives, available_negatives)
+
+    if n_negatives + n_positives != target_n_corres:
+        # should be really rare => when there are not enough negatives
+        # in that case, break nneg and add a few more positives ?
+        n_positives = target_n_corres - n_negatives
+        assert n_positives <= len(pos1)
+
+    assert n_positives <= len(pos1)
+    assert n_positives <= len(pos2)
+    assert n_negatives <= (~is_reciprocal1).sum()
+    assert n_negatives <= (~is_reciprocal2).sum()
+    assert n_positives + n_negatives == target_n_corres
+
+    valid = np.ones(n_positives, dtype=bool)
+    if n_positives < len(pos1):
+        # random sub-sampling of valid correspondences
+        perm = rng.permutation(len(pos1))[:n_positives]
+        pos1 = pos1[perm]
+        pos2 = pos2[perm]
+
+    if n_negatives > 0:
+        # add false correspondences if not enough
+        def norm(p):
+            return p / p.sum()
+
+        pos1 = np.r_[
+            pos1,
+            rng.choice(
+                shape1[0] * shape1[1],
+                size=n_negatives,
+                replace=False,
+                p=norm(~is_reciprocal1),
+            ),
+        ]
+        pos2 = np.r_[
+            pos2,
+            rng.choice(
+                shape2[0] * shape2[1],
+                size=n_negatives,
+                replace=False,
+                p=norm(~is_reciprocal2),
+            ),
+        ]
+        valid = np.r_[valid, np.zeros(n_negatives, dtype=bool)]
+
+    # convert (x+W*y) back to 2d (x,y) coordinates
+    if ret_xy:
+        pos1 = unravel_xy(pos1, shape1)
+        pos2 = unravel_xy(pos2, shape2)
+    return pos1, pos2, valid
diff --git a/extern/CUT3R/src/dust3r/datasets/utils/cropping.py b/extern/CUT3R/src/dust3r/datasets/utils/cropping.py
new file mode 100755
index 0000000000000000000000000000000000000000..6074f0d93b54ef5af36189276e0f179825a525fe
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/utils/cropping.py
@@ -0,0 +1,147 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# cropping utilities
+# --------------------------------------------------------
+import PIL.Image
+import os
+
+os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
+import cv2  # noqa
+import numpy as np  # noqa
+from dust3r.utils.geometry import (
+    colmap_to_opencv_intrinsics,
+    opencv_to_colmap_intrinsics,
+)  # noqa
+
+try:
+    lanczos = PIL.Image.Resampling.LANCZOS
+    bicubic = PIL.Image.Resampling.BICUBIC
+except AttributeError:
+    lanczos = PIL.Image.LANCZOS
+    bicubic = PIL.Image.BICUBIC
+
+
+class ImageList:
+    """Convenience class to apply the same operation to a whole set of images."""
+
+    def __init__(self, images):
+        if not isinstance(images, (tuple, list, set)):
+            images = [images]  # wrap a single image so it is handled like a batch
+        self.images = []
+        for image in images:
+            if not isinstance(image, PIL.Image.Image):
+                image = PIL.Image.fromarray(image)  # non-PIL inputs are converted
+            self.images.append(image)
+
+    def __len__(self):
+        return len(self.images)
+
+    def to_pil(self):
+        return tuple(self.images) if len(self.images) > 1 else self.images[0]
+
+    @property
+    def size(self):
+        sizes = [im.size for im in self.images]
+        assert all(sizes[0] == s for s in sizes)  # every image must share one size
+        return sizes[0]
+
+    def resize(self, *args, **kwargs):
+        return ImageList(self._dispatch("resize", *args, **kwargs))
+
+    def crop(self, *args, **kwargs):
+        return ImageList(self._dispatch("crop", *args, **kwargs))
+
+    def _dispatch(self, func, *args, **kwargs):
+        return [getattr(im, func)(*args, **kwargs) for im in self.images]
+
+
+def rescale_image_depthmap(
+    image, depthmap, camera_intrinsics, output_resolution, force=True
+):
+    """Jointly rescale a (image, depthmap)
+    so that (out_width, out_height) >= output_res
+    """
+    image = ImageList(image)
+    input_resolution = np.array(image.size)  # (W,H)
+    output_resolution = np.array(output_resolution)
+    if depthmap is not None:
+        # can also use this with masks instead of depthmaps
+        assert tuple(depthmap.shape[:2]) == image.size[::-1]
+
+    # define output resolution
+    assert output_resolution.shape == (2,)
+    scale_final = max(output_resolution / image.size) + 1e-8
+    if scale_final >= 1 and not force:  # image is already smaller than what is asked
+        return (image.to_pil(), depthmap, camera_intrinsics)
+    output_resolution = np.floor(input_resolution * scale_final).astype(int)
+
+    # first rescale the image so that it contains the crop
+    image = image.resize(
+        output_resolution, resample=lanczos if scale_final < 1 else bicubic
+    )
+    if depthmap is not None:
+        depthmap = cv2.resize(
+            depthmap,
+            output_resolution,
+            fx=scale_final,
+            fy=scale_final,
+            interpolation=cv2.INTER_NEAREST,
+        )
+
+    # no offset here; simple rescaling
+    camera_intrinsics = camera_matrix_of_crop(
+        camera_intrinsics, input_resolution, output_resolution, scaling=scale_final
+    )
+
+    return image.to_pil(), depthmap, camera_intrinsics
+
+
+def camera_matrix_of_crop(
+    input_camera_matrix,
+    input_resolution,
+    output_resolution,
+    scaling=1,
+    offset_factor=0.5,
+    offset=None,
+):
+    # Margins to offset the origin
+    margins = np.asarray(input_resolution) * scaling - output_resolution
+    assert np.all(margins >= 0.0)
+    if offset is None:
+        offset = offset_factor * margins
+
+    # Generate new camera parameters
+    output_camera_matrix_colmap = opencv_to_colmap_intrinsics(input_camera_matrix)
+    output_camera_matrix_colmap[:2, :] *= scaling
+    output_camera_matrix_colmap[:2, 2] -= offset
+    output_camera_matrix = colmap_to_opencv_intrinsics(output_camera_matrix_colmap)
+
+    return output_camera_matrix
+
+
+def crop_image_depthmap(image, depthmap, camera_intrinsics, crop_bbox):
+    """Crop image(s) and depthmap to `crop_bbox` and shift the principal point.
+
+    `crop_bbox` is (left, top, right, bottom) in pixels. The intrinsics are
+    copied, so the caller's matrix is left untouched.
+    """
+    left, top, right, bottom = crop_bbox
+    cropped = ImageList(image).crop((left, top, right, bottom))
+
+    # moving the crop origin to (left, top) moves the principal point with it
+    shifted_K = camera_intrinsics.copy()
+    shifted_K[0, 2] -= left
+    shifted_K[1, 2] -= top
+
+    return cropped.to_pil(), depthmap[top:bottom, left:right], shifted_K
+
+
+def bbox_from_intrinsics_in_out(
+    input_camera_matrix, output_camera_matrix, output_resolution
+):
+    out_width, out_height = output_resolution
+    l, t = np.int32(np.round(input_camera_matrix[:2, 2] - output_camera_matrix[:2, 2]))
+    crop_bbox = (l, t, l + out_width, t + out_height)
+    return crop_bbox
diff --git a/extern/CUT3R/src/dust3r/datasets/utils/transforms.py b/extern/CUT3R/src/dust3r/datasets/utils/transforms.py
new file mode 100755
index 0000000000000000000000000000000000000000..39a4450e57e3482315e307e72c0f3b19e77dea3b
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/utils/transforms.py
@@ -0,0 +1,80 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# DUST3R default transforms
+# --------------------------------------------------------
+import torchvision.transforms as tvf
+from dust3r.utils.image import ImgNorm
+
+# define the standard image transforms
+ColorJitter = tvf.Compose([tvf.ColorJitter(0.5, 0.5, 0.5, 0.1), ImgNorm])
+
+
+def _check_input(value, center=1, bound=(0, float("inf")), clip_first_on_zero=True):
+    """Normalize a jitter strength into a (min, max) range, or None for a no-op.
+
+    A non-negative number ``v`` becomes ``[center - v, center + v]`` (lower end
+    clipped to 0 when ``clip_first_on_zero``); a 2-element list/tuple is used as is.
+    Raises ValueError/TypeError on negative, out-of-bound or malformed input.
+    """
+    if isinstance(value, (int, float)):
+        if value < 0:
+            raise ValueError("If value is a single number, it must be non negative.")
+        value = [center - float(value), center + float(value)]
+        if clip_first_on_zero:
+            value[0] = max(value[0], 0.0)
+    elif isinstance(value, (tuple, list)) and len(value) == 2:
+        value = [float(value[0]), float(value[1])]
+    else:
+        raise TypeError("value should be a single number or a list/tuple with length 2.")
+    if not bound[0] <= value[0] <= value[1] <= bound[1]:
+        raise ValueError(f"values should be between {bound}, but got {value}.")
+    # a degenerate range sitting exactly on `center` (e.g. strength 0) means "do nothing"
+    return None if value[0] == value[1] == center else tuple(value)
+
+
+import torch
+import torchvision.transforms.functional as F
+
+
+def SeqColorJitter():
+    """
+    Return a color jitter transform with same random parameters
+    """
+    brightness = _check_input(0.5)
+    contrast = _check_input(0.5)
+    saturation = _check_input(0.5)
+    hue = _check_input(0.1, center=0, bound=(-0.5, 0.5), clip_first_on_zero=False)
+
+    fn_idx = torch.randperm(4)
+    brightness_factor = (
+        None
+        if brightness is None
+        else float(torch.empty(1).uniform_(brightness[0], brightness[1]))
+    )
+    contrast_factor = (
+        None
+        if contrast is None
+        else float(torch.empty(1).uniform_(contrast[0], contrast[1]))
+    )
+    saturation_factor = (
+        None
+        if saturation is None
+        else float(torch.empty(1).uniform_(saturation[0], saturation[1]))
+    )
+    hue_factor = None if hue is None else float(torch.empty(1).uniform_(hue[0], hue[1]))
+
+    def _color_jitter(img):
+        for fn_id in fn_idx:
+            if fn_id == 0 and brightness_factor is not None:
+                img = F.adjust_brightness(img, brightness_factor)
+            elif fn_id == 1 and contrast_factor is not None:
+                img = F.adjust_contrast(img, contrast_factor)
+            elif fn_id == 2 and saturation_factor is not None:
+                img = F.adjust_saturation(img, saturation_factor)
+            elif fn_id == 3 and hue_factor is not None:
+                img = F.adjust_hue(img, hue_factor)
+        return ImgNorm(img)
+
+    return _color_jitter
diff --git a/extern/CUT3R/src/dust3r/datasets/vkitti2.py b/extern/CUT3R/src/dust3r/datasets/vkitti2.py
new file mode 100755
index 0000000000000000000000000000000000000000..438e24f425fdb610b870c4d7b7f02b66ce8e3246
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/vkitti2.py
@@ -0,0 +1,169 @@
+import os.path as osp
+import numpy as np
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+
+
+class VirtualKITTI2_Multi(BaseMultiViewDataset):
+
+    def __init__(self, ROOT, *args, **kwargs):
+        self.ROOT = ROOT
+        self.video = True
+        self.is_metric = True
+        self.max_interval = 5
+        super().__init__(*args, **kwargs)
+        # loading all
+        self._load_data(self.split)
+
+    def _load_data(self, split=None):
+        """Index every <scene>/<sequence>/<camera> image folder under ROOT.
+
+        With split "train" all but the last scene (sorted by name) are kept, with
+        "test" only the last one. Fills self.scenes, self.sceneids, self.images,
+        self.start_img_ids and self.scene_img_list.
+        """
+        scene_dirs = sorted(
+            d
+            for d in os.listdir(self.ROOT)
+            if os.path.isdir(os.path.join(self.ROOT, d))
+        )
+        if split == "train":
+            scene_dirs = scene_dirs[:-1]
+        elif split == "test":
+            scene_dirs = scene_dirs[-1:]
+        seq_dirs = []
+        for scene in scene_dirs:
+            seq_dirs += sorted(
+                os.path.join(scene, d)
+                for d in os.listdir(os.path.join(self.ROOT, scene))
+                if os.path.isdir(os.path.join(self.ROOT, scene, d))
+            )
+        # minimum number of frames a camera stream needs to yield one sample
+        cut_off = (
+            self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
+        )
+        offset = 0
+        scenes = []
+        sceneids = []
+        images = []
+        scene_img_list = []
+        start_img_ids = []
+        j = 0
+
+        for seq in seq_dirs:
+            seq_path = osp.join(self.ROOT, seq)
+            for cam in ["Camera_0", "Camera_1"]:
+                basenames = sorted(
+                    f[:5]
+                    for f in os.listdir(seq_path + "/" + cam)
+                    if f.endswith(".jpg")
+                )
+                num_imgs = len(basenames)
+                if num_imgs < cut_off:
+                    # name the stream being dropped, not the last scene listed
+                    print(f"Skipping {seq}/{cam}")
+                    continue
+                img_ids = list(np.arange(num_imgs) + offset)
+                start_img_ids_ = img_ids[: num_imgs - cut_off + 1]
+
+                scenes.append(seq + "/" + cam)
+                scene_img_list.append(img_ids)
+                sceneids.extend([j] * num_imgs)
+                images.extend(basenames)
+                start_img_ids.extend(start_img_ids_)
+                offset += num_imgs
+                j += 1
+
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.images = images
+        self.start_img_ids = start_img_ids
+        self.scene_img_list = scene_img_list
+
+    def __len__(self):
+        return len(self.start_img_ids)
+
+    def get_image_num(self):
+        return len(self.images)
+
+    def get_stats(self):
+        return f"{len(self)} groups of views"
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        start_id = self.start_img_ids[idx]
+        scene_id = self.sceneids[start_id]
+        all_image_ids = self.scene_img_list[scene_id]
+        pos, ordered_video = self.get_seq_from_start_id(
+            num_views,
+            start_id,
+            all_image_ids,
+            rng,
+            max_interval=self.max_interval,
+            video_prob=1.0,
+            fix_interval_prob=0.9,
+        )
+        image_idxs = np.array(all_image_ids)[pos]
+
+        views = []
+
+        for v, view_idx in enumerate(image_idxs):
+            scene_id = self.sceneids[view_idx]
+            scene_dir = osp.join(self.ROOT, self.scenes[scene_id])
+            basename = self.images[view_idx]
+
+            img = basename + "_rgb.jpg"
+            image = imread_cv2(osp.join(scene_dir, img))
+            depthmap = (
+                cv2.imread(
+                    osp.join(scene_dir, basename + "_depth.png"),
+                    cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH,
+                ).astype(np.float32)
+                / 100.0
+            )
+            camera_params = np.load(osp.join(scene_dir, basename + "_cam.npz"))
+
+            intrinsics = camera_params["camera_intrinsics"]
+            camera_pose = camera_params["camera_pose"]
+
+            sky_mask = depthmap >= 655
+            depthmap[sky_mask] = -1.0  # sky
+
+            image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                image, depthmap, intrinsics, resolution, rng, info=(scene_dir, img)
+            )
+
+            # generate img mask and raymap mask
+            img_mask, ray_mask = self.get_img_and_ray_masks(
+                self.is_metric, v, rng, p=[0.85, 0.1, 0.05]
+            )
+
+            views.append(
+                dict(
+                    img=image,
+                    depthmap=depthmap,
+                    camera_pose=camera_pose,  # cam2world
+                    camera_intrinsics=intrinsics,
+                    dataset="VirtualKITTI2",
+                    label=scene_dir,
+                    is_metric=self.is_metric,
+                    instance=scene_dir + "_" + img,
+                    is_video=ordered_video,
+                    quantile=np.array(1.0, dtype=np.float32),
+                    img_mask=img_mask,
+                    ray_mask=ray_mask,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=False,
+                    reset=False,
+                )
+            )
+        assert len(views) == num_views
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/waymo.py b/extern/CUT3R/src/dust3r/datasets/waymo.py
new file mode 100755
index 0000000000000000000000000000000000000000..b7f811f144c638b931cb99fd246702a0fa2d18e7
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/waymo.py
@@ -0,0 +1,178 @@
+import os.path as osp
+import os
+import numpy as np
+import sys
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+import h5py
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+
+
+class Waymo_Multi(BaseMultiViewDataset):
+    """Dataset of outdoor street scenes, 5 images each time"""
+
+    def __init__(self, *args, ROOT, **kwargs):
+        self.ROOT = ROOT
+        self.max_interval = 8
+        self.video = True
+        self.is_metric = True
+        super().__init__(*args, **kwargs)
+        assert self.split is None
+        self._load_data()
+
+    def load_invalid_dict(self, h5_file_path):
+        invalid_dict = {}
+        with h5py.File(h5_file_path, "r") as h5f:
+            for scene in h5f:
+                data = h5f[scene]["invalid_pairs"][:]
+                invalid_pairs = set(
+                    tuple(pair.decode("utf-8").split("_")) for pair in data
+                )
+                invalid_dict[scene] = invalid_pairs
+        return invalid_dict
+
+    def _load_data(self):
+        invalid_dict = self.load_invalid_dict(
+            os.path.join(self.ROOT, "invalid_files.h5")
+        )
+        scene_dirs = sorted(
+            [
+                d
+                for d in os.listdir(self.ROOT)
+                if os.path.isdir(os.path.join(self.ROOT, d))
+            ]
+        )
+        offset = 0
+        scenes = []
+        sceneids = []
+        images = []
+        start_img_ids = []
+        scene_img_list = []
+        is_video = []
+        j = 0
+
+        for scene in scene_dirs:
+            scene_dir = osp.join(self.ROOT, scene)
+            invalid_pairs = invalid_dict.get(scene, set())
+            seq2frames = {}
+            for f in os.listdir(scene_dir):
+                if not f.endswith(".jpg"):
+                    continue
+                basename = f[:-4]
+                frame_id = basename.split("_")[0]
+                seq_id = basename.split("_")[1]
+                if seq_id == "5":
+                    continue
+                if (seq_id, frame_id) in invalid_pairs:
+                    continue  # Skip invalid files
+                if seq_id not in seq2frames:
+                    seq2frames[seq_id] = []
+                seq2frames[seq_id].append(frame_id)
+
+            for seq_id, frame_ids in seq2frames.items():
+                frame_ids = sorted(frame_ids)
+                num_imgs = len(frame_ids)
+                img_ids = list(np.arange(num_imgs) + offset)
+                cut_off = (
+                    self.num_views
+                    if not self.allow_repeat
+                    else max(self.num_views // 3, 3)
+                )
+                start_img_ids_ = img_ids[: num_imgs - cut_off + 1]
+
+                if num_imgs < cut_off:
+                    print(f"Skipping {scene}_{seq_id}")
+                    continue
+
+                scenes.append((scene, seq_id))
+                sceneids.extend([j] * num_imgs)
+                images.extend(frame_ids)
+                start_img_ids.extend(start_img_ids_)
+                scene_img_list.append(img_ids)
+
+                offset += num_imgs
+                j += 1
+
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.images = images
+        self.start_img_ids = start_img_ids
+        self.scene_img_list = scene_img_list
+        self.is_video = is_video
+
+    def __len__(self):
+        return len(self.start_img_ids)
+
+    def get_image_num(self):
+        return len(self.images)
+
+    def get_stats(self):
+        return f"{len(self)} groups of views"
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        start_id = self.start_img_ids[idx]
+        all_image_ids = self.scene_img_list[self.sceneids[start_id]]
+        _, seq_id = self.scenes[self.sceneids[start_id]]
+        max_interval = self.max_interval // 2 if seq_id == "4" else self.max_interval
+        pos, ordered_video = self.get_seq_from_start_id(
+            num_views,
+            start_id,
+            all_image_ids,
+            rng,
+            max_interval=max_interval,
+            video_prob=0.9,
+            fix_interval_prob=0.9,
+            block_shuffle=16,
+        )
+        image_idxs = np.array(all_image_ids)[pos]
+        views = []
+        ordered_video = True
+
+        views = []
+
+        for v, view_idx in enumerate(image_idxs):
+            scene_id = self.sceneids[view_idx]
+            scene_dir, seq_id = self.scenes[scene_id]
+            scene_dir = osp.join(self.ROOT, scene_dir)
+            frame_id = self.images[view_idx]
+
+            impath = f"{frame_id}_{seq_id}"
+            image = imread_cv2(osp.join(scene_dir, impath + ".jpg"))
+            depthmap = imread_cv2(osp.join(scene_dir, impath + ".exr"))
+            camera_params = np.load(osp.join(scene_dir, impath + ".npz"))
+
+            intrinsics = np.float32(camera_params["intrinsics"])
+            camera_pose = np.float32(camera_params["cam2world"])
+
+            image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                image, depthmap, intrinsics, resolution, rng, info=(scene_dir, impath)
+            )
+
+            # generate img mask and raymap mask
+            img_mask, ray_mask = self.get_img_and_ray_masks(
+                self.is_metric, v, rng, p=[0.85, 0.10, 0.05]
+            )
+
+            views.append(
+                dict(
+                    img=image,
+                    depthmap=depthmap,
+                    camera_pose=camera_pose,  # cam2world
+                    camera_intrinsics=intrinsics,
+                    dataset="Waymo",
+                    label=osp.relpath(scene_dir, self.ROOT),
+                    is_metric=self.is_metric,
+                    instance=osp.join(scene_dir, impath + ".jpg"),
+                    is_video=ordered_video,
+                    quantile=np.array(0.98, dtype=np.float32),
+                    img_mask=img_mask,
+                    ray_mask=ray_mask,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=False,
+                    reset=False,
+                )
+            )
+
+        return views
diff --git a/extern/CUT3R/src/dust3r/datasets/wildrgbd.py b/extern/CUT3R/src/dust3r/datasets/wildrgbd.py
new file mode 100755
index 0000000000000000000000000000000000000000..9ba152e19b9dae9e3ddd254d632f19d779ccffbe
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/datasets/wildrgbd.py
@@ -0,0 +1,56 @@
+import os.path as osp
+import sys
+
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+import cv2
+import numpy as np
+
+from dust3r.datasets.co3d import Co3d_Multi
+from dust3r.utils.image import imread_cv2
+
+
+class WildRGBD_Multi(Co3d_Multi):
+    def __init__(self, mask_bg="rand", *args, ROOT, **kwargs):
+        super().__init__(mask_bg, *args, ROOT=ROOT, **kwargs)
+        self.dataset_label = "WildRGBD"
+        self.is_metric = True
+        # load all scenes
+        self.scenes.pop(("box", "scenes/scene_257"), None)
+        self.scene_list = list(self.scenes.keys())
+        cut_off = (
+            self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
+        )
+        self.cut_off = cut_off
+        self.all_ref_imgs = [
+            (key, value)
+            for key, values in self.scenes.items()
+            for value in values[: len(values) - cut_off + 1]
+        ]
+        self.invalidate = {scene: {} for scene in self.scene_list}
+        self.invalid_scenes = {scene: False for scene in self.scene_list}
+
+    def _get_metadatapath(self, obj, instance, view_idx):
+        return osp.join(self.ROOT, obj, instance, "metadata", f"{view_idx:0>5d}.npz")
+
+    def _get_impath(self, obj, instance, view_idx):
+        return osp.join(self.ROOT, obj, instance, "rgb", f"{view_idx:0>5d}.jpg")
+
+    def _get_depthpath(self, obj, instance, view_idx):
+        return osp.join(self.ROOT, obj, instance, "depth", f"{view_idx:0>5d}.png")
+
+    def _get_maskpath(self, obj, instance, view_idx):
+        return osp.join(self.ROOT, obj, instance, "masks", f"{view_idx:0>5d}.png")
+
+    def _read_depthmap(self, depthpath, input_metadata):
+        # We store depths in the depth scale of 1000.
+        # That is, when we load depth image and divide by 1000, we could get depth in meters.
+        depthmap = imread_cv2(depthpath, cv2.IMREAD_UNCHANGED)
+        depthmap = depthmap.astype(np.float32) / 1000.0
+        return depthmap
+
+    def _get_views(self, idx, resolution, rng, num_views):
+        views = super()._get_views(idx, resolution, rng, num_views)
+        for view in views:
+            assert view["is_metric"]
+            view["quantile"] = np.array(0.96, dtype=np.float32)
+        return views
diff --git a/extern/CUT3R/src/dust3r/heads/__init__.py b/extern/CUT3R/src/dust3r/heads/__init__.py
new file mode 100755
index 0000000000000000000000000000000000000000..75cfc19c494f4c9faa0c9235864541902c75f4f6
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/heads/__init__.py
@@ -0,0 +1,35 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# modified from DUSt3R
+
+from .linear_head import LinearPts3d, LinearPts3d_Desc, LinearPts3dPose
+from .dpt_head import DPTPts3dPose
+
+
+def head_factory(
+    head_type,
+    output_mode,
+    net,
+    has_conf=False,
+    has_depth=False,
+    has_rgb=False,
+    has_pose_conf=False,
+    has_pose=False,
+):
+    """Build a prediction head for the decoder.
+
+    Supported ``(head_type, output_mode)`` combinations:
+
+    * ``("linear", "pts3d")``          -> LinearPts3d
+    * ``("linear", "pts3d+pose")``     -> LinearPts3dPose
+    * ``("linear", "pts3d+desc<N>")``  -> LinearPts3d_Desc with ``N`` descriptor dims
+    * ``("dpt", "pts3d+pose")``        -> DPTPts3dPose
+
+    Raises:
+        NotImplementedError: for every other combination, including
+            ``("dpt", "pts3d")``.
+    """
+    desc_prefix = "pts3d+desc"
+
+    if head_type == "linear" and output_mode == "pts3d":
+        return LinearPts3d(net, has_conf, has_depth, has_rgb, has_pose_conf)
+    if head_type == "linear" and output_mode == "pts3d+pose":
+        return LinearPts3dPose(net, has_conf, has_rgb, has_pose)
+    if head_type == "linear" and output_mode.startswith(desc_prefix):
+        # the descriptor width is the integer that follows the prefix
+        local_feat_dim = int(output_mode[len(desc_prefix) :])
+        return LinearPts3d_Desc(net, has_conf, has_depth, local_feat_dim)
+    if head_type == "dpt" and output_mode == "pts3d":
+        # This combination is deliberately rejected. The former trailing
+        # `return create_dpt_head(...)` was unreachable and referenced a name
+        # that this module does not import, so it has been dropped.
+        raise NotImplementedError(f"unexpected {head_type=} and {output_mode=}")
+    if head_type == "dpt" and output_mode == "pts3d+pose":
+        return DPTPts3dPose(net, has_conf, has_rgb, has_pose)
+    raise NotImplementedError(f"unexpected {head_type=} and {output_mode=}")
diff --git a/extern/CUT3R/src/dust3r/heads/dpt_head.py b/extern/CUT3R/src/dust3r/heads/dpt_head.py
new file mode 100755
index 0000000000000000000000000000000000000000..11bd08ed69679c09770a728d98afb6a1b1bc1cf3
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/heads/dpt_head.py
@@ -0,0 +1,260 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# modified from DUSt3R
+
+from einops import rearrange
+from typing import List
+import torch
+import torch.nn as nn
+from dust3r.heads.postprocess import (
+    postprocess,
+    postprocess_desc,
+    postprocess_rgb,
+    postprocess_pose_conf,
+    postprocess_pose,
+    reg_dense_conf,
+)
+import dust3r.utils.path_to_croco  # noqa: F401
+from models.dpt_block import DPTOutputAdapter  # noqa
+from dust3r.utils.camera import pose_encoding_to_camera, PoseDecoder
+from dust3r.blocks import ConditionModulationBlock
+from torch.utils.checkpoint import checkpoint
+
+
+class DPTOutputAdapter_fix(DPTOutputAdapter):
+    """
+    Adapt croco's DPTOutputAdapter implementation for dust3r:
+    remove duplicated weights, and fix forward for dust3r
+    """
+
+    def init(self, dim_tokens_enc=768):
+        """Run the parent ``init`` and then delete the ``act_N_postprocess`` attributes."""
+        super().init(dim_tokens_enc)
+
+        # forward() below only goes through self.act_postprocess
+        for stage in (1, 2, 3, 4):
+            delattr(self, f"act_{stage}_postprocess")
+
+    def forward(self, encoder_tokens: List[torch.Tensor], image_size=None):
+        """Fuse the hooked token layers and return the output of ``self.head``.
+
+        ``image_size`` is an ``(H, W)`` pair; ``self.image_size`` is used when
+        it is None.
+        """
+        assert (
+            self.dim_tokens_enc is not None
+        ), "Need to call init(dim_tokens_enc) function first"
+
+        if image_size is None:
+            image_size = self.image_size
+        height, width = image_size
+
+        # size of the token grid
+        grid_h = height // (self.stride_level * self.P_H)
+        grid_w = width // (self.stride_level * self.P_W)
+
+        # pick the hooked layers, then run each stage over all of them in turn
+        feats = [encoder_tokens[hook] for hook in self.hooks]
+        feats = [self.adapt_tokens(feat) for feat in feats]
+        feats = [
+            rearrange(feat, "b (nh nw) c -> b c nh nw", nh=grid_h, nw=grid_w)
+            for feat in feats
+        ]
+        feats = [self.act_postprocess[pos](feat) for pos, feat in enumerate(feats)]
+        feats = [self.scratch.layer_rn[pos](feat) for pos, feat in enumerate(feats)]
+
+        # crop the refined deepest map to the spatial size of the third map
+        fused = self.scratch.refinenet4(feats[3])
+        fused = fused[:, :, : feats[2].shape[2], : feats[2].shape[3]]
+        fused = self.scratch.refinenet3(fused, feats[2])
+        fused = self.scratch.refinenet2(fused, feats[1])
+        fused = self.scratch.refinenet1(fused, feats[0])
+
+        return self.head(fused)
+
+
+class PixelwiseTaskWithDPT(nn.Module):
+    """DPT module for dust3r, can return 3D points + confidence for all pixels"""
+
+    def __init__(
+        self,
+        *,
+        n_cls_token=0,
+        hooks_idx=None,
+        dim_tokens=None,
+        output_width_ratio=1,
+        num_channels=1,
+        postprocess=None,
+        depth_mode=None,
+        conf_mode=None,
+        **kwargs
+    ):
+        """Create the wrapped DPT adapter; extra ``kwargs`` are forwarded to it."""
+        super().__init__()
+        # backbone needs to return all layers
+        self.return_all_layers = True
+        self.postprocess = postprocess
+        self.depth_mode = depth_mode
+        self.conf_mode = conf_mode
+
+        assert n_cls_token == 0, "Not implemented"
+        adapter_kwargs = {
+            "output_width_ratio": output_width_ratio,
+            "num_channels": num_channels,
+            **kwargs,
+        }
+        if hooks_idx is not None:
+            adapter_kwargs["hooks"] = hooks_idx
+        self.dpt = DPTOutputAdapter_fix(**adapter_kwargs)
+
+        # fall back to the adapter's own default when no token dims are given
+        if dim_tokens is None:
+            self.dpt.init()
+        else:
+            self.dpt.init(dim_tokens_enc=dim_tokens)
+
+    def forward(self, x, img_info):
+        """Run the adapter on ``x`` and apply ``self.postprocess`` when it is set."""
+        size = (img_info[0], img_info[1])
+        out = self.dpt(x, image_size=size)
+        if not self.postprocess:
+            return out
+        return self.postprocess(out, self.depth_mode, self.conf_mode)
+
+
+def create_dpt_head(net, has_conf=False):
+    """
+    Build a PixelwiseTaskWithDPT configured from the parameters of ``net``.
+
+    The head outputs 3 channels, plus one more when ``has_conf`` is set, and
+    hooks the decoder at indices 0, 2/4, 3/4 and 4/4 of ``net.dec_depth``.
+    """
+    assert net.dec_depth > 9
+    depth = net.dec_depth
+    enc_dim, dec_dim = net.enc_embed_dim, net.dec_embed_dim
+    width = 256
+    head_kwargs = dict(
+        num_channels=3 + has_conf,
+        feature_dim=width,
+        last_dim=width // 2,
+        hooks_idx=[0, depth * 2 // 4, depth * 3 // 4, depth],
+        dim_tokens=[enc_dim, dec_dim, dec_dim, dec_dim],
+        postprocess=postprocess,
+        depth_mode=net.depth_mode,
+        conf_mode=net.conf_mode,
+        head_type="regression",
+    )
+    return PixelwiseTaskWithDPT(**head_kwargs)
+
+
+class DPTPts3dPose(nn.Module):
+    """DPT head that predicts self-view points and, optionally, RGB and pose.
+
+    ``forward`` returns a dict with ``pts3d_in_self_view`` and ``conf_self``;
+    ``rgb`` is added when ``has_rgb`` is set, and ``camera_pose``,
+    ``pts3d_in_other_view`` and ``conf`` are added when ``has_pose`` is set.
+    """
+
+    def __init__(self, net, has_conf=False, has_rgb=False, has_pose=False):
+        super(DPTPts3dPose, self).__init__()
+        self.return_all_layers = True  # backbone needs to return all layers
+        self.depth_mode = net.depth_mode
+        self.conf_mode = net.conf_mode
+        self.pose_mode = net.pose_mode
+
+        self.has_conf = has_conf
+        self.has_rgb = has_rgb
+        self.has_pose = has_pose
+
+        # 3 point channels, plus one confidence channel when has_conf is set
+        pts_channels = 3 + has_conf
+        # evaluates to 0 when has_rgb is False
+        rgb_channels = has_rgb * 3
+        feature_dim = 256
+        last_dim = feature_dim // 2
+        ed = net.enc_embed_dim
+        dd = net.dec_embed_dim
+        hooks_idx = [0, 1, 2, 3]
+        dim_tokens = [ed, dd, dd, dd]
+        head_type = "regression"
+        output_width_ratio = 1
+
+        pts_dpt_args = dict(
+            output_width_ratio=output_width_ratio,
+            num_channels=pts_channels,
+            feature_dim=feature_dim,
+            last_dim=last_dim,
+            dim_tokens=dim_tokens,
+            hooks_idx=hooks_idx,
+            head_type=head_type,
+        )
+        rgb_dpt_args = dict(
+            output_width_ratio=output_width_ratio,
+            num_channels=rgb_channels,
+            feature_dim=feature_dim,
+            last_dim=last_dim,
+            dim_tokens=dim_tokens,
+            hooks_idx=hooks_idx,
+            head_type=head_type,
+        )
+        # hooks_idx is the literal list assigned above, so this branch always runs
+        if hooks_idx is not None:
+            pts_dpt_args.update(hooks=hooks_idx)
+            rgb_dpt_args.update(hooks=hooks_idx)
+
+        # adapter for the points expressed in the view's own frame
+        self.dpt_self = DPTOutputAdapter_fix(**pts_dpt_args)
+        # dim_tokens is never None here, so dim_tokens_enc is always passed
+        dpt_init_args = {} if dim_tokens is None else {"dim_tokens_enc": dim_tokens}
+        self.dpt_self.init(**dpt_init_args)
+
+        # two blocks that modulate the tokens with the pose token (see forward);
+        # they are created regardless of has_pose
+        self.final_transform = nn.ModuleList(
+            [
+                ConditionModulationBlock(
+                    net.dec_embed_dim,
+                    net.dec_num_heads,
+                    mlp_ratio=4.0,
+                    qkv_bias=True,
+                    rope=net.rope,
+                )
+                for _ in range(2)
+            ]
+        )
+
+        # second adapter, same configuration, fed with the modulated tokens
+        self.dpt_cross = DPTOutputAdapter_fix(**pts_dpt_args)
+        dpt_init_args = {} if dim_tokens is None else {"dim_tokens_enc": dim_tokens}
+        self.dpt_cross.init(**dpt_init_args)
+
+        if has_rgb:
+            self.dpt_rgb = DPTOutputAdapter_fix(**rgb_dpt_args)
+            dpt_init_args = {} if dim_tokens is None else {"dim_tokens_enc": dim_tokens}
+            self.dpt_rgb.init(**dpt_init_args)
+
+        if has_pose:
+            in_dim = net.dec_embed_dim
+            self.pose_head = PoseDecoder(hidden_size=in_dim)
+
+    def forward(self, x, img_info, **kwargs):
+        """Run the head on the list of layer outputs ``x``.
+
+        ``img_info[0]`` and ``img_info[1]`` are passed to the adapters as the
+        image size; ``kwargs["pos"]``, when given, is forwarded to the
+        modulation blocks.
+        """
+        if self.has_pose:
+            # index 0 along dim 1 of the last layer is used as the pose token,
+            # the remaining tokens are fed to the DPT adapters
+            pose_token = x[-1][:, 0].clone()
+            token = x[-1][:, 1:]
+            with torch.cuda.amp.autocast(enabled=False):
+                pose = self.pose_head(pose_token)
+
+            token_cross = token.clone()
+            for blk in self.final_transform:
+                token_cross = blk(token_cross, pose_token, kwargs.get("pos"))
+            # replace the last layer: plain tokens for the self-view branch,
+            # pose-modulated tokens for the cross-view branch
+            x = x[:-1] + [token]
+            x_cross = x[:-1] + [token_cross]
+
+        with torch.cuda.amp.autocast(enabled=False):
+            self_out = checkpoint(
+                self.dpt_self,
+                x,
+                image_size=(img_info[0], img_info[1]),
+                use_reentrant=False,
+            )
+
+            final_output = postprocess(self_out, self.depth_mode, self.conf_mode)
+            final_output["pts3d_in_self_view"] = final_output.pop("pts3d")
+            # NOTE(review): postprocess only emits "conf" when conf_mode is not
+            # None, so this pop raises KeyError otherwise
+            final_output["conf_self"] = final_output.pop("conf")
+
+            if self.has_rgb:
+                rgb_out = checkpoint(
+                    self.dpt_rgb,
+                    x,
+                    image_size=(img_info[0], img_info[1]),
+                    use_reentrant=False,
+                )
+                rgb_output = postprocess_rgb(rgb_out)
+                final_output.update(rgb_output)
+
+            if self.has_pose:
+                pose = postprocess_pose(pose, self.pose_mode)
+                final_output["camera_pose"] = pose  # B,7
+                # x_cross only exists when has_pose is set (see the top of forward)
+                cross_out = checkpoint(
+                    self.dpt_cross,
+                    x_cross,
+                    image_size=(img_info[0], img_info[1]),
+                    use_reentrant=False,
+                )
+                tmp = postprocess(cross_out, self.depth_mode, self.conf_mode)
+                final_output["pts3d_in_other_view"] = tmp.pop("pts3d")
+                final_output["conf"] = tmp.pop("conf")
+        return final_output
diff --git a/extern/CUT3R/src/dust3r/heads/linear_head.py b/extern/CUT3R/src/dust3r/heads/linear_head.py
new file mode 100755
index 0000000000000000000000000000000000000000..081cf21de8252c9ed51882cedf2ecae0c8364985
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/heads/linear_head.py
@@ -0,0 +1,346 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# modified from DUSt3R
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from dust3r.heads.postprocess import (
+    postprocess,
+    postprocess_desc,
+    postprocess_rgb,
+    postprocess_pose_conf,
+    postprocess_pose,
+    reg_dense_conf,
+)
+import dust3r.utils.path_to_croco  # noqa
+from models.blocks import Mlp  # noqa
+from dust3r.utils.geometry import geotrf
+from dust3r.utils.camera import pose_encoding_to_camera, PoseDecoder
+from dust3r.blocks import ConditionModulationBlock
+
+
+class LinearPts3d(nn.Module):
+    """
+    Linear head for dust3r
+    Each token outputs one patch_size x patch_size patch of 3D points
+    (+ confidence).
+
+    ``forward`` returns a dict with ``pts3d_in_other_view`` (and ``conf`` when
+    the confidence mode is set). Optional outputs: ``pts3d_in_self_view`` and
+    ``conf_self`` (``has_depth``), ``rgb`` (``has_rgb``) and ``pose_conf``
+    (``has_pose_conf``).
+    """
+
+    def __init__(
+        self, net, has_conf=False, has_depth=False, has_rgb=False, has_pose_conf=False
+    ):
+        super().__init__()
+        self.patch_size = net.patch_embed.patch_size[0]
+        self.depth_mode = net.depth_mode
+        self.conf_mode = net.conf_mode
+        self.has_conf = has_conf
+        self.has_rgb = has_rgb
+        self.has_pose_conf = has_pose_conf
+        self.has_depth = has_depth
+        self.proj = Mlp(
+            net.dec_embed_dim, out_features=(3 + has_conf) * self.patch_size**2
+        )
+        if has_depth:
+            self.self_proj = Mlp(
+                net.dec_embed_dim, out_features=(3 + has_conf) * self.patch_size**2
+            )
+        if has_rgb:
+            self.rgb_proj = Mlp(net.dec_embed_dim, out_features=3 * self.patch_size**2)
+        if has_pose_conf:
+            # forward() reads self.pose_conf_proj whenever has_pose_conf is set,
+            # but it was never created, so that path raised AttributeError.
+            # One output channel per pixel, matching postprocess_pose_conf.
+            self.pose_conf_proj = Mlp(
+                net.dec_embed_dim, out_features=self.patch_size**2
+            )
+
+    def setup(self, croconet):
+        pass
+
+    def _tokens_to_map(self, proj, tokens, img_shape):
+        """Project ``tokens`` with ``proj`` and unfold them into a B,C,H,W map."""
+        H, W = img_shape
+        B, _, _ = tokens.shape
+        feat = proj(tokens)
+        feat = feat.transpose(-1, -2).view(
+            B, -1, H // self.patch_size, W // self.patch_size
+        )
+        return F.pixel_shuffle(feat, self.patch_size)
+
+    def forward(self, decout, img_shape):
+        """Compute the head outputs from the last entry of ``decout``.
+
+        Args:
+            decout: sequence of decoder outputs; only ``decout[-1]`` is used.
+            img_shape: ``(H, W)`` of the image the tokens were computed from.
+
+        Returns:
+            dict of output tensors, see the class docstring.
+        """
+        tokens = decout[-1]
+
+        feat = self._tokens_to_map(self.proj, tokens, img_shape)  # B,3(+1),H,W
+        final_output = postprocess(feat, self.depth_mode, self.conf_mode)
+        final_output["pts3d_in_other_view"] = final_output.pop("pts3d")
+
+        if self.has_depth:
+            self_feat = self._tokens_to_map(self.self_proj, tokens, img_shape)
+            self_3d_output = postprocess(self_feat, self.depth_mode, self.conf_mode)
+            self_3d_output["pts3d_in_self_view"] = self_3d_output.pop("pts3d")
+            self_3d_output["conf_self"] = self_3d_output.pop("conf")
+            final_output.update(self_3d_output)
+
+        if self.has_rgb:
+            rgb_feat = self._tokens_to_map(self.rgb_proj, tokens, img_shape)  # B,3,H,W
+            final_output.update(postprocess_rgb(rgb_feat))
+
+        if self.has_pose_conf:
+            pose_conf = self._tokens_to_map(self.pose_conf_proj, tokens, img_shape)
+            final_output.update(postprocess_pose_conf(pose_conf))
+
+        return final_output
+
+
+class LinearPts3d_Desc(nn.Module):
+    """
+    Linear head for dust3r with local descriptors
+    Each token outputs one patch of 3D points (+ confidence) through a linear
+    layer, and one patch of descriptors (+ descriptor confidence) through an
+    MLP fed with the concatenated first and last entries of the decoder output.
+    """
+
+    def __init__(
+        self,
+        net,
+        has_conf=False,
+        has_depth=False,
+        local_feat_dim=24,
+        hidden_dim_factor=4.0,
+    ):
+        super().__init__()
+        self.patch_size = net.patch_embed.patch_size[0]
+        self.depth_mode = net.depth_mode
+        self.conf_mode = net.conf_mode
+        self.has_conf = has_conf
+        self.double_channel = has_depth
+        self.local_feat_dim = local_feat_dim
+
+        # with has_depth the linear layer emits two (points [+ conf]) groups
+        groups = 2 if has_depth else 1
+        self.proj = nn.Linear(
+            net.dec_embed_dim, (3 + has_conf) * groups * self.patch_size**2
+        )
+
+        in_width = net.enc_embed_dim + net.dec_embed_dim
+        self.head_local_features = Mlp(
+            in_features=in_width,
+            hidden_features=int(hidden_dim_factor * in_width),
+            out_features=(self.local_feat_dim + 1) * self.patch_size**2,
+        )
+
+    def setup(self, croconet):
+        pass
+
+    def forward(self, decout, img_shape):
+        """Return the dict produced by ``postprocess_desc`` for these tokens."""
+        H, W = img_shape
+        grid_h, grid_w = H // self.patch_size, W // self.patch_size
+        first_layer, last_layer = decout[0], decout[-1]
+        batch, _, _ = last_layer.shape
+
+        # points (+ confidence) from the last layer only
+        pts = self.proj(last_layer)
+        pts = pts.transpose(-1, -2).view(batch, -1, grid_h, grid_w)
+        pts = F.pixel_shuffle(pts, self.patch_size)
+
+        # descriptors from the first and last layers concatenated channel-wise
+        joint = torch.cat([first_layer, last_layer], dim=-1)
+        desc = self.head_local_features(joint)
+        desc = desc.transpose(-1, -2).view(batch, -1, grid_h, grid_w)
+        desc = F.pixel_shuffle(desc, self.patch_size)
+
+        stacked = torch.cat([pts, desc], dim=1)
+        return postprocess_desc(
+            stacked,
+            self.depth_mode,
+            self.conf_mode,
+            self.local_feat_dim,
+            self.double_channel,
+        )
+
+
+class LinearPts3dPoseDirect(nn.Module):
+    """
+    Linear head for dust3r
+    Each token outputs one patch of self-view 3D points (+ confidence).
+
+    With ``has_pose`` the other-view points are not predicted separately: they
+    are obtained by applying the predicted camera pose to the self-view points
+    with ``geotrf``. With ``has_conf`` a dedicated projection predicts the
+    ``conf`` map.
+    """
+
+    def __init__(self, net, has_conf=False, has_rgb=False, has_pose=False):
+        super().__init__()
+        self.patch_size = net.patch_embed.patch_size[0]
+        self.depth_mode = net.depth_mode
+        self.conf_mode = net.conf_mode
+        self.pose_mode = net.pose_mode
+        self.has_conf = has_conf
+        self.has_rgb = has_rgb
+        self.has_pose = has_pose
+
+        # 3 point channels (+1 confidence channel) per pixel of a patch
+        self.proj = Mlp(
+            net.dec_embed_dim, out_features=(3 + has_conf) * self.patch_size**2
+        )
+        if has_rgb:
+            self.rgb_proj = Mlp(net.dec_embed_dim, out_features=3 * self.patch_size**2)
+        if has_pose:
+            self.pose_head = PoseDecoder(hidden_size=net.dec_embed_dim)
+        if has_conf:
+            # single channel per pixel, used for the "conf" output in forward
+            self.cross_conf_proj = Mlp(
+                net.dec_embed_dim, out_features=self.patch_size**2
+            )
+
+    def setup(self, croconet):
+        pass
+
+    def forward(self, decout, img_shape):
+        """Compute the head outputs from ``decout[-1]`` for an ``(H, W)`` image."""
+        H, W = img_shape
+        tokens = decout[-1]
+        if self.has_pose:
+            # index 0 along dim 1 is used as the pose token, the rest as patch tokens
+            pose_token = tokens[:, 0]
+            tokens = tokens[:, 1:]
+        B, S, D = tokens.shape
+
+        feat = self.proj(tokens)  # B,S,D
+        feat = feat.transpose(-1, -2).view(
+            B, -1, H // self.patch_size, W // self.patch_size
+        )
+        feat = F.pixel_shuffle(feat, self.patch_size)  # B,3,H,W
+        final_output = postprocess(feat, self.depth_mode, self.conf_mode)
+        final_output["pts3d_in_self_view"] = final_output.pop("pts3d")
+        # NOTE(review): postprocess only emits "conf" when conf_mode is not
+        # None, so this pop raises KeyError otherwise
+        final_output["conf_self"] = final_output.pop("conf")
+
+        if self.has_rgb:
+            rgb_feat = self.rgb_proj(tokens)
+            rgb_feat = rgb_feat.transpose(-1, -2).view(
+                B, -1, H // self.patch_size, W // self.patch_size
+            )
+            rgb_feat = F.pixel_shuffle(rgb_feat, self.patch_size)  # B,3,H,W
+            rgb_output = postprocess_rgb(rgb_feat)
+            final_output.update(rgb_output)
+
+        if self.has_pose:
+            pose = self.pose_head(pose_token)
+            pose = postprocess_pose(pose, self.pose_mode)
+            final_output["camera_pose"] = pose  # B,7
+            # transform the self-view points with the predicted camera pose
+            final_output["pts3d_in_other_view"] = geotrf(
+                pose_encoding_to_camera(final_output["camera_pose"]),
+                final_output["pts3d_in_self_view"],
+            )
+
+        if self.has_conf:
+            cross_conf = self.cross_conf_proj(tokens)
+            cross_conf = cross_conf.transpose(-1, -2).view(
+                B, -1, H // self.patch_size, W // self.patch_size
+            )
+            # keep channel 0 of the unshuffled map
+            cross_conf = F.pixel_shuffle(cross_conf, self.patch_size)[:, 0]
+            final_output["conf"] = reg_dense_conf(cross_conf, mode=self.conf_mode)
+        return final_output
+
+
+class LinearPts3dPose(nn.Module):
+    """
+    Linear head for dust3r
+    Each token outputs one patch of self-view 3D points (+ confidence).
+
+    With ``has_pose`` the head also predicts a camera pose from the first
+    token, and a second set of points (``pts3d_in_other_view`` / ``conf``) from
+    tokens that were modulated by that pose token.
+    """
+
+    def __init__(
+        self, net, has_conf=False, has_rgb=False, has_pose=False, mlp_ratio=4.0
+    ):
+        super().__init__()
+        self.patch_size = net.patch_embed.patch_size[0]
+        self.depth_mode = net.depth_mode
+        self.conf_mode = net.conf_mode
+        self.pose_mode = net.pose_mode
+        self.has_conf = has_conf
+        self.has_rgb = has_rgb
+        self.has_pose = has_pose
+
+        # self-view points: 3 channels (+1 confidence channel) per pixel of a patch
+        self.proj = Mlp(
+            net.dec_embed_dim,
+            hidden_features=int(mlp_ratio * net.dec_embed_dim),
+            out_features=(3 + has_conf) * self.patch_size**2,
+        )
+        if has_rgb:
+            self.rgb_proj = Mlp(
+                net.dec_embed_dim,
+                hidden_features=int(mlp_ratio * net.dec_embed_dim),
+                out_features=3 * self.patch_size**2,
+            )
+        if has_pose:
+            self.pose_head = PoseDecoder(hidden_size=net.dec_embed_dim)
+            # two blocks that modulate the patch tokens with the pose token;
+            # note their mlp_ratio is fixed to 4.0 and ignores the argument
+            self.final_transform = nn.ModuleList(
+                [
+                    ConditionModulationBlock(
+                        net.dec_embed_dim,
+                        net.dec_num_heads,
+                        mlp_ratio=4.0,
+                        qkv_bias=True,
+                        rope=net.rope,
+                    )
+                    for _ in range(2)
+                ]
+            )
+            # projection applied to the pose-modulated tokens
+            self.cross_proj = Mlp(
+                net.dec_embed_dim,
+                hidden_features=int(mlp_ratio * net.dec_embed_dim),
+                out_features=(3 + has_conf) * self.patch_size**2,
+            )
+
+    def setup(self, croconet):
+        pass
+
+    def forward(self, decout, img_shape, **kwargs):
+        """Compute the head outputs from ``decout[-1]`` for an ``(H, W)`` image.
+
+        ``kwargs["pos"]``, when given, is forwarded to the modulation blocks.
+        """
+        H, W = img_shape
+        tokens = decout[-1]
+        if self.has_pose:
+            # index 0 along dim 1 is used as the pose token, the rest as patch tokens
+            pose_token = tokens[:, 0]
+            tokens = tokens[:, 1:]
+            with torch.cuda.amp.autocast(enabled=False):
+                pose = self.pose_head(pose_token)
+            # cross_tokens only exists when has_pose is set; it is only read in
+            # the has_pose branch below
+            cross_tokens = tokens
+            for blk in self.final_transform:
+                cross_tokens = blk(cross_tokens, pose_token, kwargs.get("pos"))
+
+        # the projections and post-processing run with autocast disabled
+        with torch.cuda.amp.autocast(enabled=False):
+            B, S, D = tokens.shape
+
+            feat = self.proj(tokens)  # B,S,D
+            feat = feat.transpose(-1, -2).view(
+                B, -1, H // self.patch_size, W // self.patch_size
+            )
+            feat = F.pixel_shuffle(feat, self.patch_size)  # B,3,H,W
+            # pos_z=True is passed for the self-view points only
+            final_output = postprocess(
+                feat, self.depth_mode, self.conf_mode, pos_z=True
+            )
+            final_output["pts3d_in_self_view"] = final_output.pop("pts3d")
+            # NOTE(review): postprocess only emits "conf" when conf_mode is not
+            # None, so this pop raises KeyError otherwise
+            final_output["conf_self"] = final_output.pop("conf")
+
+            if self.has_rgb:
+                rgb_feat = self.rgb_proj(tokens)
+                rgb_feat = rgb_feat.transpose(-1, -2).view(
+                    B, -1, H // self.patch_size, W // self.patch_size
+                )
+                rgb_feat = F.pixel_shuffle(rgb_feat, self.patch_size)  # B,3,H,W
+                rgb_output = postprocess_rgb(rgb_feat)
+                final_output.update(rgb_output)
+
+            if self.has_pose:
+                pose = postprocess_pose(pose, self.pose_mode)
+                final_output["camera_pose"] = pose  # B,7
+
+                cross_feat = self.cross_proj(cross_tokens)  # B,S,D
+                cross_feat = cross_feat.transpose(-1, -2).view(
+                    B, -1, H // self.patch_size, W // self.patch_size
+                )
+                cross_feat = F.pixel_shuffle(cross_feat, self.patch_size)  # B,3,H,W
+                tmp = postprocess(cross_feat, self.depth_mode, self.conf_mode)
+                final_output["pts3d_in_other_view"] = tmp.pop("pts3d")
+                final_output["conf"] = tmp.pop("conf")
+
+            return final_output
diff --git a/extern/CUT3R/src/dust3r/heads/postprocess.py b/extern/CUT3R/src/dust3r/heads/postprocess.py
new file mode 100755
index 0000000000000000000000000000000000000000..63cf3211b4b2dc5a9782c1d1d53eff17886d54cd
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/heads/postprocess.py
@@ -0,0 +1,167 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# modified from DUSt3R
+
+import torch
+import torch.nn.functional as F
+
+
+def postprocess(out, depth_mode, conf_mode, pos_z=False):
+    """
+    Split a B,C,H,W prediction-head output into 3D points (channels 0-2) and,
+    when ``conf_mode`` is not None, a confidence map (channel 3).
+    """
+    channels_last = out.permute(0, 2, 3, 1)  # B,H,W,C
+    points = reg_dense_depth(channels_last[..., 0:3], mode=depth_mode, pos_z=pos_z)
+    result = {"pts3d": points}
+
+    if conf_mode is not None:
+        result["conf"] = reg_dense_conf(channels_last[..., 3], mode=conf_mode)
+    return result
+
+
+def postprocess_rgb(out, eps=1e-6):
+    """Map a B,3,H,W head output to channel-last RGB values inside (-1, 1).
+
+    The sigmoid output is first squeezed into [eps, 1 - eps] and then rescaled.
+    """
+    channels_last = out.permute(0, 2, 3, 1)  # B,H,W,3
+    squashed = torch.sigmoid(channels_last)
+    bounded = squashed * (1 - 2 * eps) + eps
+    return {"rgb": (bounded - 0.5) * 2}
+
+
+def postprocess_pose(out, mode, inverse=False):
+    """
+    extract pose from prediction head output
+
+    Args:
+        out: tensor whose last dimension holds translation (0:3) and
+            quaternion (3:7).
+        mode: ``(name, vmin, vmax)`` with name in ``"linear"``, ``"square"``,
+            ``"exp"``; the bounds must be ``-inf`` / ``+inf``.
+        inverse: apply the inverse of the translation-norm mapping.
+
+    Returns:
+        For ``"square"`` / ``"exp"``: the rescaled translation concatenated
+        with the standardized quaternion. For ``"linear"``: the translation
+        part only.
+
+    Raises:
+        ValueError: if the mode name is not one of the three above.
+    """
+    mode, vmin, vmax = mode
+
+    no_bounds = (vmin == -float("inf")) and (vmax == float("inf"))
+    assert no_bounds
+    trans = out[..., 0:3]
+    quats = out[..., 3:7]
+
+    if mode == "linear":
+        # NOTE(review): this branch returns the translation only and drops the
+        # quaternion, unlike the other modes - confirm that callers expect it
+        if no_bounds:
+            return trans  # [-inf, +inf]
+        return trans.clip(min=vmin, max=vmax)
+
+    d = trans.norm(dim=-1, keepdim=True)
+
+    if mode == "square":
+        if inverse:
+            scale = d / d.square().clip(min=1e-8)
+        else:
+            scale = d.square() / d.clip(min=1e-8)
+    elif mode == "exp":
+        if inverse:
+            scale = d / torch.expm1(d).clip(min=1e-8)
+        else:
+            scale = torch.expm1(d) / d.clip(min=1e-8)
+    else:
+        # previously an unknown mode fell through to `trans * scale` with
+        # `scale` unbound; fail with the same error style as reg_dense_depth
+        raise ValueError(f"bad {mode=}")
+
+    trans = trans * scale
+    quats = standardize_quaternion(quats)
+
+    return torch.cat([trans, quats], dim=-1)
+
+
+def postprocess_pose_conf(out):
+    """Return the sigmoid of a B,C,H,W head output, channel-last, under ``pose_conf``."""
+    channels_last = out.permute(0, 2, 3, 1)  # B,H,W,1
+    pose_conf = torch.sigmoid(channels_last)
+    return {"pose_conf": pose_conf}
+
+
+def postprocess_desc(out, depth_mode, conf_mode, desc_dim, double_channel=False):
+    """
+    Extract 3D points, confidences and local descriptors from a B,C,H,W
+    prediction-head output.
+
+    Channel layout: one (points [+ conf]) group, a second such group when
+    ``double_channel`` is set, then ``desc_dim`` descriptor channels and one
+    descriptor-confidence channel.
+    """
+    fmap = out.permute(0, 2, 3, 1)  # B,H,W,C
+    has_conf = conf_mode is not None
+    # number of channels taken by one (points [+ conf]) group
+    group = 3 + int(has_conf)
+
+    res = {"pts3d": reg_dense_depth(fmap[..., 0:3], mode=depth_mode)}
+    if has_conf:
+        res["conf"] = reg_dense_conf(fmap[..., 3], mode=conf_mode)
+
+    if double_channel:
+        res["pts3d_self"] = reg_dense_depth(
+            fmap[..., group : group + 3], mode=depth_mode
+        )
+        if has_conf:
+            res["conf_self"] = reg_dense_conf(fmap[..., group + 3], mode=conf_mode)
+
+    # first descriptor channel
+    start = group + int(double_channel) * group
+    desc_end = start + desc_dim
+    res["desc"] = reg_desc(fmap[..., start:desc_end], mode="norm")
+    res["desc_conf"] = reg_dense_conf(fmap[..., desc_end], mode=conf_mode)
+    assert desc_end + 1 == fmap.shape[-1]
+
+    return res
+
+
+def reg_desc(desc, mode="norm"):
+    """Divide descriptors by their L2 norm along the last dimension.
+
+    Raises ValueError when ``mode`` does not contain ``"norm"``.
+    """
+    if "norm" not in mode:
+        raise ValueError(f"Unknown desc mode {mode}")
+    length = desc.norm(dim=-1, keepdim=True)
+    return desc / length
+
+
+def reg_dense_depth(xyz, mode, pos_z=False):
+    """
+    extract 3D points from prediction head output
+
+    Args:
+        xyz: tensor whose last dimension holds the raw point coordinates.
+        mode: ``(name, vmin, vmax)`` with name in ``"linear"``, ``"square"``,
+            ``"exp"``; the bounds must be ``-inf`` / ``+inf``.
+        pos_z: multiply every point by the sign of its last coordinate before
+            the norm mapping (not applied in ``"linear"`` mode).
+
+    Returns:
+        Tensor of the same shape as ``xyz``. The input tensor is left untouched.
+
+    Raises:
+        ValueError: if the mode name is not one of the three above.
+    """
+    mode, vmin, vmax = mode
+
+    no_bounds = (vmin == -float("inf")) and (vmax == float("inf"))
+    assert no_bounds
+
+    if mode == "linear":
+        if no_bounds:
+            return xyz  # [-inf, +inf]
+        return xyz.clip(min=vmin, max=vmax)
+
+    if pos_z:
+        # Out-of-place on purpose: `xyz` is usually a view of the head output,
+        # and the former `xyz *= sign` silently rewrote the caller's tensor.
+        # torch.sign(0) is 0, so a point whose last coordinate is exactly 0
+        # becomes the zero vector.
+        xyz = xyz * torch.sign(xyz[..., -1:])
+    d = xyz.norm(dim=-1, keepdim=True)
+    xyz = xyz / d.clip(min=1e-8)
+
+    if mode == "square":
+        return xyz * d.square()
+
+    if mode == "exp":
+        return xyz * torch.expm1(d)
+
+    raise ValueError(f"bad {mode=}")
+
+
+def reg_dense_conf(x, mode):
+    """
+    Map raw head values to confidences.
+
+    ``mode`` is ``(name, vmin, vmax)``: ``"exp"`` returns ``vmin + exp(x)``
+    with the exponential capped at ``vmax - vmin``; ``"sigmoid"`` rescales the
+    sigmoid of ``x`` to ``[vmin, vmax]``. Any other name raises ValueError.
+    """
+    mode, lower, upper = mode
+    span = upper - lower
+    if mode == "sigmoid":
+        return span * torch.sigmoid(x) + lower
+    if mode == "exp":
+        return lower + x.exp().clip(max=span)
+    raise ValueError(f"bad {mode=}")
+
+
+def standardize_quaternion(quaternions: torch.Tensor) -> torch.Tensor:
+    """
+    Normalize quaternions to unit length and flip the sign of those whose real
+    part is negative, so that the real part of the result is non negative.
+
+    Args:
+        quaternions: Quaternions with real part first,
+            as tensor of shape (..., 4).
+
+    Returns:
+        Standardized quaternions as tensor of shape (..., 4).
+    """
+    unit = F.normalize(quaternions, p=2, dim=-1)
+    negative_real = unit[..., 0:1] < 0
+    return torch.where(negative_real, -unit, unit)
diff --git a/extern/CUT3R/src/dust3r/inference.py b/extern/CUT3R/src/dust3r/inference.py
new file mode 100755
index 0000000000000000000000000000000000000000..9d7a1870152f682a4f708fe33fe301174d926134
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/inference.py
@@ -0,0 +1,387 @@
+import tqdm
+import torch
+from dust3r.utils.device import to_cpu, collate_with_cat
+from dust3r.utils.misc import invalid_to_nans
+from dust3r.utils.geometry import depthmap_to_pts3d, geotrf
+from dust3r.model import ARCroco3DStereo
+from accelerate import Accelerator
+import re
+
+
+def custom_sort_key(key):
+    """Sort key for strings of the form ``"<name>/<number>"``.
+
+    Keys containing at least one ``"/"`` map to ``(first_segment,
+    int(last_segment))``; keys without a slash map to ``(key, -1)``.
+    """
+    segments = key.split("/")
+    if len(segments) <= 1:
+        return (key, -1)
+    return (segments[0], int(segments[-1]))
+
+
+def merge_chunk_dict(old_dict, curr_dict, add_number):
+    """Merge ``curr_dict`` into ``old_dict`` after shifting numeric key suffixes.
+
+    Every key of ``curr_dict`` that ends in a run of digits has that number
+    increased by ``add_number``; other keys are kept as they are. On key
+    collisions the (shifted) entries of ``curr_dict`` win. The result is a new
+    dict whose keys are ordered with ``custom_sort_key``.
+    """
+    shifted = {}
+    for key, value in curr_dict.items():
+        match = re.search(r"(\d+)$", key)
+        if match:
+            num_part = int(match.group()) + add_number
+            # ``count`` is passed by keyword: the positional form is deprecated.
+            key = re.sub(r"(\d+)$", str(num_part), key, count=1)
+        shifted[key] = value
+    merged = old_dict | shifted
+    return {k: merged[k] for k in sorted(merged.keys(), key=custom_sort_key)}
+
+
+def _interleave_imgs(img1, img2):
+    """Interleave the entries of two view dicts key by key.
+
+    Tensor values are stacked along a new dim 1 and the first two dims are
+    flattened, giving ``a0, b0, a1, b1, ...`` along dim 0. Any other value is
+    treated as a sequence and interleaved element-wise into a list.
+    """
+    merged = {}
+    for key in img1:
+        first, second = img1[key], img2[key]
+        if isinstance(first, torch.Tensor):
+            merged[key] = torch.stack((first, second), dim=1).flatten(0, 1)
+            continue
+        alternating = []
+        for pair in zip(first, second):
+            alternating.extend(pair)
+        merged[key] = alternating
+    return merged
+
+
+def make_batch_symmetric(batch):
+    """Return a two-view batch in which each view is interleaved with the other.
+
+    The first output interleaves (view1, view2) and the second (view2, view1),
+    so both orderings of every pair are present.
+    """
+    first, second = batch
+    forward = _interleave_imgs(first, second)
+    backward = _interleave_imgs(second, first)
+    return forward, backward
+
+
+def loss_of_one_batch(
+    batch,
+    model,
+    criterion,
+    accelerator: Accelerator,
+    symmetrize_batch=False,
+    use_amp=False,
+    ret=None,
+    img_mask=None,
+    inference=False,
+):
+    """Run ``model`` on ``batch`` and, outside inference, evaluate ``criterion``.
+
+    Args:
+        batch: sequence of views; symmetrization is only allowed for 2 views.
+        model: callable applied to ``batch``; its output must expose ``ress``
+            and ``views``.
+        criterion: called as ``criterion(views, preds)``; may be None, in
+            which case the returned loss is None.
+        accelerator, use_amp, img_mask: accepted but not referenced in this
+            function body.
+        symmetrize_batch: if True, ``batch`` is passed through
+            ``make_batch_symmetric`` first.
+        ret: optional key; when truthy only ``result[ret]`` is returned
+            instead of the whole result dict.
+        inference: selects the inference branch (see Returns).
+
+    Returns:
+        * ``inference=True``: a tuple ``(result, state_args)`` where
+          ``result`` holds ``views`` and ``pred`` (no loss is computed).
+        * otherwise: ``result`` holding ``views``, ``pred`` and ``loss``.
+    """
+    if len(batch) > 2:
+        assert (
+            symmetrize_batch is False
+        ), "cannot symmetrize batch with more than 2 views"
+    if symmetrize_batch:
+        batch = make_batch_symmetric(batch)
+
+    # Autocast is switched on for training and off for inference.
+    with torch.cuda.amp.autocast(enabled=not inference):
+        if inference:
+            output, state_args = model(batch, ret_state=True)
+            preds, batch = output.ress, output.views
+            result = dict(views=batch, pred=preds)
+            return result[ret] if ret else result, state_args
+        else:
+            output = model(batch)
+            preds, batch = output.ress, output.views
+
+        # The criterion is evaluated with autocast disabled.
+        with torch.cuda.amp.autocast(enabled=False):
+            loss = criterion(batch, preds) if criterion is not None else None
+
+    result = dict(views=batch, pred=preds, loss=loss)
+    return result[ret] if ret else result
+
+
+def loss_of_one_batch_tbptt(
+    batch,
+    model,
+    criterion,
+    chunk_size,
+    loss_scaler,
+    optimizer,
+    accelerator: Accelerator,
+    log_writer=None,
+    symmetrize_batch=False,
+    use_amp=False,
+    ret=None,
+    img_mask=None,
+    inference=False,
+):
+    """Process ``batch`` in chunks of ``chunk_size`` views, stepping the
+    optimizer inside this function for the trailing chunks.
+
+    Flow, as implemented below:
+      1. The encoder runs once under ``torch.no_grad()`` and its outputs are
+         detached.
+      2. Views are decoded one at a time through ``_forward_decoder_step``;
+         ``state_feat``, ``state_pos`` and ``mem`` are detached at the start
+         of every chunk.
+      3. With ``n_chunks = (len(batch) - 1) // chunk_size + 1``, chunks whose
+         index is below ``n_chunks - 4`` are decoded under ``no_grad`` and
+         only contribute to the reported loss. The remaining chunks are
+         decoded with gradients and followed by ``loss_scaler(...)`` and
+         ``optimizer.zero_grad()``.
+
+    ``log_writer``, ``use_amp`` and ``img_mask`` are accepted but not
+    referenced in this function body.
+
+    Returns:
+        A dict with ``views``, ``pred`` (detached per-view predictions),
+        ``loss`` as ``(summed chunk loss / n_chunks, merged details)`` and
+        ``already_backprop=True``; or only ``result[ret]`` if ``ret`` is set.
+    """
+    if len(batch) > 2:
+        assert (
+            symmetrize_batch is False
+        ), "cannot symmetrize batch with more than 2 views"
+    if symmetrize_batch:
+        batch = make_batch_symmetric(batch)
+    all_preds = []
+    all_loss = 0.0
+    all_loss_details = {}
+    with torch.cuda.amp.autocast(enabled=not inference):
+        # Encode every view once, without building a graph.
+        with torch.no_grad():
+            (feat, pos, shape), (
+                init_state_feat,
+                init_mem,
+                state_feat,
+                state_pos,
+                mem,
+            ) = accelerator.unwrap_model(model)._forward_encoder(batch)
+        feat = [f.detach() for f in feat]
+        pos = [p.detach() for p in pos]
+        shape = [s.detach() for s in shape]
+        init_state_feat = init_state_feat.detach()
+        init_mem = init_mem.detach()
+
+        for chunk_id in range((len(batch) - 1) // chunk_size + 1):
+            preds = []
+            chunk = []
+            # Truncate the graph at chunk boundaries.
+            state_feat = state_feat.detach()
+            state_pos = state_pos.detach()
+            mem = mem.detach()
+            # NOTE(review): the literal 4 fixes how many trailing chunks are
+            # trained on; presumably a tuning choice - confirm.
+            if chunk_id < ((len(batch) - 1) // chunk_size + 1) - 4:
+                with torch.no_grad():
+                    for in_chunk_idx in range(chunk_size):
+                        i = chunk_id * chunk_size + in_chunk_idx
+                        if i >= len(batch):
+                            break
+                        res, (state_feat, mem) = accelerator.unwrap_model(
+                            model
+                        )._forward_decoder_step(
+                            batch,
+                            i,
+                            feat_i=feat[i],
+                            pos_i=pos[i],
+                            shape_i=shape[i],
+                            init_state_feat=init_state_feat,
+                            init_mem=init_mem,
+                            state_feat=state_feat,
+                            state_pos=state_pos,
+                            mem=mem,
+                        )
+                        preds.append(res)
+                        all_preds.append({k: v.detach() for k, v in res.items()})
+                        chunk.append(batch[i])
+                with torch.cuda.amp.autocast(enabled=False):
+                    # NOTE(review): if criterion is None the right-hand side is
+                    # None and this two-name unpacking raises TypeError.
+                    loss, loss_details = (
+                        criterion(chunk, preds, camera1=batch[0]["camera_pose"])
+                        if criterion is not None
+                        else None
+                    )
+                    all_loss += float(loss)
+                    # Shift per-view indices in the detail keys by the chunk offset.
+                    all_loss_details = merge_chunk_dict(
+                        all_loss_details, loss_details, chunk_id * chunk_size
+                    )
+                    del loss
+            else:
+                for in_chunk_idx in range(chunk_size):
+                    i = chunk_id * chunk_size + in_chunk_idx
+                    if i >= len(batch):
+                        break
+                    res, (state_feat, mem) = accelerator.unwrap_model(
+                        model
+                    )._forward_decoder_step(
+                        batch,
+                        i,
+                        feat_i=feat[i],
+                        pos_i=pos[i],
+                        shape_i=shape[i],
+                        init_state_feat=init_state_feat,
+                        init_mem=init_mem,
+                        state_feat=state_feat,
+                        state_pos=state_pos,
+                        mem=mem,
+                    )
+                    preds.append(res)
+                    all_preds.append({k: v.detach() for k, v in res.items()})
+                    chunk.append(batch[i])
+                with torch.cuda.amp.autocast(enabled=False):
+                    # NOTE(review): same None-unpacking caveat as in the branch above.
+                    loss, loss_details = (
+                        criterion(chunk, preds, camera1=batch[0]["camera_pose"])
+                        if criterion is not None
+                        else None
+                    )
+                    all_loss += float(loss)
+                    all_loss_details = merge_chunk_dict(
+                        all_loss_details, loss_details, chunk_id * chunk_size
+                    )
+                    # Backward + optimizer update for this chunk, then reset grads.
+                    loss_scaler(
+                        loss,
+                        optimizer,
+                        parameters=model.parameters(),
+                        update_grad=True,
+                        clip_grad=1.0,
+                    )
+                    optimizer.zero_grad()
+                    del loss
+    result = dict(
+        views=batch,
+        pred=all_preds,
+        loss=(all_loss / ((len(batch) - 1) // chunk_size + 1), all_loss_details),
+        already_backprop=True,
+    )
+    return result[ret] if ret else result
+
+
+@torch.no_grad()
+def inference(groups, model, device, verbose=True):
+    """Move all views to ``device`` and run one inference pass over them.
+
+    Entries whose key is in the skip list below are left where they are;
+    tuple/list entries are moved element by element (and become lists).
+    The views are modified in place.
+
+    Returns:
+        ``(result, state_args)``: the output of ``loss_of_one_batch`` in
+        inference mode, with ``result`` passed through ``to_cpu``.
+    """
+    skipped = {
+        "depthmap",
+        "dataset",
+        "label",
+        "instance",
+        "idx",
+        "true_shape",
+        "rng",
+    }
+    for view in groups:
+        for name in view.keys():  # pseudo_focal
+            if name in skipped:
+                continue
+            value = view[name]
+            if isinstance(value, (tuple, list)):
+                view[name] = [item.to(device, non_blocking=True) for item in value]
+            else:
+                view[name] = value.to(device, non_blocking=True)
+
+    if verbose:
+        print(f">> Inference with model on {len(groups)} image/raymaps")
+
+    res, state_args = loss_of_one_batch(groups, model, None, None, inference=True)
+    return to_cpu(res), state_args
+
+
+@torch.no_grad()
+def inference_step(view, state_args, model, device, verbose=True):
+    """Run ``model.inference_step`` on a single view with an existing state.
+
+    ``view`` is moved to ``device`` in place (except for the skipped keys);
+    ``state_args`` must unpack into ``(state_feat, state_pos,
+    init_state_feat, mem, init_mem)``. ``verbose`` is not used here.
+
+    Returns:
+        ``{"pred": pred}`` passed through ``to_cpu``; the second value
+        returned by the model is discarded.
+    """
+    skipped = {
+        "depthmap",
+        "dataset",
+        "label",
+        "instance",
+        "idx",
+        "true_shape",
+        "rng",
+    }
+    for name in view.keys():  # pseudo_focal
+        if name in skipped:
+            continue
+        value = view[name]
+        if isinstance(value, (tuple, list)):
+            view[name] = [item.to(device, non_blocking=True) for item in value]
+        else:
+            view[name] = value.to(device, non_blocking=True)
+
+    # The step runs with autocast disabled.
+    with torch.cuda.amp.autocast(enabled=False):
+        state_feat, state_pos, init_state_feat, mem, init_mem = state_args
+        pred, _ = model.inference_step(
+            view, state_feat, state_pos, init_state_feat, mem, init_mem
+        )
+
+    return to_cpu(dict(pred=pred))
+
+
+@torch.no_grad()
+def inference_recurrent(groups, model, device, verbose=True):
+    """Move all views to ``device`` and run ``model.forward_recurrent``.
+
+    The views are modified in place; entries whose key is in the skip list
+    are not moved, tuple/list entries are moved element by element.
+
+    Returns:
+        ``(result, state_args)`` where ``result`` holds ``views`` and
+        ``pred`` passed through ``to_cpu``.
+    """
+    skipped = {
+        "depthmap",
+        "dataset",
+        "label",
+        "instance",
+        "idx",
+        "true_shape",
+        "rng",
+    }
+    for view in groups:
+        for name in view.keys():  # pseudo_focal
+            if name in skipped:
+                continue
+            value = view[name]
+            if isinstance(value, (tuple, list)):
+                view[name] = [item.to(device, non_blocking=True) for item in value]
+            else:
+                view[name] = value.to(device, non_blocking=True)
+
+    if verbose:
+        print(f">> Inference with model on {len(groups)} image/raymaps")
+
+    # The recurrent forward pass runs with autocast disabled.
+    with torch.cuda.amp.autocast(enabled=False):
+        preds, batch, state_args = model.forward_recurrent(
+            groups, device, ret_state=True
+        )
+        res = dict(views=batch, pred=preds)
+    return to_cpu(res), state_args
+
+
+def check_if_same_size(pairs):
+    """Tell whether all first images share one size and all second images too.
+
+    The size of an image is the last two entries of ``view["img"].shape``.
+    The two sides are checked independently of each other.
+    """
+
+    def _uniform(shapes):
+        return all(shapes[0] == shape for shape in shapes)
+
+    first_shapes = [first["img"].shape[-2:] for first, _ in pairs]
+    second_shapes = [second["img"].shape[-2:] for _, second in pairs]
+    return _uniform(first_shapes) and _uniform(second_shapes)
+
+
+def get_pred_pts3d(gt, pred, use_pose=False, inplace=False):
+    """Extract a 3D point map from a prediction dict.
+
+    The source is chosen from the keys present in ``pred``, in this order:
+      1. ``"depth"`` and ``"pseudo_focal"``: points are rebuilt with
+         ``depthmap_to_pts3d``, using the principal point read from
+         ``gt["camera_intrinsics"]`` when that key exists.
+      2. ``"pts3d"``: used as is.
+      3. ``"pts3d_in_other_view"``: returned directly (cloned unless
+         ``inplace``); ``use_pose`` must be True in this case.
+
+    For cases 1 and 2, ``use_pose=True`` additionally applies
+    ``pred["camera_pose"]`` to the points through ``geotrf``.
+
+    Raises:
+        ValueError: if ``pred`` contains none of the keys above. (This used
+            to surface as an ``UnboundLocalError`` on ``pts3d``.)
+        AssertionError: if case 3 is hit without ``use_pose``, or if
+            ``use_pose`` is set but ``pred`` has no ``"camera_pose"``.
+    """
+    if "depth" in pred and "pseudo_focal" in pred:
+        try:
+            pp = gt["camera_intrinsics"][..., :2, 2]
+        except KeyError:
+            pp = None
+        pts3d = depthmap_to_pts3d(**pred, pp=pp)
+
+    elif "pts3d" in pred:
+        pts3d = pred["pts3d"]
+
+    elif "pts3d_in_other_view" in pred:
+        assert use_pose is True
+        return (
+            pred["pts3d_in_other_view"]
+            if inplace
+            else pred["pts3d_in_other_view"].clone()
+        )
+
+    else:
+        raise ValueError(
+            f"cannot extract 3D points from prediction with keys {list(pred)}"
+        )
+
+    if use_pose:
+        camera_pose = pred.get("camera_pose")
+        assert camera_pose is not None
+        pts3d = geotrf(camera_pose, pts3d)
+
+    return pts3d
+
+
+def find_opt_scaling(
+    gt_pts1,
+    gt_pts2,
+    pr_pts1,
+    pr_pts2=None,
+    fit_mode="weiszfeld_stop_grad",
+    valid1=None,
+    valid2=None,
+):
+    """Fit one scale factor per batch element between ground-truth and
+    predicted point maps.
+
+    Points are masked with ``invalid_to_nans`` and the NaN-aware reductions
+    below skip them. ``fit_mode`` is read as ``<method>[..stop_grad]``:
+      * starts with ``"avg"``: ratio of the mean ``<pr, gt>`` to the mean
+        ``<gt, gt>``;
+      * starts with ``"median"``: median of the per-point ratios;
+      * starts with ``"weiszfeld"``: the ``avg`` estimate refined by 10
+        re-weighting rounds, each point weighted by the reciprocal of its
+        current residual norm;
+      * ends with ``"stop_grad"``: the result is detached.
+
+    The returned scaling is clipped from below at ``1e-3``.
+
+    Raises:
+        ValueError: if ``fit_mode`` starts with none of the methods above.
+    """
+    assert gt_pts1.ndim == pr_pts1.ndim == 4
+    assert gt_pts1.shape == pr_pts1.shape
+    if gt_pts2 is not None:
+        assert gt_pts2.ndim == pr_pts2.ndim == 4
+        assert gt_pts2.shape == pr_pts2.shape
+
+    nan_gt_pts1 = invalid_to_nans(gt_pts1, valid1).flatten(1, 2)
+    nan_gt_pts2 = (
+        invalid_to_nans(gt_pts2, valid2).flatten(1, 2) if gt_pts2 is not None else None
+    )
+
+    pr_pts1 = invalid_to_nans(pr_pts1, valid1).flatten(1, 2)
+    pr_pts2 = (
+        invalid_to_nans(pr_pts2, valid2).flatten(1, 2) if pr_pts2 is not None else None
+    )
+
+    all_gt = (
+        torch.cat((nan_gt_pts1, nan_gt_pts2), dim=1)
+        if gt_pts2 is not None
+        else nan_gt_pts1
+    )
+    all_pr = torch.cat((pr_pts1, pr_pts2), dim=1) if pr_pts2 is not None else pr_pts1
+
+    dot_gt_pr = (all_pr * all_gt).sum(dim=-1)
+    dot_gt_gt = all_gt.square().sum(dim=-1)
+
+    if fit_mode.startswith("avg"):
+        scaling = dot_gt_pr.nanmean(dim=1) / dot_gt_gt.nanmean(dim=1)
+    elif fit_mode.startswith("median"):
+        scaling = (dot_gt_pr / dot_gt_gt).nanmedian(dim=1).values
+    elif fit_mode.startswith("weiszfeld"):
+        # Start from the closed-form average estimate.
+        scaling = dot_gt_pr.nanmean(dim=1) / dot_gt_gt.nanmean(dim=1)
+
+        # Fixed number of refinement rounds; the counter itself is unused
+        # (it used to be named ``iter``, shadowing the builtin).
+        for _ in range(10):
+            dis = (all_pr - scaling.view(-1, 1, 1) * all_gt).norm(dim=-1)
+            # Reciprocal residuals as weights, floored to avoid division by 0.
+            w = dis.clip_(min=1e-8).reciprocal()
+            scaling = (w * dot_gt_pr).nanmean(dim=1) / (w * dot_gt_gt).nanmean(dim=1)
+    else:
+        raise ValueError(f"bad {fit_mode=}")
+
+    if fit_mode.endswith("stop_grad"):
+        scaling = scaling.detach()
+
+    scaling = scaling.clip(min=1e-3)
+
+    return scaling
diff --git a/extern/CUT3R/src/dust3r/losses.py b/extern/CUT3R/src/dust3r/losses.py
new file mode 100644
index 0000000000000000000000000000000000000000..97310ed6d77b755110a6524a22dc4445d61503e7
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/losses.py
@@ -0,0 +1,1184 @@
+from copy import copy, deepcopy
+import torch
+import torch.nn as nn
+
+from dust3r.inference import get_pred_pts3d, find_opt_scaling
+from dust3r.utils.geometry import (
+    inv,
+    geotrf,
+    normalize_pointcloud,
+    normalize_pointcloud_group,
+)
+from dust3r.utils.geometry import (
+    get_group_pointcloud_depth,
+    get_group_pointcloud_center_scale,
+    weighted_procrustes,
+)
+# from gsplat import rasterization
+import numpy as np
+import lpips
+from dust3r.utils.camera import (
+    pose_encoding_to_camera,
+    camera_to_pose_encoding,
+    relative_pose_absT_quatR,
+)
+
+
+def Sum(*losses_and_masks):
+    """Combine ``(loss, mask)`` pairs.
+
+    If the first loss is not a scalar (per-pixel losses), the pairs are
+    returned untouched as a tuple. Otherwise the scalar losses are added up
+    and only the total is returned; the masks are ignored.
+    """
+    first_loss, _ = losses_and_masks[0]
+    if first_loss.ndim > 0:
+        # we are actually returning the loss for every pixels
+        return losses_and_masks
+    # we are returning the global loss
+    total = first_loss
+    for extra_loss, _ in losses_and_masks[1:]:
+        total = total + extra_loss
+    return total
+
+
+class BaseCriterion(nn.Module):
+    """Base module for criteria; only stores the requested ``reduction``."""
+
+    def __init__(self, reduction="mean"):
+        super().__init__()
+        # Read by subclasses in this file, which compare it against
+        # "none", "sum" and "mean".
+        self.reduction = reduction
+
+
+class LLoss(BaseCriterion):
+    """L-norm loss
+
+    Subclasses implement ``distance``; ``forward`` validates the shapes and
+    applies the reduction stored in ``self.reduction``.
+    """
+
+    def forward(self, a, b):
+        """Return ``distance(a, b)`` reduced according to ``self.reduction``.
+
+        Both inputs must have the same shape, at least 2 dims and a last
+        dim of size 1 to 3. An empty distance tensor reduces to 0 for "mean".
+        """
+        shapes_ok = a.shape == b.shape and a.ndim >= 2 and 1 <= a.shape[-1] <= 3
+        assert shapes_ok, f"Bad shape = {a.shape}"
+        dist = self.distance(a, b)
+        mode = self.reduction
+        if mode == "mean":
+            if dist.numel() > 0:
+                return dist.mean()
+            return dist.new_zeros(())
+        if mode == "sum":
+            return dist.sum()
+        if mode == "none":
+            return dist
+        raise ValueError(f"bad {self.reduction=} mode")
+
+    def distance(self, a, b):
+        """Element-wise distance; must be provided by subclasses."""
+        raise NotImplementedError()
+
+
+class L21Loss(LLoss):
+    """Euclidean distance between 3d points"""
+
+    def distance(self, a, b):
+        """Norm of ``a - b`` taken over the last dimension."""
+        diff = a - b
+        return diff.norm(dim=-1)  # normalized L2 distance
+
+
+# Shared instance using the default reduction.
+L21 = L21Loss()
+
+
+class MSELoss(LLoss):
+    """Squared error between two tensors, element by element."""
+
+    def distance(self, a, b):
+        """Return the element-wise square of ``a - b``."""
+        diff = a - b
+        return diff**2
+
+
+# Shared instance using the default reduction.
+MSE = MSELoss()
+
+
+class Criterion(nn.Module):
+    """Module wrapping a ``BaseCriterion`` that is stored as ``self.criterion``."""
+
+    def __init__(self, criterion=None):
+        super().__init__()
+        is_valid = isinstance(criterion, BaseCriterion)
+        assert is_valid, f"{criterion} is not a proper criterion!"
+        # Keep a (shallow) copy rather than the object that was passed in.
+        self.criterion = copy(criterion)
+
+    def get_name(self):
+        """Name made of the class name and the wrapped criterion."""
+        return f"{type(self).__name__}({self.criterion})"
+
+    def with_reduction(self, mode="none"):
+        """Return a deep copy whose whole ``_loss2`` chain uses reduction ``mode``."""
+        clone = deepcopy(self)
+        node = clone
+        while node is not None:
+            assert isinstance(node, Criterion)
+            node.criterion.reduction = mode  # make it return the loss for each sample
+            node = node._loss2  # we assume loss is a Multiloss
+        return clone
+
+
+class MultiLoss(nn.Module):
+    """Easily combinable losses (also keep track of individual loss values):
+        loss = MyLoss1() + 0.1*MyLoss2()
+    Usage:
+        Inherit from this class and override get_name() and compute_loss()
+
+    Losses form a singly linked chain: ``_alpha`` is the weight of this
+    loss and ``_loss2`` the next loss in the chain (or None).
+    """
+
+    def __init__(self):
+        super().__init__()
+        self._alpha = 1
+        self._loss2 = None
+
+    def compute_loss(self, *args, **kwargs):
+        """Return either a loss tensor or a ``(loss, details)`` tuple."""
+        raise NotImplementedError()
+
+    def get_name(self):
+        """Return the display name used in ``repr`` and in the details dict."""
+        raise NotImplementedError()
+
+    def __mul__(self, alpha):
+        """Return a copy of this loss whose weight is set to ``alpha``."""
+        assert isinstance(alpha, (int, float))
+        res = copy(self)
+        # The weight is replaced, not multiplied with the previous one.
+        res._alpha = alpha
+        return res
+
+    __rmul__ = __mul__  # same
+
+    def __add__(self, loss2):
+        """Return a copy of this loss with ``loss2`` appended to its chain."""
+        assert isinstance(loss2, MultiLoss)
+        res = cur = copy(self)
+        # NOTE(review): ``copy`` is shallow, so when ``self`` already has a
+        # chain, the links walked below are the same objects ``self`` refers
+        # to and the append appears to modify that shared tail - confirm
+        # whether this is intended.
+        # find the end of the chain
+        while cur._loss2 is not None:
+            cur = cur._loss2
+        cur._loss2 = loss2
+        return res
+
+    def __repr__(self):
+        name = self.get_name()
+        if self._alpha != 1:
+            name = f"{self._alpha:g}*{name}"
+        if self._loss2:
+            name = f"{name} + {self._loss2}"
+        return name
+
+    def forward(self, *args, **kwargs):
+        """Evaluate this loss and the rest of the chain.
+
+        Returns:
+            ``(loss, details)``: the sum of the weighted losses and a dict
+            merging the details of every link.
+        """
+        loss = self.compute_loss(*args, **kwargs)
+        if isinstance(loss, tuple):
+            loss, details = loss
+        elif loss.ndim == 0:
+            # Scalar loss without details: record it under this loss' name.
+            details = {self.get_name(): float(loss)}
+        else:
+            details = {}
+        loss = loss * self._alpha
+
+        if self._loss2:
+            loss2, details2 = self._loss2(*args, **kwargs)
+            loss = loss + loss2
+            # On duplicate keys the entries of the later loss win.
+            details |= details2
+
+        return loss, details
+
+
+class SSIM(nn.Module):
+    """Layer to compute the SSIM loss between a pair of images
+
+    Local statistics are taken over 3x3 windows (stride 1) after a
+    reflection padding of 1 pixel; the output is ``(1 - SSIM) / 2`` clamped
+    to ``[0, 1]``.
+    """
+
+    def __init__(self):
+        super().__init__()
+        # One 3x3 average pooling per local statistic.
+        for pool_name in (
+            "mu_x_pool",
+            "mu_y_pool",
+            "sig_x_pool",
+            "sig_y_pool",
+            "sig_xy_pool",
+        ):
+            setattr(self, pool_name, nn.AvgPool2d(3, 1))
+
+        self.refl = nn.ReflectionPad2d(1)
+
+        # Constants added to the numerator and denominator terms.
+        self.C1 = 0.01**2
+        self.C2 = 0.03**2
+
+    def forward(self, x, y):
+        """Return the per-pixel dissimilarity map between ``x`` and ``y``."""
+        x, y = self.refl(x), self.refl(y)
+
+        mu_x, mu_y = self.mu_x_pool(x), self.mu_y_pool(y)
+
+        # Local (co)variances as E[ab] - E[a]E[b].
+        sigma_x = self.sig_x_pool(x**2) - mu_x**2
+        sigma_y = self.sig_y_pool(y**2) - mu_y**2
+        sigma_xy = self.sig_xy_pool(x * y) - mu_x * mu_y
+
+        numerator = (2 * mu_x * mu_y + self.C1) * (2 * sigma_xy + self.C2)
+        denominator = (mu_x**2 + mu_y**2 + self.C1) * (sigma_x + sigma_y + self.C2)
+
+        return torch.clamp((1 - numerator / denominator) / 2, 0, 1)
+
+
+class RGBLoss(Criterion, MultiLoss):
+    """Image loss between ``pred["rgb"]`` and ``gt["img"]`` for every view."""
+
+    def __init__(self, criterion):
+        super().__init__(criterion)
+        self.ssim = SSIM()
+
+    def img_loss(self, a, b):
+        """Apply the wrapped criterion to one image pair."""
+        return self.criterion(a, b)
+
+    def compute_loss(self, gts, preds, **kw):
+        """Return the mean per-view loss and a details dict.
+
+        Ground-truth images are permuted with ``(0, 2, 3, 1)`` before being
+        compared with the predictions. For view ``i`` (1-based) the details
+        hold the loss as a float and the predicted image itself.
+        """
+        gt_rgbs = [gt["img"].permute(0, 2, 3, 1) for gt in gts]
+        pred_rgbs = [pred["rgb"] for pred in preds]
+        per_view = [
+            self.img_loss(predicted, target)
+            for predicted, target in zip(pred_rgbs, gt_rgbs)
+        ]
+        prefix = type(self).__name__
+        details = {}
+        for view_no, (view_loss, predicted) in enumerate(
+            zip(per_view, pred_rgbs), start=1
+        ):
+            details[prefix + f"_rgb/{view_no}"] = float(view_loss)
+            details[f"pred_rgb_{view_no}"] = predicted
+        return sum(per_view) / len(per_view), details
+
+
+class DepthScaleShiftInvLoss(BaseCriterion):
+    """scale and shift invariant loss
+
+    Both inputs are normalized per sample (shift by the mean, divide by the
+    mean absolute deviation, both computed over masked pixels) before the
+    absolute difference is taken on the masked pixels.
+    """
+
+    def __init__(self, reduction="none"):
+        super().__init__(reduction)
+
+    def forward(self, pred, gt, mask):
+        """Return the masked distance, reduced according to ``self.reduction``.
+
+        ``pred`` and ``gt`` must be 3-dimensional tensors of equal shape.
+        """
+        assert pred.shape == gt.shape and pred.ndim == 3, f"Bad shape = {pred.shape}"
+        dist = self.distance(pred, gt, mask)
+        # assert dist.ndim == a.ndim - 1  # one dimension less
+        if self.reduction == "none":
+            return dist
+        if self.reduction == "sum":
+            return dist.sum()
+        if self.reduction == "mean":
+            return dist.mean() if dist.numel() > 0 else dist.new_zeros(())
+        raise ValueError(f"bad {self.reduction=} mode")
+
+    def normalize(self, x, mask):
+        """Shift and scale ``x`` per sample using statistics of masked pixels."""
+        # Masked values of all samples, flattened, then split back per sample
+        # using the number of masked pixels in each one.
+        x_valid = x[mask]
+        splits = mask.sum(dim=(1, 2)).tolist()
+        x_valid_list = torch.split(x_valid, splits)
+        shift = [x.mean() for x in x_valid_list]
+        x_valid_centered = [x - m for x, m in zip(x_valid_list, shift)]
+        scale = [x.abs().mean() for x in x_valid_centered]
+        scale = torch.stack(scale)
+        shift = torch.stack(shift)
+        # The statistics are applied to the whole map, not only masked pixels.
+        x = (x - shift.view(-1, 1, 1)) / scale.view(-1, 1, 1).clamp(min=1e-6)
+        return x
+
+    def distance(self, pred, gt, mask):
+        """Absolute difference of the normalized inputs on masked pixels."""
+        pred = self.normalize(pred, mask)
+        gt = self.normalize(gt, mask)
+        return torch.abs((pred - gt)[mask])
+
+
+class ScaleInvLoss(BaseCriterion):
+    """scale invariant loss
+
+    Each input is divided by its per-sample mean norm over masked pixels
+    before the distance between the two is measured on the masked pixels.
+    """
+
+    def __init__(self, reduction="none"):
+        super().__init__(reduction)
+
+    def forward(self, pred, gt, mask):
+        """Return the masked distance, reduced according to ``self.reduction``.
+
+        ``pred`` and ``gt`` must be 4-dimensional tensors of equal shape.
+        """
+        assert pred.shape == gt.shape and pred.ndim == 4, f"Bad shape = {pred.shape}"
+        dist = self.distance(pred, gt, mask)
+        # assert dist.ndim == a.ndim - 1  # one dimension less
+        if self.reduction == "none":
+            return dist
+        if self.reduction == "sum":
+            return dist.sum()
+        if self.reduction == "mean":
+            return dist.mean() if dist.numel() > 0 else dist.new_zeros(())
+        raise ValueError(f"bad {self.reduction=} mode")
+
+    def distance(self, pred, gt, mask):
+        """Norm of the difference between the rescaled inputs on masked pixels."""
+        # Mean norm (over the last dim) of the masked pixels of each sample;
+        # the pixel count is floored so an empty mask does not divide by 0.
+        pred_norm_factor = (torch.norm(pred, dim=-1) * mask).sum(dim=(1, 2)) / mask.sum(
+            dim=(1, 2)
+        ).clamp(min=1e-6)
+        gt_norm_factor = (torch.norm(gt, dim=-1) * mask).sum(dim=(1, 2)) / mask.sum(
+            dim=(1, 2)
+        ).clamp(min=1e-6)
+        pred = pred / pred_norm_factor.view(-1, 1, 1, 1).clamp(min=1e-6)
+        gt = gt / gt_norm_factor.view(-1, 1, 1, 1).clamp(min=1e-6)
+        return torch.norm(pred - gt, dim=-1)[mask]
+
+
+class Regr3DPose(Criterion, MultiLoss):
+    """Ensure that all 3D points are correct.
+    Asymmetric loss: view1 is supposed to be the anchor.
+
+    P1 = RT1 @ D1
+    P2 = RT2 @ D2
+    loss1 = (I @ pred_D1) - (RT1^-1 @ RT1 @ D1)
+    loss2 = (RT21 @ pred_D2) - (RT1^-1 @ P2)
+          = (RT21 @ pred_D2) - (RT1^-1 @ RT2 @ D2)
+    """
+
+    def __init__(
+        self,
+        criterion,
+        norm_mode="?avg_dis",
+        gt_scale=False,
+        sky_loss_value=2,
+        max_metric_scale=False,
+    ):
+        """Store the criterion and the normalization settings.
+
+        A ``norm_mode`` starting with ``"?"`` sets ``norm_all`` to False and
+        stores the mode without that prefix; otherwise ``norm_all`` is True.
+        """
+        super().__init__(criterion)
+        # do no norm pts from metric scale datasets when prefixed with "?"
+        skip_metric = norm_mode.startswith("?")
+        self.norm_all = not skip_metric
+        self.norm_mode = norm_mode[1:] if skip_metric else norm_mode
+        self.gt_scale = gt_scale
+
+        self.sky_loss_value = sky_loss_value
+        self.max_metric_scale = max_metric_scale
+
+    def get_norm_factor_point_cloud(
+        self, pts_self, pts_cross, valids, conf_self, conf_cross, norm_self_only=False
+    ):
+        """Return the normalization factor of a group of point maps.
+
+        With ``norm_self_only`` the factor comes from the self-view points
+        alone. Otherwise self and cross points (and their confidences) are
+        concatenated along dim 2, with each validity mask repeated to match.
+        """
+        if norm_self_only:
+            return normalize_pointcloud_group(
+                pts_self, self.norm_mode, valids, conf_self, ret_factor_only=True
+            )
+
+        joint_pts = [
+            torch.cat([own, cross], dim=2) for own, cross in zip(pts_self, pts_cross)
+        ]
+        joint_valids = [torch.cat([valid, valid], dim=2) for valid in valids]
+        joint_confs = [
+            torch.cat([own, cross], dim=2) for own, cross in zip(conf_self, conf_cross)
+        ]
+        return normalize_pointcloud_group(
+            joint_pts, self.norm_mode, joint_valids, joint_confs, ret_factor_only=True
+        )
+
+    def get_norm_factor_poses(self, gt_trans, pr_trans, not_metric_mask):
+        """Return ``(norm_factor_gt, norm_factor_pr)`` for camera translations.
+
+        Translations are reshaped with two singleton dims and passed to
+        ``normalize_pointcloud_group`` with all-True validity masks. The
+        predicted factor starts as a copy of the ground-truth one and is
+        replaced, for the samples selected by ``not_metric_mask``, by a
+        factor computed from the predicted translations.
+        """
+
+        if self.norm_mode and not self.gt_scale:
+            gt_trans = [x[:, None, None, :].clone() for x in gt_trans]
+            valids = [torch.ones_like(x[..., 0], dtype=torch.bool) for x in gt_trans]
+            norm_factor_gt = (
+                normalize_pointcloud_group(
+                    gt_trans,
+                    self.norm_mode,
+                    valids,
+                    ret_factor_only=True,
+                )
+                .squeeze(-1)
+                .squeeze(-1)
+            )
+        else:
+            # No normalization requested: factor of ones.
+            norm_factor_gt = torch.ones(
+                len(gt_trans), dtype=gt_trans[0].dtype, device=gt_trans[0].device
+            )
+
+        norm_factor_pr = norm_factor_gt.clone()
+        if self.norm_mode and not_metric_mask.sum() > 0 and not self.gt_scale:
+            # Only the samples flagged by not_metric_mask get their own factor.
+            pr_trans_not_metric = [
+                x[not_metric_mask][:, None, None, :].clone() for x in pr_trans
+            ]
+            valids = [
+                torch.ones_like(x[..., 0], dtype=torch.bool)
+                for x in pr_trans_not_metric
+            ]
+            norm_factor_pr_not_metric = (
+                normalize_pointcloud_group(
+                    pr_trans_not_metric,
+                    self.norm_mode,
+                    valids,
+                    ret_factor_only=True,
+                )
+                .squeeze(-1)
+                .squeeze(-1)
+            )
+            norm_factor_pr[not_metric_mask] = norm_factor_pr_not_metric
+        return norm_factor_gt, norm_factor_pr
+
+    def get_all_pts3d(
+        self,
+        gts,
+        preds,
+        dist_clip=None,
+        norm_self_only=False,
+        norm_pose_separately=False,
+        eps=1e-3,
+        camera1=None,
+    ):
+        """Collect normalized ground-truth / predicted points and poses.
+
+        Args:
+            gts: per-view ground-truth dicts (reads ``camera_pose``,
+                ``pts3d``, ``valid_mask``, ``sky_mask`` and, on the first
+                view, ``camera_only``, ``is_metric`` / ``is_metric_scale``).
+            preds: per-view prediction dicts (reads ``pts3d_in_self_view``,
+                ``pts3d_in_other_view``, ``conf_self``, ``conf`` and
+                ``camera_pose``).
+            dist_clip: if given, ground-truth points farther than this from
+                the reference camera are marked invalid.
+            norm_self_only: forwarded to ``get_norm_factor_point_cloud``.
+            norm_pose_separately: if True, translations get their own
+                normalization factors from ``get_norm_factor_poses``.
+            eps: lower bound applied to log-confidences and norm factors.
+            camera1: reference camera pose; defaults to the pose of
+                ``gts[0]``.
+
+        Returns:
+            A 10-tuple ``(gt_pts_self, gt_pts_cross, pr_pts_self,
+            pr_pts_cross, gt_poses, pr_poses, valids, skys, pose_masks, {})``.
+            Poses are ``(translation, rotation)`` pairs split at index 3 of
+            the pose encoding.
+
+        Side effect: when ``norm_all`` is False and ``max_metric_scale`` is
+        set, ``gts[0]["is_metric"]`` is overwritten.
+        """
+        # everything is normalized w.r.t. camera of view1
+        in_camera1 = inv(gts[0]["camera_pose"]) if camera1 is None else inv(camera1)
+        gt_pts_self = [geotrf(inv(gt["camera_pose"]), gt["pts3d"]) for gt in gts]
+        gt_pts_cross = [geotrf(in_camera1, gt["pts3d"]) for gt in gts]
+        valids = [gt["valid_mask"].clone() for gt in gts]
+        camera_only = gts[0]["camera_only"]
+
+        if dist_clip is not None:
+            # points that are too far-away == invalid
+            dis = [gt_pt.norm(dim=-1) for gt_pt in gt_pts_cross]
+            valids = [valid & (dis <= dist_clip) for valid, dis in zip(valids, dis)]
+
+        pr_pts_self = [pred["pts3d_in_self_view"] for pred in preds]
+        pr_pts_cross = [pred["pts3d_in_other_view"] for pred in preds]
+        # Detached log-confidences, floored at eps.
+        conf_self = [torch.log(pred["conf_self"]).detach().clip(eps) for pred in preds]
+        conf_cross = [torch.log(pred["conf"]).detach().clip(eps) for pred in preds]
+
+        if not self.norm_all:
+            if self.max_metric_scale:
+                B = valids[0].shape[0]
+                dist = [
+                    torch.where(valid, torch.linalg.norm(gt_pt_cross, dim=-1), 0).view(
+                        B, -1
+                    )
+                    for valid, gt_pt_cross in zip(valids, gt_pts_cross)
+                ]
+                # NOTE(review): each iteration overwrites gts[0]["is_metric"]
+                # from "is_metric_scale", so only the last view's distances
+                # take effect - confirm whether all views were meant to count.
+                for d in dist:
+                    gts[0]["is_metric"] = gts[0]["is_metric_scale"] & (
+                        d.max(dim=-1).values < self.max_metric_scale
+                    )
+            not_metric_mask = ~gts[0]["is_metric"]
+        else:
+            not_metric_mask = torch.ones_like(gts[0]["is_metric"])
+
+        # normalize 3d points
+        # compute the scale using only the self view point maps
+        if self.norm_mode and not self.gt_scale:
+            norm_factor_gt = self.get_norm_factor_point_cloud(
+                gt_pts_self,
+                gt_pts_cross,
+                valids,
+                conf_self,
+                conf_cross,
+                norm_self_only=norm_self_only,
+            )
+        else:
+            norm_factor_gt = torch.ones_like(
+                preds[0]["pts3d_in_other_view"][:, :1, :1, :1]
+            )
+
+        # Predictions reuse the ground-truth factor except for the samples
+        # selected by not_metric_mask, which get a factor of their own.
+        norm_factor_pr = norm_factor_gt.clone()
+        if self.norm_mode and not_metric_mask.sum() > 0 and not self.gt_scale:
+            norm_factor_pr_not_metric = self.get_norm_factor_point_cloud(
+                [pr_pt_self[not_metric_mask] for pr_pt_self in pr_pts_self],
+                [pr_pt_cross[not_metric_mask] for pr_pt_cross in pr_pts_cross],
+                [valid[not_metric_mask] for valid in valids],
+                [conf[not_metric_mask] for conf in conf_self],
+                [conf[not_metric_mask] for conf in conf_cross],
+                norm_self_only=norm_self_only,
+            )
+            norm_factor_pr[not_metric_mask] = norm_factor_pr_not_metric
+
+        norm_factor_gt = norm_factor_gt.clip(eps)
+        norm_factor_pr = norm_factor_pr.clip(eps)
+
+        gt_pts_self = [pts / norm_factor_gt for pts in gt_pts_self]
+        gt_pts_cross = [pts / norm_factor_gt for pts in gt_pts_cross]
+        pr_pts_self = [pts / norm_factor_pr for pts in pr_pts_self]
+        pr_pts_cross = [pts / norm_factor_pr for pts in pr_pts_cross]
+
+        # [(Bx3, BX4), (BX3, BX4), ...], 3 for translation, 4 for quaternion
+        gt_poses = [
+            camera_to_pose_encoding(in_camera1 @ gt["camera_pose"]).clone()
+            for gt in gts
+        ]
+        pr_poses = [pred["camera_pose"].clone() for pred in preds]
+        pose_norm_factor_gt = norm_factor_gt.clone().squeeze(2, 3)
+        pose_norm_factor_pr = norm_factor_pr.clone().squeeze(2, 3)
+
+        if norm_pose_separately:
+            gt_trans = [gt[:, :3] for gt in gt_poses]
+            pr_trans = [pr[:, :3] for pr in pr_poses]
+            pose_norm_factor_gt, pose_norm_factor_pr = self.get_norm_factor_poses(
+                gt_trans, pr_trans, not_metric_mask
+            )
+        elif any(camera_only):
+            # Samples flagged camera_only take the translation-based factors;
+            # the others keep the point-cloud factors.
+            gt_trans = [gt[:, :3] for gt in gt_poses]
+            pr_trans = [pr[:, :3] for pr in pr_poses]
+            pose_only_norm_factor_gt, pose_only_norm_factor_pr = (
+                self.get_norm_factor_poses(gt_trans, pr_trans, not_metric_mask)
+            )
+            pose_norm_factor_gt = torch.where(
+                camera_only[:, None], pose_only_norm_factor_gt, pose_norm_factor_gt
+            )
+            pose_norm_factor_pr = torch.where(
+                camera_only[:, None], pose_only_norm_factor_pr, pose_norm_factor_pr
+            )
+
+        gt_poses = [
+            (gt[:, :3] / pose_norm_factor_gt.clip(eps), gt[:, 3:]) for gt in gt_poses
+        ]
+        pr_poses = [
+            (pr[:, :3] / pose_norm_factor_pr.clip(eps), pr[:, 3:]) for pr in pr_poses
+        ]
+        # True where both pose factors are above eps.
+        pose_masks = (pose_norm_factor_gt.squeeze() > eps) & (
+            pose_norm_factor_pr.squeeze() > eps
+        )
+
+        if any(camera_only):
+            # this is equal to a loss for camera intrinsics
+            gt_pts_self = [
+                torch.where(
+                    camera_only[:, None, None, None],
+                    (gt / gt[..., -1:].clip(1e-6)).clip(-2, 2),
+                    gt,
+                )
+                for gt in gt_pts_self
+            ]
+            pr_pts_self = [
+                torch.where(
+                    camera_only[:, None, None, None],
+                    (pr / pr[..., -1:].clip(1e-6)).clip(-2, 2),
+                    pr,
+                )
+                for pr in pr_pts_self
+            ]
+            # # do not add cross view loss when there is only camera supervision
+
+        # Sky pixels are those flagged as sky that are not valid points.
+        skys = [gt["sky_mask"] & ~valid for gt, valid in zip(gts, valids)]
+        return (
+            gt_pts_self,
+            gt_pts_cross,
+            pr_pts_self,
+            pr_pts_cross,
+            gt_poses,
+            pr_poses,
+            valids,
+            skys,
+            pose_masks,
+            {},
+        )
+
+    def get_all_pts3d_with_scale_loss(
+        self,
+        gts,
+        preds,
+        dist_clip=None,
+        norm_self_only=False,
+        norm_pose_separately=False,
+        eps=1e-3,
+    ):
+        """Normalize GT/predicted point maps and poses and compute a scale loss.
+
+        Args:
+            gts: list of per-view ground-truth dicts. Keys read here:
+                "camera_pose", "pts3d", "valid_mask", "sky_mask", and on the
+                first view "camera_only", "is_metric" and (only when
+                ``self.max_metric_scale`` is truthy) "is_metric_scale".
+            preds: list of per-view prediction dicts. Keys read here:
+                "pts3d_in_self_view", "pts3d_in_other_view", "conf_self",
+                "conf" and "camera_pose".
+            dist_clip: if not None, GT points whose distance to the first
+                camera exceeds this value are marked invalid.
+            norm_self_only: forwarded to ``get_norm_factor_point_cloud``.
+            norm_pose_separately: if True, translation normalization factors
+                come from ``get_norm_factor_poses`` instead of the point-cloud
+                normalization factors.
+            eps: lower clamp for log-confidences and normalization factors,
+                and threshold used to build ``pose_masks``.
+
+        Returns:
+            Tuple ``(gt_pts_self, gt_pts_cross, pr_pts_self, pr_pts_cross,
+            gt_poses, pr_poses, valids, skys, pose_masks, monitoring)`` where
+            ``gt_poses`` / ``pr_poses`` are lists of ``(translation, rest)``
+            pairs and ``monitoring`` is
+            ``{"scale_loss": pose_scale_loss + pts_scale_loss}``.
+
+        Raises:
+            NotImplementedError: if ``self.norm_mode`` is falsy.
+
+        Note:
+            When ``self.norm_all`` is falsy and ``self.max_metric_scale`` is
+            truthy, ``gts[0]["is_metric"]`` is overwritten in place.
+        """
+        # everything is normalized w.r.t. camera of view1
+        in_camera1 = inv(gts[0]["camera_pose"])
+        # "self": each view's points in its own camera frame;
+        # "cross": every view's points in the first view's camera frame.
+        gt_pts_self = [geotrf(inv(gt["camera_pose"]), gt["pts3d"]) for gt in gts]
+        gt_pts_cross = [geotrf(in_camera1, gt["pts3d"]) for gt in gts]
+        # clone so the dist_clip filtering below never touches the caller's masks
+        valids = [gt["valid_mask"].clone() for gt in gts]
+        camera_only = gts[0]["camera_only"]
+
+        if dist_clip is not None:
+            # points that are too far-away == invalid
+            dis = [gt_pt.norm(dim=-1) for gt_pt in gt_pts_cross]
+            valids = [valid & (dis <= dist_clip) for valid, dis in zip(valids, dis)]
+
+        pr_pts_self = [pred["pts3d_in_self_view"] for pred in preds]
+        pr_pts_cross = [pred["pts3d_in_other_view"] for pred in preds]
+        # detached log-confidences, clamped from below at eps
+        conf_self = [torch.log(pred["conf_self"]).detach().clip(eps) for pred in preds]
+        conf_cross = [torch.log(pred["conf"]).detach().clip(eps) for pred in preds]
+
+        if not self.norm_all:
+            if self.max_metric_scale:
+                B = valids[0].shape[0]
+                # per-view distances to the origin, zeroed where invalid, flattened per sample
+                dist = [
+                    torch.where(valid, torch.linalg.norm(gt_pt_cross, dim=-1), 0).view(
+                        B, -1
+                    )
+                    for valid, gt_pt_cross in zip(valids, gt_pts_cross)
+                ]
+                # NOTE(review): each iteration overwrites gts[0]["is_metric"], so
+                # only the last view's maximum distance takes effect — confirm
+                # this is intended rather than an AND over all views.
+                for d in dist:
+                    gts[0]["is_metric"] = gts[0]["is_metric_scale"] & (
+                        d.max(dim=-1).values < self.max_metric_scale
+                    )
+            not_metric_mask = ~gts[0]["is_metric"]
+        else:
+            # norm_all: every sample is treated as non-metric
+            not_metric_mask = torch.ones_like(gts[0]["is_metric"])
+
+        # normalize 3d points
+        # the factors are computed from the first view only (note the [:1] slices)
+        if self.norm_mode and not self.gt_scale:
+            norm_factor_gt = self.get_norm_factor_point_cloud(
+                gt_pts_self[:1],
+                gt_pts_cross[:1],
+                valids[:1],
+                conf_self[:1],
+                conf_cross[:1],
+                norm_self_only=norm_self_only,
+            )
+        else:
+            # no GT normalization: factor of one, shaped like a per-sample scalar
+            norm_factor_gt = torch.ones_like(
+                preds[0]["pts3d_in_other_view"][:, :1, :1, :1]
+            )
+
+        if self.norm_mode:
+            norm_factor_pr = self.get_norm_factor_point_cloud(
+                pr_pts_self[:1],
+                pr_pts_cross[:1],
+                valids[:1],
+                conf_self[:1],
+                conf_cross[:1],
+                norm_self_only=norm_self_only,
+            )
+        else:
+            raise NotImplementedError
+        # only add loss to metric scale norm factor
+        if (~not_metric_mask).sum() > 0:
+            pts_scale_loss = torch.abs(
+                norm_factor_pr[~not_metric_mask] - norm_factor_gt[~not_metric_mask]
+            ).mean()
+        else:
+            pts_scale_loss = 0.0
+
+        # clamp after the scale loss so the loss sees the unclamped factors
+        norm_factor_gt = norm_factor_gt.clip(eps)
+        norm_factor_pr = norm_factor_pr.clip(eps)
+
+        gt_pts_self = [pts / norm_factor_gt for pts in gt_pts_self]
+        gt_pts_cross = [pts / norm_factor_gt for pts in gt_pts_cross]
+        pr_pts_self = [pts / norm_factor_pr for pts in pr_pts_self]
+        pr_pts_cross = [pts / norm_factor_pr for pts in pr_pts_cross]
+
+        # [(Bx3, BX4), (BX3, BX4), ...], 3 for translation, 4 for quaternion
+        gt_poses = [
+            camera_to_pose_encoding(in_camera1 @ gt["camera_pose"]).clone()
+            for gt in gts
+        ]
+        pr_poses = [pred["camera_pose"].clone() for pred in preds]
+        # by default translations share the point-cloud normalization factors
+        pose_norm_factor_gt = norm_factor_gt.clone().squeeze(2, 3)
+        pose_norm_factor_pr = norm_factor_pr.clone().squeeze(2, 3)
+
+        if norm_pose_separately:
+            gt_trans = [gt[:, :3] for gt in gt_poses][:1]
+            pr_trans = [pr[:, :3] for pr in pr_poses][:1]
+            pose_norm_factor_gt, pose_norm_factor_pr = self.get_norm_factor_poses(
+                gt_trans, pr_trans, torch.ones_like(not_metric_mask)
+            )
+        elif any(camera_only):
+            # camera-only samples get pose-derived factors; the others keep
+            # the point-cloud factors
+            gt_trans = [gt[:, :3] for gt in gt_poses][:1]
+            pr_trans = [pr[:, :3] for pr in pr_poses][:1]
+            pose_only_norm_factor_gt, pose_only_norm_factor_pr = (
+                self.get_norm_factor_poses(
+                    gt_trans, pr_trans, torch.ones_like(not_metric_mask)
+                )
+            )
+            pose_norm_factor_gt = torch.where(
+                camera_only[:, None], pose_only_norm_factor_gt, pose_norm_factor_gt
+            )
+            pose_norm_factor_pr = torch.where(
+                camera_only[:, None], pose_only_norm_factor_pr, pose_norm_factor_pr
+            )
+        # only add loss to metric scale norm factor
+        if (~not_metric_mask).sum() > 0:
+            pose_scale_loss = torch.abs(
+                pose_norm_factor_pr[~not_metric_mask]
+                - pose_norm_factor_gt[~not_metric_mask]
+            ).mean()
+        else:
+            pose_scale_loss = 0.0
+        # split each pose into (normalized translation, remaining components)
+        gt_poses = [
+            (gt[:, :3] / pose_norm_factor_gt.clip(eps), gt[:, 3:]) for gt in gt_poses
+        ]
+        pr_poses = [
+            (pr[:, :3] / pose_norm_factor_pr.clip(eps), pr[:, 3:]) for pr in pr_poses
+        ]
+
+        # samples whose GT or predicted factor is not above eps are excluded
+        # NOTE(review): the bare .squeeze() also removes the batch dimension
+        # when the batch size is 1, yielding a 0-dim mask.
+        pose_masks = (pose_norm_factor_gt.squeeze() > eps) & (
+            pose_norm_factor_pr.squeeze() > eps
+        )
+
+        if any(camera_only):
+            # this is equal to a loss for camera intrinsics
+            # (camera-only samples: divide by the last coordinate, clamp to [-2, 2])
+            gt_pts_self = [
+                torch.where(
+                    camera_only[:, None, None, None],
+                    (gt / gt[..., -1:].clip(1e-6)).clip(-2, 2),
+                    gt,
+                )
+                for gt in gt_pts_self
+            ]
+            pr_pts_self = [
+                torch.where(
+                    camera_only[:, None, None, None],
+                    (pr / pr[..., -1:].clip(1e-6)).clip(-2, 2),
+                    pr,
+                )
+                for pr in pr_pts_self
+            ]
+            # # do not add cross view loss when there is only camera supervision
+
+        # sky pixels that are not already valid points
+        skys = [gt["sky_mask"] & ~valid for gt, valid in zip(gts, valids)]
+        return (
+            gt_pts_self,
+            gt_pts_cross,
+            pr_pts_self,
+            pr_pts_cross,
+            gt_poses,
+            pr_poses,
+            valids,
+            skys,
+            pose_masks,
+            {"scale_loss": pose_scale_loss + pts_scale_loss},
+        )
+
+    def compute_relative_pose_loss(
+        self, gt_trans, gt_quats, pr_trans, pr_quats, masks=None
+    ):
+        if masks is None:
+            masks = torch.ones(len(gt_trans), dtype=torch.bool, device=gt_trans.device)
+        gt_trans_matrix1 = gt_trans[:, :, None, :].repeat(1, 1, gt_trans.shape[1], 1)[
+            masks
+        ]
+        gt_trans_matrix2 = gt_trans[:, None, :, :].repeat(1, gt_trans.shape[1], 1, 1)[
+            masks
+        ]
+        gt_quats_matrix1 = gt_quats[:, :, None, :].repeat(1, 1, gt_quats.shape[1], 1)[
+            masks
+        ]
+        gt_quats_matrix2 = gt_quats[:, None, :, :].repeat(1, gt_quats.shape[1], 1, 1)[
+            masks
+        ]
+        pr_trans_matrix1 = pr_trans[:, :, None, :].repeat(1, 1, pr_trans.shape[1], 1)[
+            masks
+        ]
+        pr_trans_matrix2 = pr_trans[:, None, :, :].repeat(1, pr_trans.shape[1], 1, 1)[
+            masks
+        ]
+        pr_quats_matrix1 = pr_quats[:, :, None, :].repeat(1, 1, pr_quats.shape[1], 1)[
+            masks
+        ]
+        pr_quats_matrix2 = pr_quats[:, None, :, :].repeat(1, pr_quats.shape[1], 1, 1)[
+            masks
+        ]
+
+        gt_rel_trans, gt_rel_quats = relative_pose_absT_quatR(
+            gt_trans_matrix1, gt_quats_matrix1, gt_trans_matrix2, gt_quats_matrix2
+        )
+        pr_rel_trans, pr_rel_quats = relative_pose_absT_quatR(
+            pr_trans_matrix1, pr_quats_matrix1, pr_trans_matrix2, pr_quats_matrix2
+        )
+        rel_trans_err = torch.norm(gt_rel_trans - pr_rel_trans, dim=-1)
+        rel_quats_err = torch.norm(gt_rel_quats - pr_rel_quats, dim=-1)
+        return rel_trans_err.mean() + rel_quats_err.mean()
+
+    def compute_pose_loss(self, gt_poses, pred_poses, masks=None):
+        """
+        gt_pose: list of (Bx3, Bx4)
+        pred_pose: list of (Bx3, Bx4)
+        masks: None, or B
+        """
+        gt_trans = torch.stack([gt[0] for gt in gt_poses], dim=1)  # BxNx3
+        gt_quats = torch.stack([gt[1] for gt in gt_poses], dim=1)  # BXNX3
+        pred_trans = torch.stack([pr[0] for pr in pred_poses], dim=1)  # BxNx4
+        pred_quats = torch.stack([pr[1] for pr in pred_poses], dim=1)  # BxNx4
+        if masks == None:
+            pose_loss = (
+                torch.norm(pred_trans - gt_trans, dim=-1).mean()
+                + torch.norm(pred_quats - gt_quats, dim=-1).mean()
+            )
+        else:
+            if not any(masks):
+                return torch.tensor(0.0)
+            pose_loss = (
+                torch.norm(pred_trans - gt_trans, dim=-1)[masks].mean()
+                + torch.norm(pred_quats - gt_quats, dim=-1)[masks].mean()
+            )
+
+        return pose_loss
+
+    def compute_loss(self, gts, preds, **kw):
+        """Compute self-view and cross-view point-map losses plus the pose loss.
+
+        Args:
+            gts: list of per-view ground-truth dicts; besides what
+                ``get_all_pts3d`` reads, this method reads "img", "camera_only"
+                (first view), optionally "img_mask"/"ray_mask", and
+                "quantile" (first view) when a Quantile criterion is used.
+            preds: list of per-view prediction dicts; reads "conf_self",
+                "conf" and optionally "desc".
+            **kw: forwarded unchanged to ``self.get_all_pts3d``.
+
+        Returns:
+            ``(Sum(*zip(ls, masks)), details | monitoring)`` where ``ls`` is
+            the list of self-view losses followed by the cross-view losses,
+            ``masks`` the matching masks, and ``details`` a dict of scalars
+            and detached tensors for logging, including "pose_loss",
+            "is_self" and "img_ids".
+        """
+        (
+            gt_pts_self,
+            gt_pts_cross,
+            pred_pts_self,
+            pred_pts_cross,
+            gt_poses,
+            pr_poses,
+            masks,
+            skys,
+            pose_masks,
+            monitoring,
+        ) = self.get_all_pts3d(gts, preds, **kw)
+
+        if self.sky_loss_value > 0:
+            assert (
+                self.criterion.reduction == "none"
+            ), "sky_loss_value should be 0 if no conf loss"
+            # sky pixels are supervised too, so add them to the masks
+            masks = [mask | sky for mask, sky in zip(masks, skys)]
+
+        # self view loss and details
+        if "Quantile" in self.criterion.__class__.__name__:
+            # masks are overwritten taking into account self view losses
+            ls_self, masks = self.criterion(
+                pred_pts_self, gt_pts_self, masks, gts[0]["quantile"]
+            )
+        else:
+            ls_self = [
+                self.criterion(pred_pt[mask], gt_pt[mask])
+                for pred_pt, gt_pt, mask in zip(pred_pts_self, gt_pts_self, masks)
+            ]
+
+        if self.sky_loss_value > 0:
+            assert (
+                self.criterion.reduction == "none"
+            ), "sky_loss_value should be 0 if no conf loss"
+            # at sky pixels replace the regression loss by the constant sky_loss_value
+            for i, l in enumerate(ls_self):
+                ls_self[i] = torch.where(skys[i][masks[i]], self.sky_loss_value, l)
+
+        self_name = type(self).__name__
+
+        # logging payload: scalar means plus detached tensors per view (1-based keys)
+        details = {}
+        for i in range(len(ls_self)):
+            details[self_name + f"_self_pts3d/{i+1}"] = float(ls_self[i].mean())
+            details[f"gt_img{i+1}"] = gts[i]["img"].permute(0, 2, 3, 1).detach()
+            details[f"self_conf_{i+1}"] = preds[i]["conf_self"].detach()
+            details[f"valid_mask_{i+1}"] = masks[i].detach()
+
+            if "img_mask" in gts[i] and "ray_mask" in gts[i]:
+                details[f"img_mask_{i+1}"] = gts[i]["img_mask"].detach()
+                details[f"ray_mask_{i+1}"] = gts[i]["ray_mask"].detach()
+
+            if "desc" in preds[i]:
+                details[f"desc_{i+1}"] = preds[i]["desc"].detach()
+
+        # cross view loss and details
+        # camera-only samples are dropped from every cross-view tensor
+        camera_only = gts[0]["camera_only"]
+        pred_pts_cross = [pred_pts[~camera_only] for pred_pts in pred_pts_cross]
+        gt_pts_cross = [gt_pts[~camera_only] for gt_pts in gt_pts_cross]
+        masks_cross = [mask[~camera_only] for mask in masks]
+        skys_cross = [sky[~camera_only] for sky in skys]
+
+        if "Quantile" in self.criterion.__class__.__name__:
+            # quantile masks have already been determined by self view losses, here pass in None as quantile
+            ls_cross, _ = self.criterion(
+                pred_pts_cross, gt_pts_cross, masks_cross, None
+            )
+        else:
+            ls_cross = [
+                self.criterion(pred_pt[mask], gt_pt[mask])
+                for pred_pt, gt_pt, mask in zip(
+                    pred_pts_cross, gt_pts_cross, masks_cross
+                )
+            ]
+
+        if self.sky_loss_value > 0:
+            assert (
+                self.criterion.reduction == "none"
+            ), "sky_loss_value should be 0 if no conf loss"
+            for i, l in enumerate(ls_cross):
+                ls_cross[i] = torch.where(
+                    skys_cross[i][masks_cross[i]], self.sky_loss_value, l
+                )
+
+        for i in range(len(ls_cross)):
+            # the cross loss is empty when every sample is camera-only; log 0 then
+            details[self_name + f"_pts3d/{i+1}"] = float(
+                ls_cross[i].mean() if ls_cross[i].numel() > 0 else 0
+            )
+            details[f"conf_{i+1}"] = preds[i]["conf"].detach()
+
+        # self-view entries first, then cross-view entries; is_self / img_ids
+        # record which entry is which and for which view
+        ls = ls_self + ls_cross
+        masks = masks + masks_cross
+        details["is_self"] = [True] * len(ls_self) + [False] * len(ls_cross)
+        details["img_ids"] = (
+            np.arange(len(ls_self)).tolist() + np.arange(len(ls_cross)).tolist()
+        )
+        details["pose_loss"] = self.compute_pose_loss(gt_poses, pr_poses, pose_masks)
+
+        return Sum(*list(zip(ls, masks))), (details | monitoring)
+
+
+class Regr3DPoseBatchList(Regr3DPose):
+    """Ensure that all 3D points are correct.
+    Asymmetric loss: view1 is supposed to be the anchor.
+
+    P1 = RT1 @ D1
+    P2 = RT2 @ D2
+    loss1 = (I @ pred_D1) - (RT1^-1 @ RT1 @ D1)
+    loss2 = (RT21 @ pred_D2) - (RT1^-1 @ P2)
+          = (RT21 @ pred_D2) - (RT1^-1 @ RT2 @ D2)
+    """
+
+    def __init__(
+        self,
+        criterion,
+        norm_mode="?avg_dis",
+        gt_scale=False,
+        sky_loss_value=2,
+        max_metric_scale=False,
+    ):
+        super().__init__(
+            criterion, norm_mode, gt_scale, sky_loss_value, max_metric_scale
+        )
+        self.depth_only_criterion = DepthScaleShiftInvLoss()
+        self.single_view_criterion = ScaleInvLoss()
+
+    def reorg(self, ls_b, masks_b):
+        ids_split = [mask.sum(dim=(1, 2)) for mask in masks_b]
+        ls = [[] for _ in range(len(masks_b[0]))]
+        for i in range(len(ls_b)):
+            ls_splitted_i = torch.split(ls_b[i], ids_split[i].tolist())
+            for j in range(len(masks_b[0])):
+                ls[j].append(ls_splitted_i[j])
+        ls = [torch.cat(l) for l in ls]
+        return ls
+
+    def compute_loss(self, gts, preds, **kw):
+        (
+            gt_pts_self,
+            gt_pts_cross,
+            pred_pts_self,
+            pred_pts_cross,
+            gt_poses,
+            pr_poses,
+            masks,
+            skys,
+            pose_masks,
+            monitoring,
+        ) = self.get_all_pts3d(gts, preds, **kw)
+
+        if self.sky_loss_value > 0:
+            assert (
+                self.criterion.reduction == "none"
+            ), "sky_loss_value should be 0 if no conf loss"
+            masks = [mask | sky for mask, sky in zip(masks, skys)]
+
+        camera_only = gts[0]["camera_only"]
+        depth_only = gts[0]["depth_only"]
+        single_view = gts[0]["single_view"]
+        is_metric = gts[0]["is_metric"]
+
+        # self view loss and details
+        if "Quantile" in self.criterion.__class__.__name__:
+            raise NotImplementedError
+        else:
+            # list [(B, h, w, 3)] x num_views -> list [num_views, h, w, 3] x B
+            gt_pts_self_b = torch.unbind(torch.stack(gt_pts_self, dim=1), dim=0)
+            pred_pts_self_b = torch.unbind(torch.stack(pred_pts_self, dim=1), dim=0)
+            masks_b = torch.unbind(torch.stack(masks, dim=1), dim=0)
+            ls_self_b = []
+            for i in range(len(gt_pts_self_b)):
+                if depth_only[
+                    i
+                ]:  # if only have relative depth, no intrinsics or anything
+                    ls_self_b.append(
+                        self.depth_only_criterion(
+                            pred_pts_self_b[i][..., -1],
+                            gt_pts_self_b[i][..., -1],
+                            masks_b[i],
+                        )
+                    )
+                elif (
+                    single_view[i] and not is_metric[i]
+                ):  # if single view, with intrinsics and not metric
+                    ls_self_b.append(
+                        self.single_view_criterion(
+                            pred_pts_self_b[i], gt_pts_self_b[i], masks_b[i]
+                        )
+                    )
+                else:  # if multiple view, or metric single view
+                    ls_self_b.append(
+                        self.criterion(
+                            pred_pts_self_b[i][masks_b[i]], gt_pts_self_b[i][masks_b[i]]
+                        )
+                    )
+            ls_self = self.reorg(ls_self_b, masks_b)
+
+        if self.sky_loss_value > 0:
+            assert (
+                self.criterion.reduction == "none"
+            ), "sky_loss_value should be 0 if no conf loss"
+            for i, l in enumerate(ls_self):
+                ls_self[i] = torch.where(skys[i][masks[i]], self.sky_loss_value, l)
+
+        self_name = type(self).__name__
+
+        details = {}
+        for i in range(len(ls_self)):
+            details[self_name + f"_self_pts3d/{i+1}"] = float(ls_self[i].mean())
+            details[f"self_conf_{i+1}"] = preds[i]["conf_self"].detach()
+            details[f"gt_img{i+1}"] = gts[i]["img"].permute(0, 2, 3, 1).detach()
+            details[f"valid_mask_{i+1}"] = masks[i].detach()
+
+            if "img_mask" in gts[i] and "ray_mask" in gts[i]:
+                details[f"img_mask_{i+1}"] = gts[i]["img_mask"].detach()
+                details[f"ray_mask_{i+1}"] = gts[i]["ray_mask"].detach()
+
+            if "desc" in preds[i]:
+                details[f"desc_{i+1}"] = preds[i]["desc"].detach()
+
+        if "Quantile" in self.criterion.__class__.__name__:
+            # quantile masks have already been determined by self view losses, here pass in None as quantile
+            raise NotImplementedError
+        else:
+            gt_pts_cross_b = torch.unbind(
+                torch.stack(gt_pts_cross, dim=1)[~camera_only], dim=0
+            )
+            pred_pts_cross_b = torch.unbind(
+                torch.stack(pred_pts_cross, dim=1)[~camera_only], dim=0
+            )
+            masks_cross_b = torch.unbind(torch.stack(masks, dim=1)[~camera_only], dim=0)
+            ls_cross_b = []
+            for i in range(len(gt_pts_cross_b)):
+                if depth_only[~camera_only][i]:
+                    ls_cross_b.append(
+                        self.depth_only_criterion(
+                            pred_pts_cross_b[i][..., -1],
+                            gt_pts_cross_b[i][..., -1],
+                            masks_cross_b[i],
+                        )
+                    )
+                elif single_view[~camera_only][i] and not is_metric[~camera_only][i]:
+                    ls_cross_b.append(
+                        self.single_view_criterion(
+                            pred_pts_cross_b[i], gt_pts_cross_b[i], masks_cross_b[i]
+                        )
+                    )
+                else:
+                    ls_cross_b.append(
+                        self.criterion(
+                            pred_pts_cross_b[i][masks_cross_b[i]],
+                            gt_pts_cross_b[i][masks_cross_b[i]],
+                        )
+                    )
+            ls_cross = self.reorg(ls_cross_b, masks_cross_b)
+
+        if self.sky_loss_value > 0:
+            assert (
+                self.criterion.reduction == "none"
+            ), "sky_loss_value should be 0 if no conf loss"
+            masks_cross = [mask[~camera_only] for mask in masks]
+            skys_cross = [sky[~camera_only] for sky in skys]
+            for i, l in enumerate(ls_cross):
+                ls_cross[i] = torch.where(
+                    skys_cross[i][masks_cross[i]], self.sky_loss_value, l
+                )
+
+        for i in range(len(ls_cross)):
+            details[self_name + f"_pts3d/{i+1}"] = float(
+                ls_cross[i].mean() if ls_cross[i].numel() > 0 else 0
+            )
+            details[f"conf_{i+1}"] = preds[i]["conf"].detach()
+
+        ls = ls_self + ls_cross
+        masks = masks + masks_cross
+        details["is_self"] = [True] * len(ls_self) + [False] * len(ls_cross)
+        details["img_ids"] = (
+            np.arange(len(ls_self)).tolist() + np.arange(len(ls_cross)).tolist()
+        )
+        pose_masks = pose_masks * gts[i]["img_mask"]
+        details["pose_loss"] = self.compute_pose_loss(gt_poses, pr_poses, pose_masks)
+
+        return Sum(*list(zip(ls, masks))), (details | monitoring)
+
+
+class ConfLoss(MultiLoss):
+    """Weighted regression by learned confidence.
+        Assuming the input pixel_loss is a pixel-level regression loss.
+
+    Principle:
+        high-confidence means high conf = 0.1 ==> conf_loss = x / 10 + alpha*log(10)
+        low  confidence means low  conf = 10  ==> conf_loss = x * 10 - alpha*log(10)
+
+        alpha: hyperparameter
+    """
+
+    def __init__(self, pixel_loss, alpha=1):
+        super().__init__()
+        assert alpha > 0
+        self.alpha = alpha
+        self.pixel_loss = pixel_loss.with_reduction("none")
+
+    def get_name(self):
+        return f"ConfLoss({self.pixel_loss})"
+
+    def get_conf_log(self, x):
+        return x, torch.log(x)
+
+    def compute_loss(self, gts, preds, **kw):
+        # compute per-pixel loss
+        losses_and_masks, details = self.pixel_loss(gts, preds, **kw)
+        if "is_self" in details and "img_ids" in details:
+            is_self = details["is_self"]
+            img_ids = details["img_ids"]
+        else:
+            is_self = [False] * len(losses_and_masks)
+            img_ids = list(range(len(losses_and_masks)))
+
+        # weight by confidence
+        conf_losses = []
+
+        for i in range(len(losses_and_masks)):
+            pred = preds[img_ids[i]]
+            conf_key = "conf_self" if is_self[i] else "conf"
+            if not is_self[i]:
+                camera_only = gts[0]["camera_only"]
+                conf, log_conf = self.get_conf_log(
+                    pred[conf_key][~camera_only][losses_and_masks[i][1]]
+                )
+            else:
+                conf, log_conf = self.get_conf_log(
+                    pred[conf_key][losses_and_masks[i][1]]
+                )
+
+            conf_loss = losses_and_masks[i][0] * conf - self.alpha * log_conf
+            conf_loss = conf_loss.mean() if conf_loss.numel() > 0 else 0
+            conf_losses.append(conf_loss)
+
+            if is_self[i]:
+                details[self.get_name() + f"_conf_loss_self/{img_ids[i]+1}"] = float(
+                    conf_loss
+                )
+            else:
+                details[self.get_name() + f"_conf_loss/{img_ids[i]+1}"] = float(
+                    conf_loss
+                )
+
+        details.pop("is_self", None)
+        details.pop("img_ids", None)
+
+        final_loss = sum(conf_losses) / len(conf_losses) * 2.0
+        if "pose_loss" in details:
+            final_loss = (
+                final_loss + details["pose_loss"].clip(max=0.3) * 5.0
+            )  # , details
+        if "scale_loss" in details:
+            final_loss = final_loss + details["scale_loss"]
+        return final_loss, details
+
+
+class Regr3DPose_ScaleInv(Regr3DPose):
+    """Same than Regr3D but invariant to depth shift.
+    if gt_scale == True: enforce the prediction to take the same scale than GT
+    """
+
+    def get_all_pts3d(self, gts, preds):
+        # compute depth-normalized points
+        (
+            gt_pts_self,
+            gt_pts_cross,
+            pr_pts_self,
+            pr_pts_cross,
+            gt_poses,
+            pr_poses,
+            masks,
+            skys,
+            pose_masks,
+            monitoring,
+        ) = super().get_all_pts3d(gts, preds)
+
+        # measure scene scale
+        _, gt_scale_self = get_group_pointcloud_center_scale(gt_pts_self, masks)
+        _, pred_scale_self = get_group_pointcloud_center_scale(pr_pts_self, masks)
+
+        _, gt_scale_cross = get_group_pointcloud_center_scale(gt_pts_cross, masks)
+        _, pred_scale_cross = get_group_pointcloud_center_scale(pr_pts_cross, masks)
+
+        # prevent predictions to be in a ridiculous range
+        pred_scale_self = pred_scale_self.clip(min=1e-3, max=1e3)
+        pred_scale_cross = pred_scale_cross.clip(min=1e-3, max=1e3)
+
+        # subtract the median depth
+        if self.gt_scale:
+            pr_pts_self = [
+                pr_pt_self * gt_scale_self / pred_scale_self
+                for pr_pt_self in pr_pts_self
+            ]
+            pr_pts_cross = [
+                pr_pt_cross * gt_scale_cross / pred_scale_cross
+                for pr_pt_cross in pr_pts_cross
+            ]
+        else:
+            gt_pts_self = [gt_pt_self / gt_scale_self for gt_pt_self in gt_pts_self]
+            gt_pts_cross = [
+                gt_pt_cross / gt_scale_cross for gt_pt_cross in gt_pts_cross
+            ]
+            pr_pts_self = [pr_pt_self / pred_scale_self for pr_pt_self in pr_pts_self]
+            pr_pts_cross = [
+                pr_pt_cross / pred_scale_cross for pr_pt_cross in pr_pts_cross
+            ]
+
+        return (
+            gt_pts_self,
+            gt_pts_cross,
+            pr_pts_self,
+            pr_pts_cross,
+            gt_poses,
+            pr_poses,
+            masks,
+            skys,
+            pose_masks,
+            monitoring,
+        )
diff --git a/extern/CUT3R/src/dust3r/model.py b/extern/CUT3R/src/dust3r/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee73844b0f727286b3767ac9063b87079abbfdb1
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/model.py
@@ -0,0 +1,1120 @@
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.path.dirname(__file__)))
+from collections import OrderedDict
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils.checkpoint import checkpoint
+from copy import deepcopy
+from functools import partial
+from typing import Optional, Tuple, List, Any
+from dataclasses import dataclass
+from transformers import PretrainedConfig
+from transformers.file_utils import ModelOutput
+from dust3r.utils.misc import (
+    fill_default_args,
+    freeze_all_params,
+    is_symmetrized,
+    interleave,
+    transpose_to_landscape,
+)
+from dust3r.heads import head_factory
+from dust3r.utils.camera import PoseEncoder
+from dust3r.patch_embed import get_patch_embed
+import dust3r.utils.path_to_croco  # noqa: F401
+from models.croco import CroCoNet, CrocoConfig  # noqa
+from dust3r.blocks import (
+    Block,
+    DecoderBlock,
+    Mlp,
+    Attention,
+    CrossAttention,
+    DropPath,
+    CustomDecoderBlock,
+)  # noqa
+
+inf = float("inf")
+from accelerate.logging import get_logger
+
+printer = get_logger(__name__, log_level="DEBUG")
+
+
+@dataclass
+class ARCroco3DStereoOutput(ModelOutput):
+    """
+    Custom output class for ARCroco3DStereo.
+
+    Both fields default to None and are typed as optional lists of arbitrary
+    objects; their element layout is not defined in this class.
+    """
+
+    # presumably the per-view results produced by the model — TODO confirm
+    ress: Optional[List[Any]] = None
+    # presumably the per-view inputs the results correspond to — TODO confirm
+    views: Optional[List[Any]] = None
+
+
+def strip_module(state_dict):
+    """
+    Removes the 'module.' prefix from the keys of a state_dict.
+    Args:
+        state_dict (dict): The original state_dict with possible 'module.' prefixes.
+    Returns:
+        OrderedDict: A new state_dict with 'module.' prefixes removed.
+    """
+    new_state_dict = OrderedDict()
+    for k, v in state_dict.items():
+        name = k[7:] if k.startswith("module.") else k
+        new_state_dict[name] = v
+    return new_state_dict
+
+
+def load_model(model_path, device, verbose=True):
+    """Load a checkpoint, rebuild the network it describes, and move it to device.
+
+    The checkpoint must contain ``ckpt["args"].model`` (a string holding the
+    Python expression that constructs the network) and ``ckpt["model"]`` (the
+    state dict). The constructor expression is rewritten so that it uses
+    ``PatchEmbedDust3R`` and ``landscape_only=False`` before being evaluated.
+
+    Args:
+        model_path: path passed to ``torch.load``.
+        device: target device for the returned network.
+        verbose: if True, print progress and the ``load_state_dict`` result.
+
+    Returns:
+        The instantiated network with weights loaded, moved to ``device``.
+
+    Raises:
+        AssertionError: if the rewritten expression does not contain
+            ``landscape_only=False``.
+    """
+    if verbose:
+        print("... loading model from", model_path)
+    # NOTE(security): weights_only=False unpickles arbitrary objects; only load
+    # checkpoints from trusted sources.
+    ckpt = torch.load(model_path, map_location="cpu", weights_only=False)
+    args = ckpt["args"].model.replace(
+        "ManyAR_PatchEmbed", "PatchEmbedDust3R"
+    )  # ManyAR only for aspect ratio not consistent
+    if "landscape_only" not in args:
+        # assumes the expression ends with "))" so the argument can be spliced
+        # in before the two closing parentheses — TODO confirm for all checkpoints
+        args = args[:-2] + ", landscape_only=False))"
+    else:
+        # spaces are stripped first so "landscape_only = True" is matched as well
+        args = args.replace(" ", "").replace(
+            "landscape_only=True", "landscape_only=False"
+        )
+    assert "landscape_only=False" in args
+    if verbose:
+        print(f"instantiating : {args}")
+    # NOTE(security): eval() executes a string stored in the checkpoint; this is
+    # arbitrary code execution if the checkpoint is untrusted.
+    net = eval(args)
+    # strict=False: missing/unexpected keys are reported in `s`, not raised
+    s = net.load_state_dict(ckpt["model"], strict=False)
+    if verbose:
+        print(s)
+    return net.to(device)
+
+
+class ARCroco3DStereoConfig(PretrainedConfig):
+    """Configuration holder for ARCroco3DStereo.
+
+    Every named constructor argument is stored unchanged as an attribute of
+    the same name. Any additional keyword arguments are collected into
+    ``croco_kwargs`` (they are not passed to ``PretrainedConfig.__init__``).
+    """
+
+    model_type = "arcroco_3d_stereo"
+
+    def __init__(
+        self,
+        output_mode="pts3d",
+        head_type="linear",  # or dpt
+        depth_mode=("exp", -float("inf"), float("inf")),
+        conf_mode=("exp", 1, float("inf")),
+        pose_mode=("exp", -float("inf"), float("inf")),
+        freeze="none",
+        landscape_only=True,
+        patch_embed_cls="PatchEmbedDust3R",
+        ray_enc_depth=2,
+        state_size=324,
+        local_mem_size=256,
+        state_pe="2d",
+        state_dec_num_heads=16,
+        depth_head=False,
+        rgb_head=False,
+        pose_conf_head=False,
+        pose_head=False,
+        **croco_kwargs,
+    ):
+        # the base config is initialised with its defaults only
+        super().__init__()
+        self.output_mode = output_mode
+        self.head_type = head_type
+        self.depth_mode = depth_mode
+        self.conf_mode = conf_mode
+        self.pose_mode = pose_mode
+        self.freeze = freeze
+        self.landscape_only = landscape_only
+        self.patch_embed_cls = patch_embed_cls
+        self.ray_enc_depth = ray_enc_depth
+        self.state_size = state_size
+        self.state_pe = state_pe
+        self.state_dec_num_heads = state_dec_num_heads
+        self.local_mem_size = local_mem_size
+        self.depth_head = depth_head
+        self.rgb_head = rgb_head
+        self.pose_conf_head = pose_conf_head
+        self.pose_head = pose_head
+        # remaining keyword arguments, kept together as one dict
+        self.croco_kwargs = croco_kwargs
+
+
+class LocalMemory(nn.Module):
+    """Learnable memory bank written to and read from with ``DecoderBlock`` stacks.
+
+    Keys are projected from ``k_dim`` to ``v_dim`` by ``proj_q`` and concatenated
+    with a value vector, so every memory slot and every token handled by the
+    read/write blocks has width ``2 * v_dim``.
+    """
+
+    def __init__(
+        self,
+        size,
+        k_dim,
+        v_dim,
+        num_heads,
+        depth=2,
+        mlp_ratio=4.0,
+        qkv_bias=False,
+        drop=0.0,
+        attn_drop=0.0,
+        drop_path=0.0,
+        act_layer=nn.GELU,
+        norm_layer=nn.LayerNorm,
+        norm_mem=True,
+        rope=None,
+    ) -> None:
+        """Create the projection, the learnable tokens and both block stacks.
+
+        ``size`` is the number of memory slots; ``depth`` is the number of
+        blocks in each of the write and read stacks. The remaining arguments
+        are forwarded unchanged to every ``DecoderBlock``.
+        """
+        super().__init__()
+        token_dim = 2 * v_dim
+        self.v_dim = v_dim
+        self.proj_q = nn.Linear(k_dim, v_dim)
+        # Stand-in for the unknown value half of a read query.
+        self.masked_token = nn.Parameter(
+            torch.randn(1, 1, v_dim) * 0.2, requires_grad=True
+        )
+        # Initial memory content, shared across the batch (leading dim is 1).
+        self.mem = nn.Parameter(
+            torch.randn(1, size, token_dim) * 0.2, requires_grad=True
+        )
+
+        block_kwargs = dict(
+            mlp_ratio=mlp_ratio,
+            qkv_bias=qkv_bias,
+            norm_layer=norm_layer,
+            attn_drop=attn_drop,
+            drop=drop,
+            drop_path=drop_path,
+            act_layer=act_layer,
+            norm_mem=norm_mem,
+            rope=rope,
+        )
+
+        def build_stack():
+            # One independent stack of `depth` blocks.
+            return nn.ModuleList(
+                [
+                    DecoderBlock(token_dim, num_heads, **block_kwargs)
+                    for _ in range(depth)
+                ]
+            )
+
+        # Write stack first, then read stack (keeps parameter creation order).
+        self.write_blocks = build_stack()
+        self.read_blocks = build_stack()
+
+    def update_mem(self, mem, feat_k, feat_v):
+        """Return the memory after writing one (key, value) entry into it.
+
+        mem: current memory, last dim ``2 * v_dim``
+        feat_k: key feature, last dim ``k_dim`` (original note: [B, 1, C])
+        feat_v: value feature, last dim ``v_dim`` (original note: [B, 1, C])
+        """
+        projected_key = self.proj_q(feat_k)
+        entry = torch.cat([projected_key, feat_v], dim=-1)
+        for write_blk in self.write_blocks:
+            # The memory is the stream being updated; the entry is what it attends to.
+            mem, _ = write_blk(mem, entry, None, None)
+        return mem
+
+    def inquire(self, query, mem):
+        """Read from ``mem`` with ``query`` and return the value half of the result."""
+        projected_query = self.proj_q(query)
+        placeholder = self.masked_token.expand(projected_query.shape[0], -1, -1)
+        tokens = torch.cat([projected_query, placeholder], dim=-1)
+        for read_blk in self.read_blocks:
+            tokens, _ = read_blk(tokens, mem, None, None)
+        # Only the trailing v_dim channels (the value slot) are returned.
+        return tokens[..., -self.v_dim :]
+
+
+class ARCroco3DStereo(CroCoNet):
+    config_class = ARCroco3DStereoConfig
+    base_model_prefix = "arcroco3dstereo"
+    supports_gradient_checkpointing = True
+
+    def __init__(self, config: ARCroco3DStereoConfig):
+        """Build the model from ``config``.
+
+        The backbone is constructed by ``super().__init__`` from
+        ``config.croco_kwargs``. This constructor then adds the ray-map encoder
+        blocks, the optional pose token and pose memory, the state tokens, the
+        masked-input tokens, the state decoder and the downstream head, and
+        finally applies ``config.freeze``.
+        """
+        # Plain attributes are assigned before super().__init__();
+        # presumably the parent constructor calls hooks such as
+        # _set_patch_embed that read them — TODO confirm.
+        self.gradient_checkpointing = False
+        self.fixed_input_length = True
+        # NOTE: this overwrites croco_kwargs on the caller's config object.
+        config.croco_kwargs = fill_default_args(
+            config.croco_kwargs, CrocoConfig.__init__
+        )
+        self.config = config
+        self.patch_embed_cls = config.patch_embed_cls
+        self.croco_args = config.croco_kwargs
+        croco_cfg = CrocoConfig(**self.croco_args)
+        super().__init__(croco_cfg)
+        # Separate transformer stack for ray maps; 16 and 4 are passed
+        # positionally to Block (presumably num_heads and mlp_ratio — confirm).
+        self.enc_blocks_ray_map = nn.ModuleList(
+            [
+                Block(
+                    self.enc_embed_dim,
+                    16,
+                    4,
+                    qkv_bias=True,
+                    norm_layer=partial(nn.LayerNorm, eps=1e-6),
+                    rope=self.rope,
+                )
+                for _ in range(config.ray_enc_depth)
+            ]
+        )
+        self.enc_norm_ray_map = nn.LayerNorm(self.enc_embed_dim, eps=1e-6)
+        self.dec_num_heads = self.croco_args["dec_num_heads"]
+        self.pose_head_flag = config.pose_head
+        # pose_token and pose_retriever exist only when the pose head is enabled.
+        if self.pose_head_flag:
+            self.pose_token = nn.Parameter(
+                torch.randn(1, 1, self.dec_embed_dim) * 0.02, requires_grad=True
+            )
+            self.pose_retriever = LocalMemory(
+                size=config.local_mem_size,
+                k_dim=self.enc_embed_dim,
+                v_dim=self.dec_embed_dim,
+                num_heads=self.dec_num_heads,
+                mlp_ratio=4,
+                qkv_bias=True,
+                attn_drop=0.0,
+                norm_layer=partial(nn.LayerNorm, eps=1e-6),
+                rope=None,
+            )
+        # One learnable embedding per state token.
+        self.register_tokens = nn.Embedding(config.state_size, self.enc_embed_dim)
+        self.state_size = config.state_size
+        self.state_pe = config.state_pe
+        # Learnable substitutes used by _encode_views where the image or the
+        # ray map is masked out.
+        self.masked_img_token = nn.Parameter(
+            torch.randn(1, self.enc_embed_dim) * 0.02, requires_grad=True
+        )
+        self.masked_ray_map_token = nn.Parameter(
+            torch.randn(1, self.enc_embed_dim) * 0.02, requires_grad=True
+        )
+        # The state decoder reuses the image decoder's depth and embedding
+        # size but has its own head count (config.state_dec_num_heads).
+        self._set_state_decoder(
+            self.enc_embed_dim,
+            self.dec_embed_dim,
+            config.state_dec_num_heads,
+            self.dec_depth,
+            self.croco_args.get("mlp_ratio", None),
+            self.croco_args.get("norm_layer", None),
+            self.croco_args.get("norm_im2_in_dec", None),
+        )
+        # patch_size and img_size reach set_downstream_head through **croco_args.
+        self.set_downstream_head(
+            config.output_mode,
+            config.head_type,
+            config.landscape_only,
+            config.depth_mode,
+            config.conf_mode,
+            config.pose_mode,
+            config.depth_head,
+            config.rgb_head,
+            config.pose_conf_head,
+            config.pose_head,
+            **self.croco_args,
+        )
+        self.set_freeze(config.freeze)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kw):
+        """Load a model from a local checkpoint file or through the parent loader.
+
+        Args:
+            pretrained_model_name_or_path: path to a checkpoint file, or an
+                identifier handed to the parent class's ``from_pretrained``.
+            **kw: forwarded to the parent ``from_pretrained`` only; ignored
+                when the argument is an existing file.
+
+        Returns:
+            The loaded model. A local file is loaded with ``load_model`` on CPU.
+
+        Raises:
+            Exception: when the parent loader fails with ``TypeError``; the
+                original error is attached as ``__cause__``.
+        """
+        if os.path.isfile(pretrained_model_name_or_path):
+            return load_model(pretrained_model_name_or_path, device="cpu")
+        try:
+            return super(ARCroco3DStereo, cls).from_pretrained(
+                pretrained_model_name_or_path, **kw
+            )
+        except TypeError as e:
+            # Chain the cause so the underlying failure stays in the traceback.
+            raise Exception(
+                f"tried to load {pretrained_model_name_or_path} from huggingface, but failed"
+            ) from e
+
+    def _set_patch_embed(self, img_size=224, patch_size=16, enc_embed_dim=768):
+        """Create the two patch embedders: 3-channel images and 6-channel ray maps."""
+        shared_args = (self.patch_embed_cls, img_size, patch_size, enc_embed_dim)
+        # Image embedder first, ray-map embedder second (same creation order).
+        self.patch_embed = get_patch_embed(*shared_args, in_chans=3)
+        self.patch_embed_ray_map = get_patch_embed(*shared_args, in_chans=6)
+
+    def _set_decoder(
+        self,
+        enc_embed_dim,
+        dec_embed_dim,
+        dec_num_heads,
+        dec_depth,
+        mlp_ratio,
+        norm_layer,
+        norm_im2_in_dec,
+    ):
+        """Create the image-branch decoder: input projection, blocks and final norm.
+
+        Also records ``dec_depth`` and ``dec_embed_dim`` on the instance.
+        """
+        self.dec_depth = dec_depth
+        self.dec_embed_dim = dec_embed_dim
+        # Maps encoder-width tokens to decoder width.
+        self.decoder_embed = nn.Linear(enc_embed_dim, dec_embed_dim, bias=True)
+        block_kwargs = dict(
+            mlp_ratio=mlp_ratio,
+            qkv_bias=True,
+            norm_layer=norm_layer,
+            norm_mem=norm_im2_in_dec,
+            rope=self.rope,
+        )
+        self.dec_blocks = nn.ModuleList(
+            [
+                DecoderBlock(dec_embed_dim, dec_num_heads, **block_kwargs)
+                for _ in range(dec_depth)
+            ]
+        )
+        self.dec_norm = norm_layer(dec_embed_dim)
+
+    def _set_state_decoder(
+        self,
+        enc_embed_dim,
+        dec_embed_dim,
+        dec_num_heads,
+        dec_depth,
+        mlp_ratio,
+        norm_layer,
+        norm_im2_in_dec,
+    ):
+        """Create the state-branch decoder: input projection, blocks and final norm.
+
+        Mirrors ``_set_decoder`` but stores everything under ``*_state`` names.
+        """
+        self.dec_depth_state = dec_depth
+        self.dec_embed_dim_state = dec_embed_dim
+        # Maps encoder-width state tokens to decoder width.
+        self.decoder_embed_state = nn.Linear(enc_embed_dim, dec_embed_dim, bias=True)
+        block_kwargs = dict(
+            mlp_ratio=mlp_ratio,
+            qkv_bias=True,
+            norm_layer=norm_layer,
+            norm_mem=norm_im2_in_dec,
+            rope=self.rope,
+        )
+        self.dec_blocks_state = nn.ModuleList(
+            [
+                DecoderBlock(dec_embed_dim, dec_num_heads, **block_kwargs)
+                for _ in range(dec_depth)
+            ]
+        )
+        self.dec_norm_state = norm_layer(dec_embed_dim)
+
+    def load_state_dict(self, ckpt, **kw):
+        """Load ``ckpt`` with progressively more lenient fallbacks.
+
+        Steps:
+          1. If every key starts with ``"module"``, the keys are passed
+             through ``strip_module`` first.
+          2. If the checkpoint has no ``dec_blocks_state*`` keys, every
+             ``dec_blocks*`` entry is duplicated under the matching
+             ``dec_blocks_state*`` name.
+          3. Try loading the result as is.
+          4. On failure, retry without any ``dec_blocks*``, ``dec_norm*`` and
+             ``decoder_embed*`` entries.
+          5. On failure, load only the entries whose key exists in the model
+             with the same tensor size; every skipped key is logged.
+
+        Args:
+            ckpt: mapping from parameter name to tensor.
+            **kw: forwarded to ``nn.Module.load_state_dict`` (e.g. ``strict``).
+
+        Returns:
+            Whatever the parent ``load_state_dict`` returns for the first
+            attempt that succeeds.
+
+        Raises:
+            Whatever the final attempt raises (for instance missing keys when
+            loading strictly).
+        """
+        if all(k.startswith("module") for k in ckpt):
+            ckpt = strip_module(ckpt)
+        new_ckpt = dict(ckpt)
+        if not any(k.startswith("dec_blocks_state") for k in ckpt):
+            for key, value in ckpt.items():
+                if key.startswith("dec_blocks"):
+                    new_ckpt[key.replace("dec_blocks", "dec_blocks_state")] = value
+        try:
+            return super().load_state_dict(new_ckpt, **kw)
+        except Exception:
+            # `Exception` rather than a bare except, so KeyboardInterrupt and
+            # SystemExit are never swallowed by the fallback chain.
+            try:
+                without_decoder = {
+                    k: v
+                    for k, v in new_ckpt.items()
+                    if not k.startswith("dec_blocks")
+                    and not k.startswith("dec_norm")
+                    and not k.startswith("decoder_embed")
+                }
+                return super().load_state_dict(without_decoder, **kw)
+            except Exception:
+                # state_dict() rebuilds the whole mapping on every call, so
+                # take one snapshot instead of several per key.
+                own_state = self.state_dict()
+                compatible = {}
+                for key, value in new_ckpt.items():
+                    if key not in own_state:
+                        printer.info(f"Skipping '{key}': not found in model")
+                    elif value.size() != own_state[key].size():
+                        printer.info(
+                            f"Skipping '{key}': size mismatch (ckpt: {value.size()}, model: {own_state[key].size()})"
+                        )
+                    else:
+                        compatible[key] = value
+                return super().load_state_dict(compatible, **kw)
+
+    def set_freeze(self, freeze):  # this is for use by downstream models
+        """Freeze the parameter group selected by ``freeze``.
+
+        Args:
+            freeze: one of ``"none"``, ``"mask"``, ``"encoder"``,
+                ``"encoder_and_head"``, ``"encoder_and_decoder"``, ``"decoder"``.
+
+        Raises:
+            KeyError: if ``freeze`` is not one of the modes above.
+
+        The pose token and pose memory are created only when the pose head is
+        enabled, so they are looked up defensively. Reading them
+        unconditionally would raise ``AttributeError`` for every mode,
+        including ``"none"``, on a model built without a pose head.
+        """
+        self.freeze = freeze
+        encoder_parts = [
+            self.patch_embed,
+            self.patch_embed_ray_map,
+            self.masked_img_token,
+            self.masked_ray_map_token,
+            self.enc_blocks,
+            self.enc_blocks_ray_map,
+            self.enc_norm,
+            self.enc_norm_ray_map,
+        ]
+        pose_parts = [
+            part
+            for part in (
+                getattr(self, "pose_retriever", None),
+                getattr(self, "pose_token", None),
+            )
+            if part is not None
+        ]
+        to_be_frozen = {
+            "none": [],
+            "mask": [self.mask_token] if hasattr(self, "mask_token") else [],
+            "encoder": encoder_parts,
+            "encoder_and_head": [*encoder_parts, self.downstream_head],
+            "encoder_and_decoder": [
+                *encoder_parts,
+                self.dec_blocks,
+                self.dec_blocks_state,
+                *pose_parts,
+                self.register_tokens,
+                self.decoder_embed_state,
+                self.decoder_embed,
+                self.dec_norm,
+                self.dec_norm_state,
+            ],
+            "decoder": [
+                self.dec_blocks,
+                self.dec_blocks_state,
+                *pose_parts,
+            ],
+        }
+        freeze_all_params(to_be_frozen[freeze])
+
+    def _set_prediction_head(self, *args, **kwargs):
+        """Deliberately do nothing: this model has no prediction head.
+
+        All arguments are accepted and ignored.
+        """
+        return None
+
+    def set_downstream_head(
+        self,
+        output_mode,
+        head_type,
+        landscape_only,
+        depth_mode,
+        conf_mode,
+        pose_mode,
+        depth_head,
+        rgb_head,
+        pose_conf_head,
+        pose_head,
+        patch_size,
+        img_size,
+        **kw,
+    ):
+        """Record the output configuration and build the downstream head.
+
+        ``img_size`` must be divisible by ``patch_size`` in both dimensions.
+        The head built by ``head_factory`` is stored as ``downstream_head``;
+        ``head`` is the same head wrapped by ``transpose_to_landscape``
+        (active when ``landscape_only`` is true). Extra keyword arguments are
+        accepted and ignored.
+        """
+        assert (
+            img_size[0] % patch_size == 0 and img_size[1] % patch_size == 0
+        ), f"{img_size=} must be multiple of {patch_size=}"
+        # Stored before the factory call, which receives `self`.
+        self.head_type = head_type
+        self.output_mode = output_mode
+        self.pose_mode = pose_mode
+        self.conf_mode = conf_mode
+        self.depth_mode = depth_mode
+        # Each optional output is switched on by the truthiness of its setting.
+        enabled_outputs = {
+            "has_conf": bool(conf_mode),
+            "has_depth": bool(depth_head),
+            "has_rgb": bool(rgb_head),
+            "has_pose_conf": bool(pose_conf_head),
+            "has_pose": bool(pose_head),
+        }
+        self.downstream_head = head_factory(
+            head_type, output_mode, self, **enabled_outputs
+        )
+        self.head = transpose_to_landscape(
+            self.downstream_head, activate=landscape_only
+        )
+
+    def _encode_image(self, image, true_shape):
+        """Patch-embed ``image`` and run it through the image encoder blocks.
+
+        Returns ``([tokens], pos, None)``: the normalised tokens wrapped in a
+        one-element list, the positions from the patch embedder, and ``None``.
+        """
+        tokens, pos = self.patch_embed(image, true_shape=true_shape)
+        assert self.enc_pos_embed is None
+        use_checkpoint = self.gradient_checkpointing and self.training
+        for blk in self.enc_blocks:
+            if use_checkpoint:
+                tokens = checkpoint(blk, tokens, pos, use_reentrant=False)
+            else:
+                tokens = blk(tokens, pos)
+        return [self.enc_norm(tokens)], pos, None
+
+    def _encode_ray_map(self, ray_map, true_shape):
+        """Patch-embed ``ray_map`` and run it through the ray-map encoder blocks.
+
+        Returns ``([tokens], pos, None)``, the same layout as ``_encode_image``.
+        """
+        tokens, pos = self.patch_embed_ray_map(ray_map, true_shape=true_shape)
+        assert self.enc_pos_embed is None
+        use_checkpoint = self.gradient_checkpointing and self.training
+        for blk in self.enc_blocks_ray_map:
+            if use_checkpoint:
+                tokens = checkpoint(blk, tokens, pos, use_reentrant=False)
+            else:
+                tokens = blk(tokens, pos)
+        return [self.enc_norm_ray_map(tokens)], pos, None
+
+    def _encode_state(self, image_tokens, image_pos):
+        """Build the initial state tokens and their positions for a batch.
+
+        Args:
+            image_tokens: only its leading (batch) dimension is read.
+            image_pos: only its ``device`` and ``dtype`` are read.
+
+        Returns:
+            ``(state_feat, state_pos, None)``. ``state_feat`` holds the
+            ``state_size`` register embeddings expanded over the batch.
+            ``state_pos`` depends on ``self.state_pe``:
+              * ``"1d"``: token ``i`` gets position ``(i, i)``;
+              * ``"2d"``: token ``i`` gets ``(i // width, i % width)``, with
+                ``width = int(sqrt(state_size))`` rounded up to an even number;
+              * ``"none"``: ``None``.
+
+        Raises:
+            ValueError: if ``self.state_pe`` is none of the values above
+                (previously an ``UnboundLocalError`` at the return).
+        """
+        batch_size = image_tokens.shape[0]
+        state_feat = self.register_tokens(
+            torch.arange(self.state_size, device=image_pos.device)
+        )
+        if self.state_pe == "1d":
+            coords = [[i, i] for i in range(self.state_size)]
+        elif self.state_pe == "2d":
+            width = int(self.state_size**0.5)
+            width = width + 1 if width % 2 == 1 else width
+            coords = [[i // width, i % width] for i in range(self.state_size)]
+        elif self.state_pe == "none":
+            coords = None
+        else:
+            raise ValueError(
+                f"unsupported state_pe {self.state_pe!r}; expected '1d', '2d' or 'none'"
+            )
+        if coords is None:
+            state_pos = None
+        else:
+            state_pos = (
+                torch.tensor(
+                    coords,
+                    dtype=image_pos.dtype,
+                    device=image_pos.device,
+                )[None]
+                .expand(batch_size, -1, -1)
+                .contiguous()
+            )
+        state_feat = state_feat[None].expand(batch_size, -1, -1)
+        return state_feat, state_pos, None
+
+    def _encode_views(self, views, img_mask=None, ray_mask=None):
+        """Encode the images and ray maps of all views in one batched pass.
+
+        Masks are read from the views (``"img_mask"`` / ``"ray_mask"``) only
+        when neither ``img_mask`` nor ``ray_mask`` is passed. Each sample's
+        feature is the sum of an image term and a ray-map term, where a masked
+        input is replaced by the corresponding learnable masked token.
+
+        Returns a 3-tuple, each element split into ``len(views)`` chunks:
+        true shapes, a list of feature chunk tuples (one per encoder output),
+        and positions.
+
+        Raises ``NotImplementedError`` when no sample has its image selected.
+        """
+        device = views[0]["img"].device
+        batch_size = views[0]["img"].shape[0]
+        # `given` is False only when both masks are missing.
+        # NOTE(review): passing only one mask leaves the other as None and
+        # fails at `.view(-1)` below — confirm callers always pass both.
+        given = True
+        if img_mask is None and ray_mask is None:
+            given = False
+        if not given:
+            img_mask = torch.stack(
+                [view["img_mask"] for view in views], dim=0
+            )  # Shape: (num_views, batch_size)
+            ray_mask = torch.stack(
+                [view["ray_mask"] for view in views], dim=0
+            )  # Shape: (num_views, batch_size)
+        imgs = torch.stack(
+            [view["img"] for view in views], dim=0
+        )  # Shape: (num_views, batch_size, C, H, W)
+        ray_maps = torch.stack(
+            [view["ray_map"] for view in views], dim=0
+        )  # Shape: (num_views, batch_size, H, W, C)
+        # A view without "true_shape" falls back to its image height/width.
+        shapes = []
+        for view in views:
+            if "true_shape" in view:
+                shapes.append(view["true_shape"])
+            else:
+                shape = torch.tensor(view["img"].shape[-2:], device=device)
+                shapes.append(shape.unsqueeze(0).repeat(batch_size, 1))
+        shapes = torch.stack(shapes, dim=0).to(
+            imgs.device
+        )  # Shape: (num_views, batch_size, 2)
+        # Fold views into the batch dimension so one encoder call covers them all.
+        imgs = imgs.view(
+            -1, *imgs.shape[2:]
+        )  # Shape: (num_views * batch_size, C, H, W)
+        ray_maps = ray_maps.view(
+            -1, *ray_maps.shape[2:]
+        )  # Shape: (num_views * batch_size, H, W, C)
+        shapes = shapes.view(-1, 2)  # Shape: (num_views * batch_size, 2)
+        img_masks_flat = img_mask.view(-1)  # Shape: (num_views * batch_size)
+        ray_masks_flat = ray_mask.view(-1)
+        # Only samples whose image is selected go through the image encoder.
+        selected_imgs = imgs[img_masks_flat]
+        selected_shapes = shapes[img_masks_flat]
+        if selected_imgs.size(0) > 0:
+            img_out, img_pos, _ = self._encode_image(selected_imgs, selected_shapes)
+        else:
+            raise NotImplementedError
+        # Zero buffers covering every sample; encoder outputs are scattered in.
+        full_out = [
+            torch.zeros(
+                len(views) * batch_size, *img_out[0].shape[1:], device=img_out[0].device
+            )
+            for _ in range(len(img_out))
+        ]
+        full_pos = torch.zeros(
+            len(views) * batch_size,
+            *img_pos.shape[1:],
+            device=img_pos.device,
+            dtype=img_pos.dtype,
+        )
+        # Image term: encoded features where selected, the masked-image token elsewhere.
+        for i in range(len(img_out)):
+            full_out[i][img_masks_flat] += img_out[i]
+            full_out[i][~img_masks_flat] += self.masked_img_token
+        full_pos[img_masks_flat] += img_pos
+        ray_maps = ray_maps.permute(0, 3, 1, 2)  # Change shape to (N, C, H, W)
+        selected_ray_maps = ray_maps[ray_masks_flat]
+        selected_shapes_ray = shapes[ray_masks_flat]
+        if selected_ray_maps.size(0) > 0:
+            ray_out, ray_pos, _ = self._encode_ray_map(
+                selected_ray_maps, selected_shapes_ray
+            )
+            assert len(ray_out) == len(full_out), f"{len(ray_out)}, {len(full_out)}"
+            # Ray term is added on top of the image term.
+            for i in range(len(ray_out)):
+                full_out[i][ray_masks_flat] += ray_out[i]
+                full_out[i][~ray_masks_flat] += self.masked_ray_map_token
+            # Ray positions are used only for samples without an image, so
+            # positions already written by the image branch are not doubled.
+            full_pos[ray_masks_flat] += (
+                ray_pos * (~img_masks_flat[ray_masks_flat][:, None, None]).long()
+            )
+        else:
+            # No ray map selected anywhere: encode one all-zero ray map and add
+            # its output and the masked-ray token scaled by 0.0, so feature
+            # values are unchanged. Presumably this keeps the ray-map
+            # parameters in the autograd graph (e.g. for distributed
+            # training) — TODO confirm.
+            raymaps = torch.zeros(
+                1, 6, imgs[0].shape[-2], imgs[0].shape[-1], device=img_out[0].device
+            )
+            ray_mask_flat = torch.zeros_like(img_masks_flat)
+            ray_mask_flat[:1] = True
+            ray_out, ray_pos, _ = self._encode_ray_map(raymaps, shapes[ray_mask_flat])
+            for i in range(len(ray_out)):
+                full_out[i][ray_mask_flat] += ray_out[i] * 0.0
+                full_out[i][~ray_mask_flat] += self.masked_ray_map_token * 0.0
+        # Split the flattened batch back into one chunk per view.
+        return (
+            shapes.chunk(len(views), dim=0),
+            [out.chunk(len(views), dim=0) for out in full_out],
+            full_pos.chunk(len(views), dim=0),
+        )
+
+    def _decoder(self, f_state, pos_state, f_img, pos_img, f_pose, pos_pose):
+        """Run the state and image decoder stacks side by side.
+
+        At every layer the state block and the image block each receive the
+        previous layer's state and image features, in opposite argument order.
+        When the pose head is enabled, the pose token and its position are
+        prepended to the projected image tokens.
+
+        Returns ``zip(*history)``: the first item is the tuple of state
+        features and the second the tuple of image features. Entry 0 of each
+        is the input before the image projection; the last entry is normalised.
+        """
+        history = [(f_state, f_img)]  # inputs before the image projection
+        assert f_state.shape[-1] == self.dec_embed_dim
+        f_img = self.decoder_embed(f_img)
+        if self.pose_head_flag:
+            assert f_pose is not None and pos_pose is not None
+            f_img = torch.cat([f_pose, f_img], dim=1)
+            pos_img = torch.cat([pos_pose, pos_img], dim=1)
+        history.append((f_state, f_img))
+        for blk_state, blk_img in zip(self.dec_blocks_state, self.dec_blocks):
+            prev_state, prev_img = history[-1]
+            use_checkpoint = (
+                self.gradient_checkpointing
+                and self.training
+                and torch.is_grad_enabled()
+            )
+            if not use_checkpoint:
+                f_state, _ = blk_state(prev_state, prev_img, pos_state, pos_img)
+                f_img, _ = blk_img(prev_img, prev_state, pos_img, pos_state)
+            else:
+                f_state, _ = checkpoint(
+                    blk_state,
+                    prev_state,
+                    prev_img,
+                    pos_state,
+                    pos_img,
+                    use_reentrant=not self.fixed_input_length,
+                )
+                f_img, _ = checkpoint(
+                    blk_img,
+                    prev_img,
+                    prev_state,
+                    pos_img,
+                    pos_state,
+                    use_reentrant=not self.fixed_input_length,
+                )
+            history.append((f_state, f_img))
+        # Entry 1 (projected inputs) only seeded the first layer; drop it so
+        # the history is [raw inputs, layer 1, ..., layer N].
+        del history[1]
+        last_state, last_img = history[-1]
+        history[-1] = (self.dec_norm_state(last_state), self.dec_norm(last_img))
+        return zip(*history)
+
+    def _downstream_head(self, decout, img_shape, **kwargs):
+        """Apply ``self.head`` to the decoder outputs.
+
+        Args:
+            decout: sequence of decoder features; the last entry must be 3-D.
+            img_shape: image shape(s) forwarded to the head.
+            **kwargs: forwarded to the head unchanged.
+
+        Raises:
+            ValueError: if ``decout[-1]`` is not 3-D. The original enforced
+                this implicitly by unpacking the shape into three unused names.
+        """
+        if len(decout[-1].shape) != 3:
+            raise ValueError(
+                f"expected a 3-D decoder output, got shape {tuple(decout[-1].shape)}"
+            )
+        return self.head(decout, img_shape, **kwargs)
+
+    def _init_state(self, image_tokens, image_pos):
+        """Create the initial state, projected to decoder width.
+
+        ``image_tokens`` and ``image_pos`` are only passed on to
+        ``_encode_state``. Returns ``(state_feat, state_pos)``.
+        """
+        raw_state, state_pos, _ = self._encode_state(image_tokens, image_pos)
+        return self.decoder_embed_state(raw_state), state_pos
+
+    def _recurrent_rollout(
+        self,
+        state_feat,
+        state_pos,
+        current_feat,
+        current_pos,
+        pose_feat,
+        pose_pos,
+        init_state_feat,
+        img_mask=None,
+        reset_mask=None,
+        update=None,
+    ):
+        """Run one decoder pass of the current view against the state.
+
+        Returns ``(new_state_feat, dec)``: the last-layer state features and
+        the per-layer image-branch features from ``_decoder``.
+        ``init_state_feat``, ``img_mask``, ``reset_mask`` and ``update`` are
+        accepted but not used here.
+        """
+        state_per_layer, dec = self._decoder(
+            state_feat, state_pos, current_feat, current_pos, pose_feat, pose_pos
+        )
+        return state_per_layer[-1], dec
+
+    def _get_img_level_feat(self, feat):
+        """Average ``feat`` over dimension 1, keeping that dimension with size 1."""
+        return feat.mean(dim=1, keepdim=True)
+
+    def _forward_encoder(self, views):
+        """Encode all views and create the initial recurrent state.
+
+        Returns ``((feat, pos, shape), (init_state_feat, init_mem, state_feat,
+        state_pos, mem))``. The ``init_*`` entries are clones of the initial
+        state and memory.
+        """
+        shape, feat_ls, pos = self._encode_views(views)
+        feat = feat_ls[-1]
+        first_feat = feat[0]
+        state_feat, state_pos = self._init_state(first_feat, pos[0])
+        # Expand the learned memory over the batch.
+        mem = self.pose_retriever.mem.expand(first_feat.shape[0], -1, -1)
+        encoded = (feat, pos, shape)
+        state = (state_feat.clone(), mem.clone(), state_feat, state_pos, mem)
+        return encoded, state
+
+    def _forward_decoder_step(
+        self,
+        views,
+        i,
+        feat_i,
+        pos_i,
+        shape_i,
+        init_state_feat,
+        init_mem,
+        state_feat,
+        state_pos,
+        mem,
+    ):
+        """Decode view ``i`` and return its prediction plus the updated state.
+
+        Returns ``(res, (state_feat, mem))``. State and memory take their new
+        values only for samples where ``img_mask`` (and ``update``, when
+        present) is true. Samples flagged by ``reset`` are then set back to
+        ``init_state_feat`` / ``init_mem``.
+        """
+        view = views[i]
+        num_samples = feat_i.shape[0]
+        pose_feat_i = None
+        pose_pos_i = None
+        if self.pose_head_flag:
+            global_img_feat_i = self._get_img_level_feat(feat_i)
+            if i == 0:
+                # The first view has no memory to read yet: use the learned pose token.
+                pose_feat_i = self.pose_token.expand(num_samples, -1, -1)
+            else:
+                pose_feat_i = self.pose_retriever.inquire(global_img_feat_i, mem)
+            pose_pos_i = -torch.ones(
+                num_samples, 1, 2, device=feat_i.device, dtype=pos_i.dtype
+            )
+        new_state_feat, dec = self._recurrent_rollout(
+            state_feat,
+            state_pos,
+            feat_i,
+            pos_i,
+            pose_feat_i,
+            pose_pos_i,
+            init_state_feat,
+            img_mask=view["img_mask"],
+            reset_mask=view["reset"],
+            update=view.get("update", None),
+        )
+        # Token 0 of the last layer is the pose token prepended in _decoder.
+        out_pose_feat_i = dec[-1][:, 0:1]
+        new_mem = self.pose_retriever.update_mem(
+            mem, global_img_feat_i, out_pose_feat_i
+        )
+        # Head inputs: raw input plus three decoder layers; the two middle
+        # layers drop token 0.
+        mid_layer = self.dec_depth * 2 // 4
+        late_layer = self.dec_depth * 3 // 4
+        head_input = [
+            dec[0].float(),
+            dec[mid_layer][:, 1:].float(),
+            dec[late_layer][:, 1:].float(),
+            dec[self.dec_depth].float(),
+        ]
+        res = self._downstream_head(head_input, shape_i, pos=pos_i)
+
+        # Without an "update" entry the image mask alone decides what is updated.
+        update = view.get("update", None)
+        update_mask = view["img_mask"]
+        if update is not None:
+            update_mask = update_mask & update
+        update_mask = update_mask[:, None, None].float()
+        keep_mask = 1 - update_mask
+        state_feat = new_state_feat * update_mask + state_feat * keep_mask
+        mem = new_mem * update_mask + mem * keep_mask
+
+        reset_mask = view["reset"]
+        if reset_mask is not None:
+            reset_mask = reset_mask[:, None, None].float()
+            keep_mask = 1 - reset_mask
+            state_feat = init_state_feat * reset_mask + state_feat * keep_mask
+            mem = init_mem * reset_mask + mem * keep_mask
+        return res, (state_feat, mem)
+
+    def _forward_impl(self, views, ret_state=False):
+        """Encode all views, then decode them one by one while carrying state.
+
+        Returns ``(ress, views)``, or ``(ress, views, all_state_args)`` when
+        ``ret_state`` is true. ``ress`` holds one head output per view.
+        ``all_state_args`` holds the tuple ``(state_feat, state_pos,
+        init_state_feat, mem, init_mem)`` before the first view and after
+        every view.
+        """
+        shape, feat_ls, pos = self._encode_views(views)
+        feat = feat_ls[-1]
+        # Initial state and memory; the clones are the reference values that
+        # samples are restored to when a view's "reset" flag is set.
+        state_feat, state_pos = self._init_state(feat[0], pos[0])
+        mem = self.pose_retriever.mem.expand(feat[0].shape[0], -1, -1)
+        init_state_feat = state_feat.clone()
+        init_mem = mem.clone()
+        all_state_args = [(state_feat, state_pos, init_state_feat, mem, init_mem)]
+        ress = []
+        for i in range(len(views)):
+            feat_i = feat[i]
+            pos_i = pos[i]
+            if self.pose_head_flag:
+                global_img_feat_i = self._get_img_level_feat(feat_i)
+                # The first view uses the learned pose token; later views
+                # read their pose feature from the memory.
+                if i == 0:
+                    pose_feat_i = self.pose_token.expand(feat_i.shape[0], -1, -1)
+                else:
+                    pose_feat_i = self.pose_retriever.inquire(global_img_feat_i, mem)
+                # The pose token's position is filled with -1.
+                pose_pos_i = -torch.ones(
+                    feat_i.shape[0], 1, 2, device=feat_i.device, dtype=pos_i.dtype
+                )
+            else:
+                pose_feat_i = None
+                pose_pos_i = None
+            new_state_feat, dec = self._recurrent_rollout(
+                state_feat,
+                state_pos,
+                feat_i,
+                pos_i,
+                pose_feat_i,
+                pose_pos_i,
+                init_state_feat,
+                img_mask=views[i]["img_mask"],
+                reset_mask=views[i]["reset"],
+                update=views[i].get("update", None),
+            )
+            # Token 0 of the last layer is the pose token prepended in
+            # _decoder; it is written into the memory, keyed by the pooled
+            # image feature.
+            # NOTE(review): global_img_feat_i and pose_retriever exist only
+            # when pose_head_flag is set, so this path requires the pose head.
+            out_pose_feat_i = dec[-1][:, 0:1]
+            new_mem = self.pose_retriever.update_mem(
+                mem, global_img_feat_i, out_pose_feat_i
+            )
+            assert len(dec) == self.dec_depth + 1
+            # Head inputs: raw input plus three decoder layers; the two
+            # middle layers drop token 0.
+            head_input = [
+                dec[0].float(),
+                dec[self.dec_depth * 2 // 4][:, 1:].float(),
+                dec[self.dec_depth * 3 // 4][:, 1:].float(),
+                dec[self.dec_depth].float(),
+            ]
+            res = self._downstream_head(head_input, shape[i], pos=pos_i)
+            ress.append(res)
+            img_mask = views[i]["img_mask"]
+            update = views[i].get("update", None)
+            if update is not None:
+                update_mask = (
+                    img_mask & update
+                )  # if don't update, then whatever img_mask
+            else:
+                update_mask = img_mask
+            # Per-sample blend: new values where the mask is 1, old values elsewhere.
+            update_mask = update_mask[:, None, None].float()
+            state_feat = new_state_feat * update_mask + state_feat * (
+                1 - update_mask
+            )  # update global state
+            mem = new_mem * update_mask + mem * (
+                1 - update_mask
+            )  # then update local state
+            # Samples flagged for reset return to the initial state and memory.
+            reset_mask = views[i]["reset"]
+            if reset_mask is not None:
+                reset_mask = reset_mask[:, None, None].float()
+                state_feat = init_state_feat * reset_mask + state_feat * (
+                    1 - reset_mask
+                )
+                mem = init_mem * reset_mask + mem * (1 - reset_mask)
+            all_state_args.append(
+                (state_feat, state_pos, init_state_feat, mem, init_mem)
+            )
+        if ret_state:
+            return ress, views, all_state_args
+        return ress, views
+
+    def forward(self, views, ret_state=False):
+        """Run the model on ``views`` and wrap the result.
+
+        Returns an ``ARCroco3DStereoOutput``; when ``ret_state`` is true,
+        returns ``(output, state_args)`` instead.
+        """
+        outputs = self._forward_impl(views, ret_state=ret_state)
+        if not ret_state:
+            ress, views = outputs
+            return ARCroco3DStereoOutput(ress=ress, views=views)
+        ress, views, state_args = outputs
+        return ARCroco3DStereoOutput(ress=ress, views=views), state_args
+
+    def inference_step(
+        self, view, state_feat, state_pos, init_state_feat, mem, init_mem
+    ):
+        """Predict for one ray-map-only view against an existing state.
+
+        Args:
+            view: dict with at least ``"img"`` (only its batch size is read),
+                ``"ray_map"`` (channel-last), ``"ray_mask"``, ``"img_mask"``
+                and ``"reset"``; ``"true_shape"`` and ``"update"`` are optional.
+            state_feat, state_pos: current state tokens and their positions.
+            init_state_feat: forwarded to ``_recurrent_rollout``.
+            mem: current pose memory, read to obtain the pose feature.
+            init_mem: accepted for signature symmetry; not used.
+
+        Returns:
+            ``(res, view)``. The state and memory are NOT updated or returned.
+
+        Fixes over the previous version:
+          * ``_encode_ray_map`` now receives the concatenated ``shape`` tensor
+            rather than the Python list ``shapes`` it was built from;
+          * the fallback true shape is taken from dims 1 and 2 of the ray map
+            (H, W) instead of its last two dims, which are (W, C).
+        """
+        batch_size = view["img"].shape[0]
+        ray_map = view["ray_map"]
+        if "true_shape" in view:
+            true_shape = view["true_shape"]
+        else:
+            # The ray map is permuted with (0, 3, 1, 2) below, i.e. it is
+            # stored channel-last, so height and width are dims 1 and 2.
+            true_shape = torch.tensor(ray_map.shape[1:3])[None].repeat(
+                ray_map.shape[0], 1
+            )
+        raymaps = []
+        shapes = []
+        for j in range(batch_size):
+            # Every sample must provide a ray map.
+            assert view["ray_mask"][j]
+            raymaps.append(ray_map[[j]].permute(0, 3, 1, 2))
+            shapes.append(true_shape[[j]])
+
+        raymaps = torch.cat(raymaps, dim=0)
+        shape = torch.cat(shapes, dim=0).to(raymaps.device)
+        feat_ls, pos, _ = self._encode_ray_map(raymaps, shape)
+
+        feat_i = feat_ls[-1]
+        pos_i = pos
+        if self.pose_head_flag:
+            global_img_feat_i = self._get_img_level_feat(feat_i)
+            pose_feat_i = self.pose_retriever.inquire(global_img_feat_i, mem)
+            pose_pos_i = -torch.ones(
+                feat_i.shape[0], 1, 2, device=feat_i.device, dtype=pos_i.dtype
+            )
+        else:
+            pose_feat_i = None
+            pose_pos_i = None
+        new_state_feat, dec = self._recurrent_rollout(
+            state_feat,
+            state_pos,
+            feat_i,
+            pos_i,
+            pose_feat_i,
+            pose_pos_i,
+            init_state_feat,
+            img_mask=view["img_mask"],
+            reset_mask=view["reset"],
+            update=view.get("update", None),
+        )
+
+        out_pose_feat_i = dec[-1][:, 0:1]
+        # NOTE(review): new_mem (like new_state_feat) is computed but never
+        # returned; kept to preserve the original call sequence — confirm
+        # whether it can be dropped.
+        new_mem = self.pose_retriever.update_mem(
+            mem, global_img_feat_i, out_pose_feat_i
+        )
+        assert len(dec) == self.dec_depth + 1
+        # Head inputs: raw input plus three decoder layers; the two middle
+        # layers drop token 0 (the pose token).
+        head_input = [
+            dec[0].float(),
+            dec[self.dec_depth * 2 // 4][:, 1:].float(),
+            dec[self.dec_depth * 3 // 4][:, 1:].float(),
+            dec[self.dec_depth].float(),
+        ]
+        res = self._downstream_head(head_input, shape, pos=pos_i)
+        return res, view
+
+    def forward_recurrent(self, views, device, ret_state=False):
+        ress = []
+        all_state_args = []
+        for i, view in enumerate(views):
+            device = view["img"].device
+            batch_size = view["img"].shape[0]
+            img_mask = view["img_mask"].reshape(
+                -1, batch_size
+            )  # Shape: (1, batch_size)
+            ray_mask = view["ray_mask"].reshape(
+                -1, batch_size
+            )  # Shape: (1, batch_size)
+            imgs = view["img"].unsqueeze(0)  # Shape: (1, batch_size, C, H, W)
+            ray_maps = view["ray_map"].unsqueeze(
+                0
+            )  # Shape: (num_views, batch_size, H, W, C)
+            shapes = (
+                view["true_shape"].unsqueeze(0)
+                if "true_shape" in view
+                else torch.tensor(view["img"].shape[-2:], device=device)
+                .unsqueeze(0)
+                .repeat(batch_size, 1)
+                .unsqueeze(0)
+            )  # Shape: (num_views, batch_size, 2)
+            imgs = imgs.view(
+                -1, *imgs.shape[2:]
+            )  # Shape: (num_views * batch_size, C, H, W)
+            ray_maps = ray_maps.view(
+                -1, *ray_maps.shape[2:]
+            )  # Shape: (num_views * batch_size, H, W, C)
+            shapes = shapes.view(-1, 2).to(
+                imgs.device
+            )  # Shape: (num_views * batch_size, 2)
+            img_masks_flat = img_mask.view(-1)  # Shape: (num_views * batch_size)
+            ray_masks_flat = ray_mask.view(-1)
+            selected_imgs = imgs[img_masks_flat]
+            selected_shapes = shapes[img_masks_flat]
+            if selected_imgs.size(0) > 0:
+                img_out, img_pos, _ = self._encode_image(selected_imgs, selected_shapes)
+            else:
+                img_out, img_pos = None, None
+            ray_maps = ray_maps.permute(0, 3, 1, 2)  # Change shape to (N, C, H, W)
+            selected_ray_maps = ray_maps[ray_masks_flat]
+            selected_shapes_ray = shapes[ray_masks_flat]
+            if selected_ray_maps.size(0) > 0:
+                ray_out, ray_pos, _ = self._encode_ray_map(
+                    selected_ray_maps, selected_shapes_ray
+                )
+            else:
+                ray_out, ray_pos = None, None
+
+            shape = shapes
+            if img_out is not None and ray_out is None:
+                feat_i = img_out[-1]
+                pos_i = img_pos
+            elif img_out is None and ray_out is not None:
+                feat_i = ray_out[-1]
+                pos_i = ray_pos
+            elif img_out is not None and ray_out is not None:
+                feat_i = img_out[-1] + ray_out[-1]
+                pos_i = img_pos
+            else:
+                raise NotImplementedError
+
+            if i == 0:
+                state_feat, state_pos = self._init_state(feat_i, pos_i)
+                mem = self.pose_retriever.mem.expand(feat_i.shape[0], -1, -1)
+                init_state_feat = state_feat.clone()
+                init_mem = mem.clone()
+                all_state_args.append(
+                    (state_feat, state_pos, init_state_feat, mem, init_mem)
+                )
+
+            if self.pose_head_flag:
+                global_img_feat_i = self._get_img_level_feat(feat_i)
+                if i == 0:
+                    pose_feat_i = self.pose_token.expand(feat_i.shape[0], -1, -1)
+                else:
+                    pose_feat_i = self.pose_retriever.inquire(global_img_feat_i, mem)
+                pose_pos_i = -torch.ones(
+                    feat_i.shape[0], 1, 2, device=feat_i.device, dtype=pos_i.dtype
+                )
+            else:
+                pose_feat_i = None
+                pose_pos_i = None
+            new_state_feat, dec = self._recurrent_rollout(
+                state_feat,
+                state_pos,
+                feat_i,
+                pos_i,
+                pose_feat_i,
+                pose_pos_i,
+                init_state_feat,
+                img_mask=view["img_mask"],
+                reset_mask=view["reset"],
+                update=view.get("update", None),
+            )
+            out_pose_feat_i = dec[-1][:, 0:1]
+            new_mem = self.pose_retriever.update_mem(
+                mem, global_img_feat_i, out_pose_feat_i
+            )
+            assert len(dec) == self.dec_depth + 1
+            head_input = [
+                dec[0].float(),
+                dec[self.dec_depth * 2 // 4][:, 1:].float(),
+                dec[self.dec_depth * 3 // 4][:, 1:].float(),
+                dec[self.dec_depth].float(),
+            ]
+            res = self._downstream_head(head_input, shape, pos=pos_i)
+            ress.append(res)
+            img_mask = view["img_mask"]
+            update = view.get("update", None)
+            if update is not None:
+                update_mask = (
+                    img_mask & update
+                )  # if don't update, then whatever img_mask
+            else:
+                update_mask = img_mask
+            update_mask = update_mask[:, None, None].float()
+            state_feat = new_state_feat * update_mask + state_feat * (
+                1 - update_mask
+            )  # update global state
+            mem = new_mem * update_mask + mem * (
+                1 - update_mask
+            )  # then update local state
+            reset_mask = view["reset"]
+            if reset_mask is not None:
+                reset_mask = reset_mask[:, None, None].float()
+                state_feat = init_state_feat * reset_mask + state_feat * (
+                    1 - reset_mask
+                )
+                mem = init_mem * reset_mask + mem * (1 - reset_mask)
+            all_state_args.append(
+                (state_feat, state_pos, init_state_feat, mem, init_mem)
+            )
+        if ret_state:
+            return ress, views, all_state_args
+        return ress, views
+
+
+if __name__ == "__main__":  # manual check when this module is executed as a script
+    print(ARCroco3DStereo.mro())  # show the class resolution order
+    cfg = ARCroco3DStereoConfig(
+        state_size=256,
+        pos_embed="RoPE100",
+        rgb_head=True,
+        pose_head=True,
+        img_size=(224, 224),
+        head_type="linear",
+        output_mode="pts3d+pose",
+        depth_mode=("exp", -inf, inf),
+        conf_mode=("exp", 1, inf),
+        pose_mode=("exp", -inf, inf),
+        enc_embed_dim=1024,
+        enc_depth=24,
+        enc_num_heads=16,
+        dec_embed_dim=768,
+        dec_depth=12,
+        dec_num_heads=12,
+    )
+    ARCroco3DStereo(cfg)  # build the model once from that config; the instance is discarded
diff --git a/extern/CUT3R/src/dust3r/patch_embed.py b/extern/CUT3R/src/dust3r/patch_embed.py
new file mode 100755
index 0000000000000000000000000000000000000000..6cc177f0b05940b5e9ee01b9053fbf24be6d1905
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/patch_embed.py
@@ -0,0 +1,93 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# modified from DUSt3R
+
+import torch
+import dust3r.utils.path_to_croco  # noqa: F401
+from models.blocks import PatchEmbed  # noqa
+
+
+def get_patch_embed(patch_embed_cls, img_size, patch_size, enc_embed_dim, in_chans=3):  # factory: patch_embed_cls is the NAME of a class of this module
+    assert patch_embed_cls in ["PatchEmbedDust3R", "ManyAR_PatchEmbed"]
+    patch_embed = globals()[patch_embed_cls](img_size, patch_size, in_chans, enc_embed_dim)  # name lookup instead of eval(): never executes the string, even if the assert is stripped by -O
+    return patch_embed
+
+
+class PatchEmbedDust3R(PatchEmbed):
+    def forward(self, x, **kw):
+        """Embed a (B, C, H, W) image batch; returns (normalized tokens, patch positions)."""
+        batch, _, height, width = x.shape
+        ph, pw = self.patch_size[0], self.patch_size[1]
+        # both spatial sizes must tile exactly into patches
+        assert height % ph == 0, f"Input image height ({height}) is not a multiple of patch size ({ph})."
+        assert width % pw == 0, f"Input image width ({width}) is not a multiple of patch size ({pw})."
+        tokens = self.proj(x)
+        # positions are generated for the grid of the projected feature map
+        pos = self.position_getter(batch, tokens.size(2), tokens.size(3), tokens.device)
+        if self.flatten:
+            # BCHW -> BNC
+            tokens = tokens.flatten(2).transpose(1, 2)
+        return self.norm(tokens), pos
+
+
+class ManyAR_PatchEmbed(PatchEmbed):
+    """Handle images with non-square aspect ratio.
+    All images in the same batch have the same aspect ratio.
+    true_shape = [(height, width) ...] indicates the actual shape of each image.
+    """
+
+    def __init__(
+        self,
+        img_size=224,
+        patch_size=16,
+        in_chans=3,
+        embed_dim=768,
+        norm_layer=None,
+        flatten=True,
+    ):
+        self.embed_dim = embed_dim
+        super().__init__(img_size, patch_size, in_chans, embed_dim, norm_layer, flatten)
+
+    def forward(self, img, true_shape):
+        B, C, H, W = img.shape
+        # img: (B, C, H, W) batch; true_shape: (B, 2) tensor of per-image (height, width); returns (tokens, positions)
+        assert (
+            H % self.patch_size[0] == 0
+        ), f"Input image height ({H}) is not a multiple of patch size ({self.patch_size[0]})."
+        assert (
+            W % self.patch_size[1] == 0
+        ), f"Input image width ({W}) is not a multiple of patch size ({self.patch_size[1]})."
+        assert true_shape.shape == (
+            B,
+            2,
+        ), f"true_shape has the wrong shape={true_shape.shape}"
+        # size expressed in tokens; same axis pairing as the asserts above (index 0 <-> H, index 1 <-> W)
+        H //= self.patch_size[0]
+        W //= self.patch_size[1]
+        n_tokens = H * W
+
+        height, width = true_shape.T
+        # every sample is flagged landscape here, so the portrait branches below select nothing
+        is_landscape = torch.ones_like(width, dtype=torch.bool)
+        is_portrait = ~is_landscape
+
+        x = img.new_zeros((B, n_tokens, self.embed_dim))
+        pos = img.new_zeros((B, n_tokens, 2), dtype=torch.int64)
+
+        x[is_landscape] = (
+            self.proj(img[is_landscape]).permute(0, 2, 3, 1).flatten(1, 2).float()
+        )
+        x[is_portrait] = (
+            self.proj(img[is_portrait].swapaxes(-1, -2))
+            .permute(0, 2, 3, 1)
+            .flatten(1, 2)
+            .float()
+        )
+
+        pos[is_landscape] = self.position_getter(1, H, W, pos.device)
+        pos[is_portrait] = self.position_getter(1, W, H, pos.device)
+
+        x = self.norm(x)
+        return x, pos
diff --git a/extern/CUT3R/src/dust3r/post_process.py b/extern/CUT3R/src/dust3r/post_process.py
new file mode 100755
index 0000000000000000000000000000000000000000..04a6597b33f2074f32b05477437dde2b940b3532
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/post_process.py
@@ -0,0 +1,64 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# modified from DUSt3R
+
+import numpy as np
+import torch
+from dust3r.utils.geometry import xy_grid
+
+
+def estimate_focal_knowing_depth(
+    pts3d, pp, focal_mode="median", min_focal=0.0, max_focal=np.inf
+):
+    """Estimate one focal length per batch element from a point map pts3d (B,H,W,3) and principal points pp.
+    focal_mode: "median" (median of per-pixel votes) or "weiszfeld" (re-weighted least squares); else ValueError.
+    The result is clipped to [min_focal, max_focal] * focal_base (focal of a 60 deg field of view over max(H, W)).
+    """
+    B, H, W, THREE = pts3d.shape
+    assert THREE == 3
+    # pixel coordinates relative to the principal point
+    pixels = xy_grid(W, H, device=pts3d.device).view(1, -1, 2) - pp.view(
+        -1, 1, 2
+    )  # B,HW,2
+    pts3d = pts3d.flatten(1, 2)  # (B, HW, 3)
+
+    if focal_mode == "median":
+        with torch.no_grad():
+            # every pixel votes twice: f = u * z / x and f = v * z / y
+            u, v = pixels.unbind(dim=-1)
+            x, y, z = pts3d.unbind(dim=-1)
+            fx_votes = (u * z) / x
+            fy_votes = (v * z) / y
+            # pool both axes; nanmedian skips the NaN votes
+            f_votes = torch.cat((fx_votes.view(B, -1), fy_votes.view(B, -1)), dim=-1)
+            focal = torch.nanmedian(f_votes, dim=-1).values
+
+    elif focal_mode == "weiszfeld":
+        # start from the closed-form l2 solution of  pixel ~= focal * (x, y) / z
+        xy_over_z = (pts3d[..., :2] / pts3d[..., 2:3]).nan_to_num(
+            posinf=0, neginf=0
+        )  # homogeneous (x,y,1)
+
+        dot_xy_px = (xy_over_z * pixels).sum(dim=-1)
+        dot_xy_xy = xy_over_z.square().sum(dim=-1)
+
+        focal = dot_xy_px.mean(dim=1) / dot_xy_xy.mean(dim=1)
+        # then refine with a fixed number of re-weighted steps (index unused, builtin iter no longer shadowed)
+        for _ in range(10):
+            # residual of every pixel under the current focal
+            dis = (pixels - focal.view(-1, 1, 1) * xy_over_z).norm(dim=-1)
+            # weight = inverse residual, clipped so the reciprocal stays finite
+            w = dis.clip(min=1e-8).reciprocal()
+            # weighted closed-form update
+            focal = (w * dot_xy_px).mean(dim=1) / (w * dot_xy_xy).mean(dim=1)
+    else:
+        raise ValueError(f"bad {focal_mode=}")
+
+    focal_base = max(H, W) / (
+        2 * np.tan(np.deg2rad(60) / 2)
+    )  # size / 1.1547005383792515
+    focal = focal.clip(min=min_focal * focal_base, max=max_focal * focal_base)
+
+    return focal
diff --git a/extern/CUT3R/src/dust3r/utils/__init__.py b/extern/CUT3R/src/dust3r/utils/__init__.py
new file mode 100755
index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/utils/__init__.py
@@ -0,0 +1 @@
+
diff --git a/extern/CUT3R/src/dust3r/utils/camera.py b/extern/CUT3R/src/dust3r/utils/camera.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f85d99a82514fae14cab8929dd1098420bf2057
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/utils/camera.py
@@ -0,0 +1,462 @@
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from croco.models.blocks import Mlp
+from dust3r.heads.postprocess import postprocess_pose
+
+inf = float("inf")
+
+
+class PoseDecoder(nn.Module):
+    """MLP head that maps a pose feature vector to a pose encoding."""
+
+    def __init__(
+        self,
+        hidden_size=768,
+        mlp_ratio=4,
+        pose_encoding_type="absT_quaR",
+    ):
+        super().__init__()
+
+        self.pose_encoding_type = pose_encoding_type
+        if self.pose_encoding_type == "absT_quaR":
+            self.target_dim = 7  # 3 for absT, 4 for quaR
+        else:  # fail here with a clear message instead of an AttributeError on target_dim below
+            raise ValueError(f"Unknown pose encoding {pose_encoding_type}")
+
+        self.mlp = Mlp(
+            in_features=hidden_size,
+            hidden_features=int(hidden_size * mlp_ratio),
+            out_features=self.target_dim,
+            drop=0,
+        )
+
+    def forward(self, pose_feat):
+        """
+        pose_feat: BxC
+        Returns the raw MLP output (Bx7 for "absT_quaR").
+        """
+        pred_cameras = self.mlp(pose_feat)  # Bx7, 3 for absT, 4 for quaR
+        return pred_cameras
+
+
+class PoseEncoder(nn.Module):  # camera matrix -> pose encoding -> harmonic embedding -> MLP feature
+    def __init__(
+        self,
+        hidden_size=768,
+        mlp_ratio=4,
+        pose_mode=("exp", -inf, inf),
+        pose_encoding_type="absT_quaR",
+    ):
+        super().__init__()
+        self.pose_encoding_type = pose_encoding_type
+        self.pose_mode = pose_mode
+
+        if self.pose_encoding_type == "absT_quaR":
+            self.target_dim = 7
+        else:  # fail here with a clear message instead of an AttributeError on target_dim below
+            raise ValueError(f"Unknown pose encoding {pose_encoding_type}")
+
+        self.embed_pose = PoseEmbedding(
+            target_dim=self.target_dim,
+            out_dim=hidden_size,
+            n_harmonic_functions=10,
+            append_input=True,
+        )
+        self.pose_encoder = Mlp(
+            in_features=self.embed_pose.out_dim,
+            hidden_features=int(hidden_size * mlp_ratio),
+            out_features=hidden_size,
+            drop=0,
+        )
+
+    def forward(self, camera):
+        pose_enc = camera_to_pose_encoding(
+            camera, pose_encoding_type=self.pose_encoding_type
+        ).to(camera.dtype)
+        pose_enc = postprocess_pose(pose_enc, self.pose_mode, inverse=True)
+        pose_feat = self.embed_pose(pose_enc)
+        return self.pose_encoder(pose_feat)  # feature of width hidden_size
+
+
+class HarmonicEmbedding(torch.nn.Module):
+    def __init__(
+        self,
+        n_harmonic_functions: int = 6,
+        omega_0: float = 1.0,
+        logspace: bool = True,
+        append_input: bool = True,
+    ) -> None:
+        """
+        The harmonic embedding layer supports the classical
+        Nerf positional encoding described in
+        `NeRF <https://arxiv.org/abs/2003.08934>`_
+        and the integrated position encoding in
+        `MIP-NeRF <https://arxiv.org/abs/2103.13415>`_.
+
+        During the inference you can provide the extra argument `diag_cov`.
+
+        If `diag_cov is None`, `forward` works directly on the
+        tensor `x` it is given (no ray bundle is involved in
+        this implementation):
+        it converts each feature
+        (i.e. vector along the last dimension) in `x`
+        into a series of harmonic features `embedding`,
+        where for each i in range(dim) the following are present
+        in embedding[...]::
+
+            [
+                sin(f_1*x[..., i]),
+                sin(f_2*x[..., i]),
+                ...
+                sin(f_N * x[..., i]),
+                cos(f_1*x[..., i]),
+                cos(f_2*x[..., i]),
+                ...
+                cos(f_N * x[..., i]),
+                x[..., i],              # only present if append_input is True.
+            ]
+
+        where N corresponds to `n_harmonic_functions`, and f_i is a scalar
+        denoting the i-th frequency of the harmonic embedding.
+
+
+        If `diag_cov is not None`, it approximates
+        conical frustums following a ray bundle as gaussians,
+        defined by x, the means of the gaussians and diag_cov,
+        the diagonal covariances.
+        Then it converts each gaussian
+        into a series of harmonic features `embedding`,
+        where for each i in range(dim) the following are present
+        in embedding[...]::
+
+            [
+                sin(f_1*x[..., i]) * exp(-0.5 * f_1**2 * diag_cov[..., i,]),
+                sin(f_2*x[..., i]) * exp(-0.5 * f_2**2 * diag_cov[..., i,]),
+                ...
+                sin(f_N * x[..., i]) * exp(-0.5 * f_N**2 * diag_cov[..., i,]),
+                cos(f_1*x[..., i]) * exp(-0.5 * f_1**2 * diag_cov[..., i,]),
+                cos(f_2*x[..., i]) * exp(-0.5 * f_2**2 * diag_cov[..., i,]),
+                ...
+                cos(f_N * x[..., i]) * exp(-0.5 * f_N**2 * diag_cov[..., i,]),
+                x[..., i],              # only present if append_input is True.
+            ]
+
+        where N equals `n_harmonic_functions`, and f_i is a scalar
+        denoting the i-th frequency of the harmonic embedding.
+
+        If `logspace==True`, the frequencies `[f_1, ..., f_N]` are
+        powers of 2:
+            `f_1, ..., f_N = 2**torch.arange(n_harmonic_functions)`
+
+        If `logspace==False`, frequencies are linearly spaced between
+        `1.0` and `2**(n_harmonic_functions-1)`:
+            `f_1, ..., f_N = torch.linspace(
+                1.0, 2**(n_harmonic_functions-1), n_harmonic_functions
+            )`
+
+        Note that `x` is also premultiplied by the base frequency `omega_0`
+        before evaluating the harmonic functions.
+
+        Args:
+            n_harmonic_functions: int, number of harmonic
+                features
+            omega_0: float, base frequency
+            logspace: bool, Whether to space the frequencies in
+                logspace or linear space
+            append_input: bool, whether to concat the original
+                input to the harmonic embedding. If true the
+                output is of the form (embed.sin(), embed.cos(), x)
+        """
+        super().__init__()
+
+        if logspace:
+            frequencies = 2.0 ** torch.arange(n_harmonic_functions, dtype=torch.float32)
+        else:
+            frequencies = torch.linspace(
+                1.0,
+                2.0 ** (n_harmonic_functions - 1),
+                n_harmonic_functions,
+                dtype=torch.float32,
+            )
+        # non-persistent buffers: they follow the module across devices but are not saved in the state dict
+        self.register_buffer("_frequencies", frequencies * omega_0, persistent=False)
+        self.register_buffer(
+            "_zero_half_pi",
+            torch.tensor([0.0, 0.5 * torch.pi]),
+            persistent=False,
+        )
+        self.append_input = append_input
+
+    def forward(
+        self, x: torch.Tensor, diag_cov: Optional[torch.Tensor] = None, **kwargs
+    ) -> torch.Tensor:
+        """
+        Args:
+            x: tensor of shape [..., dim]
+            diag_cov: An optional tensor of shape `(..., dim)`
+                representing the diagonal covariance matrices of our Gaussians, joined with x
+                as means of the Gaussians.
+
+        Returns:
+            embedding: a harmonic embedding of `x` of shape
+            [..., (n_harmonic_functions * 2 + int(append_input)) * dim]
+        """
+        # one scaled copy of every input coordinate per frequency
+        embed = x[..., None] * self._frequencies  # [..., dim, n_harmonic_functions]
+        # phases 0 and pi/2 on a new axis: sin(t + pi/2) == cos(t), so a single sin() yields both halves
+        embed = embed[..., None, :, :] + self._zero_half_pi[..., None, None]
+
+        embed = embed.sin()
+        if diag_cov is not None:
+            x_var = diag_cov[..., None] * torch.pow(self._frequencies, 2)
+            exp_var = torch.exp(-0.5 * x_var)
+            # the damping factor is broadcast over the sin/cos axis
+            embed = embed * exp_var[..., None, :, :]
+
+        embed = embed.reshape(*x.shape[:-1], -1)
+
+        if self.append_input:
+            return torch.cat([embed, x], dim=-1)
+        return embed
+
+    @staticmethod
+    def get_output_dim_static(
+        input_dims: int, n_harmonic_functions: int, append_input: bool
+    ) -> int:
+        """
+        Utility to help predict the shape of the output of `forward`.
+
+        Args:
+            input_dims: length of the last dimension of the input tensor
+            n_harmonic_functions: number of embedding frequencies
+            append_input: whether or not to concat the original
+                input to the harmonic embedding
+        Returns:
+            int: the length of the last dimension of the output tensor
+        """
+        return input_dims * (2 * n_harmonic_functions + int(append_input))
+
+    def get_output_dim(self, input_dims: int = 3) -> int:
+        """
+        Same as above. The default for input_dims is 3 for 3D applications
+        which use harmonic embedding for positional encoding,
+        so the input might be xyz.
+        """
+        return self.get_output_dim_static(
+            input_dims, len(self._frequencies), self.append_input
+        )
+
+
+class PoseEmbedding(nn.Module):
+    """Thin wrapper that applies a HarmonicEmbedding to a pose encoding."""
+
+    def __init__(self, target_dim, out_dim, n_harmonic_functions=10, append_input=True):
+        super().__init__()
+        self._emb_pose = HarmonicEmbedding(
+            n_harmonic_functions=n_harmonic_functions, append_input=append_input
+        )
+        # the out_dim argument is not used: the stored width is derived from target_dim
+        self.out_dim = self._emb_pose.get_output_dim(target_dim)
+
+    def forward(self, pose_encoding):
+        return self._emb_pose(pose_encoding)
+
+
+def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor:
+    """
+    Elementwise sqrt of the positive entries of x; entries <= 0 map to 0
+    (sqrt is only evaluated where x > 0, so x == 0 gets a zero subgradient).
+    """
+    is_pos = x > 0
+    out = torch.zeros_like(x)
+    out[is_pos] = x[is_pos].sqrt()
+    return out
+
+
+def matrix_to_quaternion(matrix: torch.Tensor) -> torch.Tensor:
+    """
+    Convert rotations given as rotation matrices to quaternions.
+
+    Args:
+        matrix: Rotation matrices as tensor of shape (..., 3, 3).
+
+    Returns:
+        quaternions with real part first, as tensor of shape (..., 4).
+    """
+    if matrix.size(-1) != 3 or matrix.size(-2) != 3:
+        raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")
+
+    batch_dim = matrix.shape[:-2]
+    m00, m01, m02, m10, m11, m12, m20, m21, m22 = torch.unbind(
+        matrix.reshape(batch_dim + (9,)), dim=-1
+    )
+    # four candidate magnitudes (one per quaternion component), clamped at zero before the sqrt
+    q_abs = _sqrt_positive_part(
+        torch.stack(
+            [
+                1.0 + m00 + m11 + m22,
+                1.0 + m00 - m11 - m22,
+                1.0 - m00 + m11 - m22,
+                1.0 - m00 - m11 + m22,
+            ],
+            dim=-1,
+        )
+    )
+    # row n is one candidate quaternion, still scaled by q_abs[..., n]; the scale is divided out below
+    quat_by_rijk = torch.stack(
+        [
+            torch.stack([q_abs[..., 0] ** 2, m21 - m12, m02 - m20, m10 - m01], dim=-1),
+            torch.stack([m21 - m12, q_abs[..., 1] ** 2, m10 + m01, m02 + m20], dim=-1),
+            torch.stack([m02 - m20, m10 + m01, q_abs[..., 2] ** 2, m12 + m21], dim=-1),
+            torch.stack([m10 - m01, m20 + m02, m21 + m12, q_abs[..., 3] ** 2], dim=-1),
+        ],
+        dim=-2,
+    )
+    # the divisor is floored at 0.1 so that small q_abs entries do not blow up the division
+    flr = torch.tensor(0.1).to(dtype=q_abs.dtype, device=q_abs.device)
+    quat_candidates = quat_by_rijk / (2.0 * q_abs[..., None].max(flr))
+    # keep, for every matrix, the candidate whose q_abs entry is the largest
+    out = quat_candidates[
+        F.one_hot(q_abs.argmax(dim=-1), num_classes=4) > 0.5, :
+    ].reshape(batch_dim + (4,))
+    return standardize_quaternion(out)
+
+
+def standardize_quaternion(quaternions: torch.Tensor) -> torch.Tensor:
+    """
+    Normalize quaternions to unit length and flip the sign of those whose
+    real part is negative, so the real part of the result is non negative.
+
+    Args:
+        quaternions: Quaternions with real part first, shape (..., 4).
+
+    Returns:
+        Standardized quaternions as tensor of shape (..., 4).
+    """
+    unit = F.normalize(quaternions, p=2, dim=-1)
+    negative_real = unit[..., 0:1] < 0
+    return torch.where(negative_real, -unit, unit)
+
+
+def camera_to_pose_encoding(
+    camera,
+    pose_encoding_type="absT_quaR",
+):
+    """
+    Inverse to pose_encoding_to_camera
+    camera: opencv, cam2world
+    Returns the translation column followed by the rotation quaternion, concatenated on the last dim.
+    """
+    # guard clause: "absT_quaR" is the only encoding implemented
+    if pose_encoding_type != "absT_quaR":
+        raise ValueError(f"Unknown pose encoding {pose_encoding_type}")
+
+    rotation_quat = matrix_to_quaternion(camera[:, :3, :3])
+    translation = camera[:, :3, 3]
+
+    return torch.cat([translation, rotation_quat], dim=-1)
+
+
+def quaternion_to_matrix(quaternions: torch.Tensor) -> torch.Tensor:
+    """
+    Build rotation matrices from quaternions (real part first).
+
+    The input does not have to be normalized: every product is scaled by
+    2 / sum(q * q).
+
+    Args:
+        quaternions: tensor of shape (..., 4), ordered (r, i, j, k).
+
+    Returns:
+        Rotation matrices as tensor of shape (..., 3, 3).
+    """
+    w, x, y, z = quaternions.unbind(-1)
+    scale = 2.0 / (quaternions * quaternions).sum(-1)
+
+    # the nine matrix entries in row-major order
+    entries = [
+        1 - scale * (y * y + z * z),
+        scale * (x * y - z * w),
+        scale * (x * z + y * w),
+        scale * (x * y + z * w),
+        1 - scale * (x * x + z * z),
+        scale * (y * z - x * w),
+        scale * (x * z - y * w),
+        scale * (y * z + x * w),
+        1 - scale * (x * x + y * y),
+    ]
+    flat = torch.stack(entries, -1)
+    return flat.reshape(quaternions.shape[:-1] + (3, 3))
+
+
+def pose_encoding_to_camera(
+    pose_encoding,
+    pose_encoding_type="absT_quaR",
+):
+    """
+    Args:
+        pose_encoding: A tensor of shape `BxC`, containing a batch of
+                        `B` `C`-dimensional pose encodings.
+        pose_encoding_type: The type of pose encoding,
+    Returns a `Bx4x4` batch of matrices: rotation in [:3, :3], translation in [:3, 3].
+    """
+    if pose_encoding_type != "absT_quaR":
+        raise ValueError(f"Unknown pose encoding {pose_encoding_type}")
+
+    # layout of the encoding: 3 translation values, then a 4-value quaternion
+    translation = pose_encoding[:, :3]
+    rotation = quaternion_to_matrix(pose_encoding[:, 3:7])
+
+    # start from a batch of identities, then fill in the rotation / translation blocks
+    identity = torch.eye(4, 4).to(rotation.dtype).to(rotation.device)
+    c2w_mats = identity[None].repeat(len(rotation), 1, 1)
+    c2w_mats[:, :3, :3] = rotation
+    c2w_mats[:, :3, 3] = translation
+
+    return c2w_mats
+
+
+def quaternion_conjugate(q):
+    """Compute the conjugate of quaternion q (w, x, y, z)."""
+    real, imag = q[..., :1], q[..., 1:]
+    # only the entries after the first one change sign
+    return torch.cat([real, -imag], dim=-1)
+
+
+def quaternion_multiply(q1, q2):
+    """Multiply two quaternions q1 and q2 (components ordered w, x, y, z on the last dim)."""
+    a_w, a_x, a_y, a_z = torch.unbind(q1, dim=-1)
+    b_w, b_x, b_y, b_z = torch.unbind(q2, dim=-1)
+    product = (
+        a_w * b_w - a_x * b_x - a_y * b_y - a_z * b_z,
+        a_w * b_x + a_x * b_w + a_y * b_z - a_z * b_y,
+        a_w * b_y - a_x * b_z + a_y * b_w + a_z * b_x,
+        a_w * b_z + a_x * b_y - a_y * b_x + a_z * b_w,
+    )
+    return torch.stack(product, dim=-1)
+
+
+def rotate_vector(q, v):
+    """Rotate vector v by quaternion q."""
+    # split q into its first entry (scalar w) and the remaining entries (vector u)
+    scalar, axis = q[..., :1], q[..., 1:]
+    # v' = v + w * t + u x t   with   t = 2 * (u x v)
+    twice_cross = 2.0 * torch.cross(axis, v, dim=-1)
+    rotated = v + scalar * twice_cross + torch.cross(axis, twice_cross, dim=-1)
+    return rotated
+
+
+def relative_pose_absT_quatR(t1, q1, t2, q2):
+    """Compute the relative translation and quaternion between two poses."""
+    # the conjugate of q1 is used as its inverse
+    inv_q1 = quaternion_conjugate(q1)
+
+    # translation difference, rotated by the conjugate of q1
+    rel_translation = rotate_vector(inv_q1, t2 - t1)
+    # conj(q1) * q2
+    rel_rotation = quaternion_multiply(inv_q1, q2)
+    return rel_translation, rel_rotation
diff --git a/extern/CUT3R/src/dust3r/utils/device.py b/extern/CUT3R/src/dust3r/utils/device.py
new file mode 100755
index 0000000000000000000000000000000000000000..ad5e8a44a0e634b4590695063f028847818bf12f
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/utils/device.py
@@ -0,0 +1,88 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# modified from DUSt3R
+
+import numpy as np
+import torch
+
+
+def todevice(batch, device, callback=None, non_blocking=False):
+    """Transfer some variables to another device (i.e. GPU, CPU:torch, CPU:numpy).
+
+    batch: list, tuple, dict of tensors or other things
+    device: pytorch device or 'numpy'
+    callback: function that would be called on every sub-elements.
+    non_blocking: forwarded to Tensor.to() for every tensor that gets moved.
+    """
+    if callback:
+        batch = callback(batch)
+    # forward callback / non_blocking when recursing, so nested values honour them too
+    if isinstance(batch, dict):
+        return {k: todevice(v, device, callback, non_blocking) for k, v in batch.items()}
+    if isinstance(batch, (tuple, list)):
+        return type(batch)(todevice(x, device, callback, non_blocking) for x in batch)
+
+    x = batch
+    if device == "numpy":
+        if isinstance(x, torch.Tensor):
+            x = x.detach().cpu().numpy()
+    elif x is not None:
+        if isinstance(x, np.ndarray):
+            x = torch.from_numpy(x)
+        if torch.is_tensor(x):
+            x = x.to(device, non_blocking=non_blocking)
+    return x
+
+
+to_device = todevice  # alias
+
+
+def to_numpy(x):  # convert every torch tensor found in x (dicts/lists/tuples are recursed) to a numpy array
+    return todevice(x, "numpy")
+
+
+def to_cpu(x):  # move every tensor / numpy array found in x to the "cpu" device, as torch tensors
+    return todevice(x, "cpu")
+
+
+def to_cuda(x):  # move every tensor / numpy array found in x to the "cuda" device, as torch tensors
+    return todevice(x, "cuda")
+
+
+def collate_with_cat(whatever, lists=False):
+    """Recursively merge a sequence of samples; tensors/arrays are concatenated (or listified if lists)."""
+    if isinstance(whatever, dict):
+        return {k: collate_with_cat(vals, lists=lists) for k, vals in whatever.items()}
+    if not isinstance(whatever, (tuple, list)):
+        return None  # any other input type is not collated
+    if len(whatever) == 0:
+        return whatever
+
+    first = whatever[0]
+    container = type(whatever)
+    # the type of the first element decides how the whole sequence is merged
+    if first is None:
+        return None
+    if isinstance(first, (bool, float, int, str)):
+        return whatever
+    if isinstance(first, tuple):
+        return container(collate_with_cat(x, lists=lists) for x in zip(*whatever))
+    if isinstance(first, dict):
+        return {
+            k: collate_with_cat([e[k] for e in whatever], lists=lists) for k in first
+        }
+    if isinstance(first, torch.Tensor):
+        return listify(whatever) if lists else torch.cat(whatever)
+    if isinstance(first, np.ndarray):
+        if lists:
+            return listify(whatever)
+        return torch.cat([torch.from_numpy(x) for x in whatever])
+
+    # anything else (e.g. nested lists) is concatenated with +
+    return sum(whatever, container())
+
+
+def listify(elems):  # flatten exactly one nesting level into a new list
+    return [item for group in elems for item in group]
diff --git a/extern/CUT3R/src/dust3r/utils/geometry.py b/extern/CUT3R/src/dust3r/utils/geometry.py
new file mode 100755
index 0000000000000000000000000000000000000000..1c103094978f777e4cf3fa79b2f6cdf7aa4075cd
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/utils/geometry.py
@@ -0,0 +1,555 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# modified from DUSt3R
+
+import torch
+import numpy as np
+from scipy.spatial import cKDTree as KDTree
+
+from dust3r.utils.misc import invalid_to_zeros, invalid_to_nans
+from dust3r.utils.device import to_numpy
+
+
+def xy_grid(
+    W,
+    H,
+    device=None,
+    origin=(0, 0),
+    unsqueeze=None,
+    cat_dim=-1,
+    homogeneous=False,
+    **arange_kw,
+):
+    """Build a grid of pixel coordinates.
+
+    With the default arguments the output is a (H,W,2) array
+    with output[j,i,0] = i + origin[0]
+         output[j,i,1] = j + origin[1]
+
+    Args:
+        W, H: width and height of the grid.
+        device: None -> NumPy arrays are produced; anything else -> torch
+            tensors created on that device.
+        origin: (x0, y0) offsets of the first column / first row.
+        unsqueeze: if not None, a singleton axis is inserted at this position
+            in every component of the grid.
+        cat_dim: axis along which the components are stacked. When None, the
+            components are returned as a tuple instead.
+        homogeneous: if True, a third component filled with ones is appended.
+        **arange_kw: forwarded to `arange` (e.g. dtype).
+
+    Returns:
+        The stacked grid, or a tuple of components when cat_dim is None.
+    """
+    use_numpy = device is None
+    if use_numpy:
+        tw, th = [np.arange(o, o + s, **arange_kw) for s, o in zip((W, H), origin)]
+        grid = np.meshgrid(tw, th, indexing="xy")
+    else:
+        tw, th = [
+            torch.arange(o, o + s, device=device, **arange_kw)
+            for s, o in zip((W, H), origin)
+        ]
+        grid = torch.meshgrid(tw, th, indexing="xy")
+
+    # meshgrid may hand back a list; normalise to a tuple so that a component
+    # can be appended regardless of the backend/version.
+    grid = tuple(grid)
+
+    if homogeneous:
+        ones = np.ones((H, W)) if use_numpy else torch.ones((H, W), device=device)
+        grid = grid + (ones,)
+    if unsqueeze is not None:
+        # Apply to every component (including the homogeneous one), and use the
+        # NumPy equivalent of `unsqueeze` on the NumPy path.
+        if use_numpy:
+            grid = tuple(np.expand_dims(g, unsqueeze) for g in grid)
+        else:
+            grid = tuple(g.unsqueeze(unsqueeze) for g in grid)
+    if cat_dim is not None:
+        grid = (np.stack if use_numpy else torch.stack)(grid, cat_dim)
+    return grid
+
+
+def geotrf(Trf, pts, ncol=None, norm=False):
+    """Apply a geometric transformation to a set of points.
+
+    Args:
+        Trf: numpy array or torch tensor with at least 2 dims; the last two
+            dims hold the transformation matrix (upstream doc: a 3x3 or 4x4
+            projection matrix, typically a homography). Leading dims, if any,
+            are batch dims.
+        pts: numpy/torch/tuple of coordinates, converted to the same array
+            family as Trf. Upstream doc: shape must be (...,2) or (...,3).
+        ncol: int. number of columns of the result (defaults to pts.shape[-1]).
+        norm: float. if != 0, the result is divided by its last coordinate and
+            multiplied by `norm`, i.e. projected on the z=norm plane.
+
+    Returns:
+        The transformed points, reshaped to pts.shape[:-1] + (ncol,).
+
+    Raises:
+        ValueError: in the batched torch fast path, when the matrix size is
+            neither d nor d+1 for d-dimensional points.
+    """
+    assert Trf.ndim >= 2
+    # Bring pts into the same array family (numpy / torch) as Trf.
+    if isinstance(Trf, np.ndarray):
+        pts = np.asarray(pts)
+    elif isinstance(Trf, torch.Tensor):
+        pts = torch.as_tensor(pts, dtype=Trf.dtype)
+
+    # Remember the leading shape so the result can be restored at the end.
+    output_reshape = pts.shape[:-1]
+    ncol = ncol or pts.shape[-1]
+
+    # Fast path: one matrix per batch item applied to a 4-dim point tensor
+    # (indexed as b,h,w,d by the einsum below).
+    if (
+        isinstance(Trf, torch.Tensor)
+        and isinstance(pts, torch.Tensor)
+        and Trf.ndim == 3
+        and pts.ndim == 4
+    ):
+        d = pts.shape[3]
+        if Trf.shape[-1] == d:
+            # Purely linear transform.
+            pts = torch.einsum("bij, bhwj -> bhwi", Trf, pts)
+        elif Trf.shape[-1] == d + 1:
+            # Affine transform: linear part plus the translation column.
+            pts = (
+                torch.einsum("bij, bhwj -> bhwi", Trf[:, :d, :d], pts)
+                + Trf[:, None, None, :d, d]
+            )
+        else:
+            raise ValueError(f"bad shape, not ending with 3 or 4, for {pts.shape=}")
+    else:
+        if Trf.ndim >= 3:
+            # Flatten all batch dims of Trf into one and align pts with it.
+            n = Trf.ndim - 2
+            assert Trf.shape[:n] == pts.shape[:n], "batch size does not match"
+            Trf = Trf.reshape(-1, Trf.shape[-2], Trf.shape[-1])
+
+            if pts.ndim > Trf.ndim:
+
+                pts = pts.reshape(Trf.shape[0], -1, pts.shape[-1])
+            elif pts.ndim == 2:
+
+                pts = pts[:, None, :]
+
+        if pts.shape[-1] + 1 == Trf.shape[-1]:
+            # Matrix is one larger than the points: apply as an affine map
+            # (last row of the transposed matrix is the translation).
+            Trf = Trf.swapaxes(-1, -2)  # transpose Trf
+            pts = pts @ Trf[..., :-1, :] + Trf[..., -1:, :]
+        elif pts.shape[-1] == Trf.shape[-1]:
+            Trf = Trf.swapaxes(-1, -2)  # transpose Trf
+            pts = pts @ Trf
+        else:
+            # Fallback for any other size combination.
+            pts = Trf @ pts.T
+            if pts.ndim >= 2:
+                pts = pts.swapaxes(-1, -2)
+
+    if norm:
+        pts = pts / pts[..., -1:]  # DONT DO /= BECAUSE OF WEIRD PYTORCH BUG
+        if norm != 1:
+            pts *= norm
+
+    # Keep the first ncol coordinates and restore the original leading shape.
+    res = pts[..., :ncol].reshape(*output_reshape, ncol)
+    return res
+
+
+def inv(mat):
+    """Return the inverse of `mat`, dispatching on torch tensor vs. numpy array.
+
+    Raises:
+        ValueError: if `mat` is neither a torch.Tensor nor a np.ndarray.
+    """
+    backends = (
+        (torch.Tensor, torch.linalg.inv),
+        (np.ndarray, np.linalg.inv),
+    )
+    for array_type, invert in backends:
+        if isinstance(mat, array_type):
+            return invert(mat)
+    raise ValueError(f"bad matrix type = {type(mat)}")
+
+
+def depthmap_to_pts3d(depth, pseudo_focal, pp=None, **_):
+    """Back-project depth maps to 3D points using per-pixel focal lengths.
+
+    Args:
+        - depthmap (BxHxW array), or BxHxWxn for n depth values per pixel
+        - pseudo_focal: [B,H,W] ; [B,2,H,W] or [B,1,H,W]
+        - pp: optional principal point, indexed as pp[:, 0] (x) and pp[:, 1] (y);
+          when None the grid is centered on ((W-1)/2, (H-1)/2)
+        - **_: extra keyword arguments are accepted and ignored
+    Returns:
+        pointmap of absolute coordinates (BxHxWx3 array), or BxHxWx3xn when
+        the depth has a trailing dimension n
+    Raises:
+        NotImplementedError: if pseudo_focal is neither 3- nor 4-dimensional.
+    """
+
+    if len(depth.shape) == 4:
+        B, H, W, n = depth.shape
+    else:
+        B, H, W = depth.shape
+        n = None
+
+    # Split the focal input into an x and a y focal map (shared when only one
+    # channel is provided).
+    if len(pseudo_focal.shape) == 3:  # [B,H,W]
+        pseudo_focalx = pseudo_focaly = pseudo_focal
+    elif len(pseudo_focal.shape) == 4:  # [B,2,H,W] or [B,1,H,W]
+        pseudo_focalx = pseudo_focal[:, 0]
+        if pseudo_focal.shape[1] == 2:
+            pseudo_focaly = pseudo_focal[:, 1]
+        else:
+            pseudo_focaly = pseudo_focalx
+    else:
+        raise NotImplementedError("Error, unknown input focal shape format.")
+
+    assert pseudo_focalx.shape == depth.shape[:3]
+    assert pseudo_focaly.shape == depth.shape[:3]
+    # Pixel coordinates stacked on axis 0; [:, None] adds a broadcastable batch axis.
+    grid_x, grid_y = xy_grid(W, H, cat_dim=0, device=depth.device)[:, None]
+
+    # Express pixel coordinates relative to the principal point.
+    if pp is None:
+        grid_x = grid_x - (W - 1) / 2
+        grid_y = grid_y - (H - 1) / 2
+    else:
+        grid_x = grid_x.expand(B, -1, -1) - pp[:, 0, None, None]
+        grid_y = grid_y.expand(B, -1, -1) - pp[:, 1, None, None]
+
+    # NOTE(review): the output buffer is created with torch's default dtype,
+    # not depth.dtype; confirm that this is intended.
+    if n is None:
+        pts3d = torch.empty((B, H, W, 3), device=depth.device)
+        pts3d[..., 0] = depth * grid_x / pseudo_focalx
+        pts3d[..., 1] = depth * grid_y / pseudo_focaly
+        pts3d[..., 2] = depth
+    else:
+        pts3d = torch.empty((B, H, W, 3, n), device=depth.device)
+        pts3d[..., 0, :] = depth * (grid_x / pseudo_focalx)[..., None]
+        pts3d[..., 1, :] = depth * (grid_y / pseudo_focaly)[..., None]
+        pts3d[..., 2, :] = depth
+    return pts3d
+
+
+def depthmap_to_camera_coordinates(depthmap, camera_intrinsics, pseudo_focal=None):
+    """Back-project a depth map into the camera frame.
+
+    Args:
+        - depthmap (HxW array):
+        - camera_intrinsics: a 3x3 matrix (its skew terms must be zero)
+        - pseudo_focal: optional HxW array of per-pixel focals that replaces
+          the focals of the intrinsics for both axes
+    Returns:
+        pointmap in camera coordinates (HxWx3 float32 array), and a mask
+        specifying valid pixels (depth > 0).
+    """
+    K = np.float32(camera_intrinsics)
+    H, W = depthmap.shape
+
+    # Skew is not supported.
+    assert K[0, 1] == 0.0
+    assert K[1, 0] == 0.0
+
+    if pseudo_focal is not None:
+        assert pseudo_focal.shape == (H, W)
+        fu = fv = pseudo_focal
+    else:
+        fu, fv = K[0, 0], K[1, 1]
+    cu, cv = K[0, 2], K[1, 2]
+
+    cols, rows = np.meshgrid(np.arange(W), np.arange(H))
+    x_cam = (cols - cu) * depthmap / fu
+    y_cam = (rows - cv) * depthmap / fv
+    X_cam = np.stack((x_cam, y_cam, depthmap), axis=-1).astype(np.float32)
+
+    return X_cam, depthmap > 0.0
+
+
+def depthmap_to_absolute_camera_coordinates(
+    depthmap, camera_intrinsics, camera_pose, **kw
+):
+    """Back-project a depth map and move the points to world coordinates.
+
+    Args:
+        - depthmap (HxW array):
+        - camera_intrinsics: a 3x3 matrix
+        - camera_pose: a 4x3 or 4x4 cam2world matrix, or None to stay in the
+          camera frame
+        - **kw: accepted but not used by this function
+    Returns:
+        pointmap of absolute coordinates (HxWx3 array), and a mask specifying valid pixels.
+    """
+    X_cam, valid_mask = depthmap_to_camera_coordinates(depthmap, camera_intrinsics)
+
+    # Without a pose the camera-frame points are returned as they are.
+    if camera_pose is None:
+        return X_cam, valid_mask
+
+    rotation = camera_pose[:3, :3]
+    translation = camera_pose[:3, 3]
+    rotated = np.einsum("ik, vuk -> vui", rotation, X_cam)
+    X_world = rotated + translation[None, None, :]
+
+    return X_world, valid_mask
+
+
+def colmap_to_opencv_intrinsics(K):
+    """
+    Return a copy of the intrinsics K converted from the Colmap to the OpenCV
+    pixel convention (the input is left untouched).
+    The center of the top-left pixel is located at:
+    - (0.5, 0.5) in Colmap
+    - (0,0) in OpenCV
+    so the principal point is shifted by -0.5 along both axes.
+    """
+    shifted = K.copy()
+    for row in (0, 1):
+        shifted[row, 2] -= 0.5
+    return shifted
+
+
+def opencv_to_colmap_intrinsics(K):
+    """
+    Return a copy of the intrinsics K converted from the OpenCV to the Colmap
+    pixel convention (the input is left untouched).
+    The center of the top-left pixel is located at:
+    - (0.5, 0.5) in Colmap
+    - (0,0) in OpenCV
+    so the principal point is shifted by +0.5 along both axes.
+    """
+    shifted = K.copy()
+    for row in (0, 1):
+        shifted[row, 2] += 0.5
+    return shifted
+
+
+def normalize_pointcloud(
+    pts1, pts2, norm_mode="avg_dis", valid1=None, valid2=None, ret_factor=False
+):
+    """Rescale the pointmaps pts1 (and pts2, when given) by one shared factor per batch item.
+
+    Args:
+        pts1: tensor with at least 3 dims whose last dim is 3.
+        pts2: same kind of tensor, or None.
+        norm_mode: "<norm>_<dis>" string.
+            <norm> is "avg", "median" or "sqrt".
+            <dis> is "dis", "log1p" or "warp-log1p"; it is only interpreted
+            when <norm> is "avg".
+        valid1, valid2: optional validity masks forwarded to
+            invalid_to_zeros / invalid_to_nans.
+        ret_factor: also return the normalization factor.
+
+    Returns:
+        pts1 / factor, or (pts1 / factor, pts2 / factor) when pts2 is given;
+        with ret_factor=True the factor is appended as the last tuple item.
+
+    Raises:
+        ValueError: on an unknown <norm> or <dis> component.
+    """
+    assert pts1.ndim >= 3 and pts1.shape[-1] == 3
+    assert pts2 is None or (pts2.ndim >= 3 and pts2.shape[-1] == 3)
+    norm_mode, dis_mode = norm_mode.split("_")
+
+    if norm_mode == "avg":
+        # Invalid points are zeroed so that they do not contribute to the sum;
+        # nnz* hold the number of valid points used as the denominator.
+        nan_pts1, nnz1 = invalid_to_zeros(pts1, valid1, ndim=3)
+        nan_pts2, nnz2 = (
+            invalid_to_zeros(pts2, valid2, ndim=3) if pts2 is not None else (None, 0)
+        )
+        all_pts = (
+            torch.cat((nan_pts1, nan_pts2), dim=1) if pts2 is not None else nan_pts1
+        )
+
+        all_dis = all_pts.norm(dim=-1)
+        if dis_mode == "dis":
+            pass  # do nothing
+        elif dis_mode == "log1p":
+            all_dis = torch.log1p(all_dis)
+        elif dis_mode == "warp-log1p":
+            # Warp the points themselves so that their norm becomes log1p(norm).
+            log_dis = torch.log1p(all_dis)
+            warp_factor = log_dis / all_dis.clip(min=1e-8)
+            H1, W1 = pts1.shape[1:-1]
+            pts1 = pts1 * warp_factor[:, : W1 * H1].view(-1, H1, W1, 1)
+            if pts2 is not None:
+                H2, W2 = pts2.shape[1:-1]
+                pts2 = pts2 * warp_factor[:, W1 * H1 :].view(-1, H2, W2, 1)
+            all_dis = log_dis  # this is their true distance afterwards
+        else:
+            raise ValueError(f"bad {dis_mode=}")
+
+        norm_factor = all_dis.sum(dim=1) / (nnz1 + nnz2 + 1e-8)
+    else:
+        # Invalid points become NaN and are skipped by the nan-aware reductions.
+        nan_pts1 = invalid_to_nans(pts1, valid1, ndim=3)
+        nan_pts2 = invalid_to_nans(pts2, valid2, ndim=3) if pts2 is not None else None
+        all_pts = (
+            torch.cat((nan_pts1, nan_pts2), dim=1) if pts2 is not None else nan_pts1
+        )
+
+        all_dis = all_pts.norm(dim=-1)
+
+        # ("avg" never reaches this branch: it is handled above.)
+        if norm_mode == "median":
+            norm_factor = all_dis.nanmedian(dim=1).values.detach()
+        elif norm_mode == "sqrt":
+            norm_factor = all_dis.sqrt().nanmean(dim=1) ** 2
+        else:
+            raise ValueError(f"bad {norm_mode=}")
+
+    norm_factor = norm_factor.clip(min=1e-8)
+    while norm_factor.ndim < pts1.ndim:
+        norm_factor.unsqueeze_(-1)
+
+    res = pts1 / norm_factor
+    if pts2 is not None:
+        res = (res, pts2 / norm_factor)
+    if ret_factor:
+        # With pts2=None `res` is a bare tensor; wrap it so that the factor can
+        # be appended instead of raising on tensor + tuple.
+        if not isinstance(res, tuple):
+            res = (res,)
+        res = res + (norm_factor,)
+    return res
+
+
+def normalize_pointcloud_group(
+    pts_list,
+    norm_mode="avg_dis",
+    valid_list=None,
+    conf_list=None,
+    ret_factor=False,
+    ret_factor_only=False,
+):
+    """Rescale every pointmap of pts_list by one shared factor per batch item.
+
+    Args:
+        pts_list: sequence of tensors with at least 3 dims whose last dim is 3.
+        norm_mode: "<norm>_<dis>" string.
+            <norm> is "avg", "median" or "sqrt".
+            <dis> is "dis", "log1p" or "warp-log1p"; it is only interpreted
+            when <norm> is "avg".
+        valid_list: optional sequence of validity masks (one per pointmap);
+            None means that no mask is applied.
+        conf_list: optional sequence of per-point confidences used as weights
+            of the average ("avg" only).
+        ret_factor: return (normalized_list, factor).
+        ret_factor_only: return only the factor.
+
+    Raises:
+        ValueError: on an unknown <norm> or <dis> component.
+    """
+    for pts in pts_list:
+        assert pts.ndim >= 3 and pts.shape[-1] == 3
+
+    # The default used to crash inside zip(); treat it as "no mask anywhere".
+    if valid_list is None:
+        valid_list = [None] * len(pts_list)
+
+    norm_mode, dis_mode = norm_mode.split("_")
+
+    if norm_mode == "avg":
+        # Invalid points are zeroed so that they do not contribute to the sum.
+        nan_pts_list, nnz_list = zip(
+            *[
+                invalid_to_zeros(pts1, valid1, ndim=3)
+                for pts1, valid1 in zip(pts_list, valid_list)
+            ]
+        )
+        all_pts = torch.cat(nan_pts_list, dim=1)
+        if conf_list is not None:
+            nan_conf_list = [
+                invalid_to_zeros(conf1[..., None], valid1, ndim=3)[0]
+                for conf1, valid1 in zip(conf_list, valid_list)
+            ]
+            all_conf = torch.cat(nan_conf_list, dim=1)[..., 0]
+        else:
+            all_conf = torch.ones_like(all_pts[..., 0])
+
+        all_dis = all_pts.norm(dim=-1)
+        if dis_mode == "dis":
+            pass  # do nothing
+        elif dis_mode == "log1p":
+            all_dis = torch.log1p(all_dis)
+        elif dis_mode == "warp-log1p":
+            # Warp the points themselves so that their norm becomes log1p(norm).
+            log_dis = torch.log1p(all_dis)
+            warp_factor = log_dis / all_dis.clip(min=1e-8)
+            # Each pointmap owns a contiguous run of H*W columns of warp_factor.
+            warped_list = []
+            start = 0
+            for pts in pts_list:
+                H, W = pts.shape[1:-1]
+                stop = start + H * W
+                warped_list.append(pts * warp_factor[:, start:stop].view(-1, H, W, 1))
+                start = stop
+            pts_list = warped_list
+            all_dis = log_dis  # this is their true distance afterwards
+        else:
+            raise ValueError(f"bad {dis_mode=}")
+
+        norm_factor = (all_conf * all_dis).sum(dim=1) / (all_conf.sum(dim=1) + 1e-8)
+    else:
+        # Invalid points become NaN and are skipped by the nan-aware reductions.
+        nan_pts_list = [
+            invalid_to_nans(pts1, valid1, ndim=3)
+            for pts1, valid1 in zip(pts_list, valid_list)
+        ]
+
+        all_pts = torch.cat(nan_pts_list, dim=1)
+
+        all_dis = all_pts.norm(dim=-1)
+
+        # ("avg" never reaches this branch: it is handled above.)
+        if norm_mode == "median":
+            norm_factor = all_dis.nanmedian(dim=1).values.detach()
+        elif norm_mode == "sqrt":
+            norm_factor = all_dis.sqrt().nanmean(dim=1) ** 2
+        else:
+            raise ValueError(f"bad {norm_mode=}")
+
+    norm_factor = norm_factor.clip(min=1e-8)
+    while norm_factor.ndim < pts_list[0].ndim:
+        norm_factor.unsqueeze_(-1)
+
+    if ret_factor_only:
+        return norm_factor
+
+    res = [pts / norm_factor for pts in pts_list]
+    if ret_factor:
+        return res, norm_factor
+    return res
+
+
+@torch.no_grad()
+def get_joint_pointcloud_depth(z1, z2, valid_mask1, valid_mask2=None, quantile=0.5):
+    """Return, per batch item, a quantile of the valid depths of z1 and z2 pooled together.
+
+    Invalid entries are replaced by NaN and ignored by the nan-aware
+    reductions. z2 may be None, in which case only z1 is used. The median is
+    used when quantile == 0.5, torch.nanquantile otherwise.
+    """
+    flat = [invalid_to_nans(z1, valid_mask1).reshape(len(z1), -1)]
+    if z2 is not None:
+        flat.append(invalid_to_nans(z2, valid_mask2).reshape(len(z2), -1))
+    pooled = torch.cat(flat, dim=-1) if z2 is not None else flat[0]
+
+    if quantile != 0.5:
+        return torch.nanquantile(pooled, quantile, dim=-1)  # (B,)
+    return torch.nanmedian(pooled, dim=-1).values  # (B,)
+
+
+@torch.no_grad()
+def get_group_pointcloud_depth(zs, valid_masks, quantile=0.5):
+    """Return, per batch item, a quantile of the valid depths of all maps in `zs` pooled together.
+
+    Each map is paired with its mask from `valid_masks`; invalid entries are
+    replaced by NaN and ignored by the nan-aware reductions. The median is
+    used when quantile == 0.5, torch.nanquantile otherwise.
+    """
+    flat = []
+    for depth, mask in zip(zs, valid_masks):
+        flat.append(invalid_to_nans(depth, mask).reshape(len(depth), -1))
+    pooled = torch.cat(flat, dim=-1)
+
+    if quantile != 0.5:
+        return torch.nanquantile(pooled, quantile, dim=-1)  # (B,)
+    return torch.nanmedian(pooled, dim=-1).values  # (B,)
+
+
+@torch.no_grad()
+def get_joint_pointcloud_center_scale(
+    pts1, pts2, valid_mask1=None, valid_mask2=None, z_only=False, center=True
+):
+    """Compute a robust center and scale of pts1 and pts2 pooled together.
+
+    The center is the per-coordinate nan-median of the valid points (X and Y
+    are forced to 0 when z_only is set). The scale is the nan-median of the
+    point norms, measured relative to the center when `center` is True.
+    pts2 may be None.
+
+    Returns:
+        (center, scale) indexed to 4 dims: center[:, None, :, :] and
+        scale[:, None, None, None].
+    """
+    clouds = [invalid_to_nans(pts1, valid_mask1).reshape(len(pts1), -1, 3)]
+    if pts2 is not None:
+        clouds.append(invalid_to_nans(pts2, valid_mask2).reshape(len(pts2), -1, 3))
+    merged = torch.cat(clouds, dim=1) if pts2 is not None else clouds[0]
+
+    median_pt = torch.nanmedian(merged, dim=1, keepdim=True).values  # (B,1,3)
+    if z_only:
+        median_pt[..., :2] = 0  # do not center X and Y
+
+    offsets = merged - median_pt if center else merged
+    scale = torch.nanmedian(offsets.norm(dim=-1), dim=1).values
+    return median_pt[:, None, :, :], scale[:, None, None, None]
+
+
+@torch.no_grad()
+def get_group_pointcloud_center_scale(pts, valid_masks=None, z_only=False, center=True):
+    """Compute a robust center and scale of all pointmaps in `pts` pooled together.
+
+    Args:
+        pts: sequence of pointmaps, each reshaped to (len, -1, 3).
+        valid_masks: optional sequence of validity masks (one per pointmap);
+            None means that no mask is applied.
+        z_only: force the X and Y components of the center to 0.
+        center: measure the scale relative to the center instead of the origin.
+
+    Returns:
+        (center, scale) indexed to 4 dims: center[:, None, :, :] and
+        scale[:, None, None, None].
+    """
+    # The default used to crash inside zip(); treat it as "no mask anywhere".
+    if valid_masks is None:
+        valid_masks = [None] * len(pts)
+
+    _pts = [
+        invalid_to_nans(pts1, valid_mask1).reshape(len(pts1), -1, 3)
+        for pts1, valid_mask1 in zip(pts, valid_masks)
+    ]
+    _pts = torch.cat(_pts, dim=1)
+
+    _center = torch.nanmedian(_pts, dim=1, keepdim=True).values  # (B,1,3)
+    if z_only:
+        _center[..., :2] = 0  # do not center X and Y
+
+    _norm = ((_pts - _center) if center else _pts).norm(dim=-1)
+    scale = torch.nanmedian(_norm, dim=1).values
+    return _center[:, None, :, :], scale[:, None, None, None]
+
+
+def find_reciprocal_matches(P1, P2):
+    """
+    Find mutual nearest neighbours between the point sets P1 and P2.
+
+    returns 3 values:
+    1 - reciprocal_in_P2: a boolean array of size P2.shape[0], a "True" value indicates a match
+    2 - nn2_in_P1: a int array of size P2.shape[0], it contains the indexes of the closest points in P1
+    3 - reciprocal_in_P2.sum(): the number of matches
+    """
+    # Nearest neighbour of every P1 point in P2, and the other way around.
+    _, nn1_in_P2 = KDTree(P2).query(P1, workers=8)
+    _, nn2_in_P1 = KDTree(P1).query(P2, workers=8)
+
+    # A point is a reciprocal match when its neighbour's neighbour is itself.
+    own_index_P1 = np.arange(len(nn1_in_P2))
+    own_index_P2 = np.arange(len(nn2_in_P1))
+    reciprocal_in_P1 = nn2_in_P1[nn1_in_P2] == own_index_P1
+    reciprocal_in_P2 = nn1_in_P2[nn2_in_P1] == own_index_P2
+
+    num_matches = reciprocal_in_P2.sum()
+    assert reciprocal_in_P1.sum() == num_matches
+    return reciprocal_in_P2, nn2_in_P1, num_matches
+
+
+def get_med_dist_between_poses(poses):
+    """Return the median pairwise distance between the translations p[:3, 3] of `poses`."""
+    from scipy.spatial.distance import pdist
+
+    translations = [to_numpy(pose[:3, 3]) for pose in poses]
+    pairwise = pdist(translations)
+    return np.median(pairwise)
+
+
+def weighted_procrustes(A, B, w, use_weights=True, eps=1e-16, return_T=False):
+    """Estimate the rigid transform (R, t) such that B ~ R @ A + t, per batch item.
+
+    Args:
+        A: torch tensor B x N x 3 (source points)
+        B: torch tensor B x N x 3 (target points)
+        w: torch tensor B x N (per-point weights, only read when use_weights)
+        use_weights: weight the centroids and the covariance by `w`.
+        eps: added to the sum of |w| to avoid a division by zero.
+        return_T: return B x 4 x 4 homogeneous matrices instead of (R, t).
+
+    Returns:
+        T (B x 4 x 4) when return_T is True, otherwise (R, t) with R of shape
+        B x 3 x 3 and t squeezed of all its singleton dims.
+    """
+    assert len(A) == len(B)
+    if use_weights:
+        W1 = torch.abs(w).sum(1, keepdim=True)
+        w_norm = (w / (W1 + eps)).unsqueeze(-1)
+        a_mean = (w_norm * A).sum(dim=1, keepdim=True)
+        b_mean = (w_norm * B).sum(dim=1, keepdim=True)
+
+        A_c = A - a_mean
+        B_c = B - b_mean
+
+        H = torch.einsum("bni,bnj->bij", A_c, w_norm * B_c)
+
+    else:
+        a_mean = A.mean(axis=1, keepdim=True)
+        b_mean = B.mean(axis=1, keepdim=True)
+
+        A_c = A - a_mean
+        B_c = B - b_mean
+
+        H = torch.einsum("bij,bik->bjk", A_c, B_c)
+
+    U, S, V = torch.svd(H)  # U: B x 3 x 3, S: B x 3, V: B x 3 x 3
+    # Build the helper matrices in the dtype of H: a float32 identity cannot be
+    # matrix-multiplied with float64 factors.
+    Z = torch.eye(3, dtype=H.dtype, device=A.device).unsqueeze(0).repeat(A.shape[0], 1, 1)
+    # Flip the last axis when needed so that R is a proper rotation (det = +1).
+    Z[:, -1, -1] = torch.sign(torch.linalg.det(U @ V.transpose(1, 2)))  # B x 3 x 3
+    R = V @ Z @ U.transpose(1, 2)  # B x 3 x 3
+    t = b_mean - torch.einsum("bij,bjk->bik", R, a_mean.transpose(-2, -1)).transpose(
+        -2, -1
+    )
+    # NOTE(review): squeeze() without an axis also drops the batch axis when the
+    # batch size is 1; kept as-is because callers may rely on it.
+    if return_T:
+        T = torch.eye(4, dtype=R.dtype, device=A.device).unsqueeze(0).repeat(A.shape[0], 1, 1)
+        T[:, :3, :3] = R
+        T[:, :3, 3] = t.squeeze()
+        return T
+    return R, t.squeeze()
diff --git a/extern/CUT3R/src/dust3r/utils/image.py b/extern/CUT3R/src/dust3r/utils/image.py
new file mode 100755
index 0000000000000000000000000000000000000000..8eb0f88778f290317d2c08555573d0bf675df25c
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/utils/image.py
@@ -0,0 +1,263 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# modified from DUSt3R
+
+import os
+import torch
+import numpy as np
+import PIL.Image
+from PIL.ImageOps import exif_transpose
+import torchvision.transforms as tvf
+
+# Turn on OpenCV's OpenEXR support. The variable is set before cv2 is imported,
+# which is why that import sits below the others (hence the `noqa`).
+os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
+import cv2  # noqa
+
+# Optional HEIF/HEIC support: register the Pillow opener when pillow_heif is
+# installed, and record the outcome so that the loaders can extend their list
+# of accepted extensions.
+try:
+    from pillow_heif import register_heif_opener  # noqa
+
+    register_heif_opener()
+    heif_support_enabled = True
+except ImportError:
+    heif_support_enabled = False
+
+# Image -> tensor conversion followed by a per-channel (x - 0.5) / 0.5 normalization.
+ImgNorm = tvf.Compose([tvf.ToTensor(), tvf.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
+
+
+def img_to_arr(img):
+    """Return `img` unchanged, unless it is a str, in which case it is read with `imread_cv2`."""
+    if not isinstance(img, str):
+        return img
+    return imread_cv2(img)
+
+
+def imread_cv2(path, options=cv2.IMREAD_COLOR):
+    """Open an image or a depthmap with opencv-python.
+
+    Args:
+        path: file to read. Files with an ".exr" extension (in any letter
+            case) are always read with cv2.IMREAD_ANYDEPTH.
+        options: cv2.imread flags used for every other file.
+
+    Returns:
+        The decoded array; 3-dimensional results are converted from BGR to RGB.
+
+    Raises:
+        IOError: if OpenCV could not load the file.
+    """
+    # Case-insensitive extension test (the previous check mixed ".exr" with a
+    # dot-less "EXR" suffix and missed mixed-case spellings).
+    if path.lower().endswith(".exr"):
+        options = cv2.IMREAD_ANYDEPTH
+    img = cv2.imread(path, options)
+    if img is None:
+        raise IOError(f"Could not load image={path} with {options=}")
+    if img.ndim == 3:
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+    return img
+
+
+def rgb(ftensor, true_shape=None):
+    """Convert an image (or a list of images) to values clipped to [0, 1].
+
+    - lists are processed element by element;
+    - torch tensors are detached and converted to numpy;
+    - a 3-dim input whose first dim is 3, or a 4-dim input whose second dim is
+      3, is transposed so that this dim comes last;
+    - when true_shape=(H, W) is given, the result is sliced with [:H, :W];
+    - uint8 data is divided by 255, anything else is mapped with x * 0.5 + 0.5.
+    """
+    if isinstance(ftensor, list):
+        return [rgb(item, true_shape=true_shape) for item in ftensor]
+
+    arr = ftensor
+    if isinstance(arr, torch.Tensor):
+        arr = arr.detach().cpu().numpy()  # H,W,3
+
+    # Move a leading channel dim to the end.
+    if arr.ndim == 3 and arr.shape[0] == 3:
+        arr = arr.transpose(1, 2, 0)
+    elif arr.ndim == 4 and arr.shape[1] == 3:
+        arr = arr.transpose(0, 2, 3, 1)
+
+    if true_shape is not None:
+        height, width = true_shape
+        arr = arr[:height, :width]
+
+    scaled = np.float32(arr) / 255 if arr.dtype == np.uint8 else (arr * 0.5) + 0.5
+    return scaled.clip(min=0, max=1)
+
+
+def _resize_pil_image(img, long_edge_size):
+    """Resize `img` so that its longest side becomes `long_edge_size` (aspect ratio kept, sizes rounded).
+
+    LANCZOS resampling is used when the image shrinks, BICUBIC otherwise.
+    """
+    longest = max(img.size)
+    resample = PIL.Image.LANCZOS if longest > long_edge_size else PIL.Image.BICUBIC
+    target = tuple(int(round(side * long_edge_size / longest)) for side in img.size)
+    return img.resize(target, resample)
+
+
+def load_images(folder_or_list, size, square_ok=False, verbose=True):
+    """Open and convert all images in a list or folder to proper input format for DUSt3R.
+
+    Args:
+        folder_or_list: a directory path (its sorted listing is used) or a
+            list of image paths. Entries without a supported image extension
+            are skipped.
+        size: target size. 224 selects the square path (short side resized to
+            224, then center crop); any other value resizes the long side to
+            `size` and center-crops to sides that are multiples of 16.
+        square_ok: when False, square images on the non-224 path are cropped
+            to a 4:3 aspect ratio.
+        verbose: print progress messages.
+
+    Returns:
+        A list of dicts with the keys "img" (ImgNorm output with a leading
+        batch axis), "true_shape" (int32 array [[H, W]]), "idx" and "instance".
+
+    Raises:
+        ValueError: if folder_or_list is neither a str nor a list.
+        AssertionError: if no supported image was found.
+    """
+    if isinstance(folder_or_list, str):
+        if verbose:
+            print(f">> Loading images from {folder_or_list}")
+        root, folder_content = folder_or_list, sorted(os.listdir(folder_or_list))
+
+    elif isinstance(folder_or_list, list):
+        if verbose:
+            print(f">> Loading a list of {len(folder_or_list)} images")
+        root, folder_content = "", folder_or_list
+
+    else:
+        raise ValueError(f"bad {folder_or_list=} ({type(folder_or_list)})")
+
+    supported_images_extensions = [".jpg", ".jpeg", ".png", ".bmp"]
+    if heif_support_enabled:
+        supported_images_extensions += [".heic", ".heif"]
+    supported_images_extensions = tuple(supported_images_extensions)
+
+    imgs = []
+    for path in folder_content:
+        if not path.lower().endswith(supported_images_extensions):
+            continue
+        # Decode inside a context manager so that the file handle is released
+        # right away instead of lingering until garbage collection.
+        with PIL.Image.open(os.path.join(root, path)) as raw:
+            img = exif_transpose(raw).convert("RGB")
+        W1, H1 = img.size
+        if size == 224:
+            # resize short side to 224 (then crop)
+            img = _resize_pil_image(img, round(size * max(W1 / H1, H1 / W1)))
+        else:
+            # resize long side to `size`
+            img = _resize_pil_image(img, size)
+        W, H = img.size
+        cx, cy = W // 2, H // 2
+        if size == 224:
+            half = min(cx, cy)
+            img = img.crop((cx - half, cy - half, cx + half, cy + half))
+        else:
+            # Half extents are multiples of 8, i.e. crop sides are multiples of 16.
+            halfw, halfh = ((2 * cx) // 16) * 8, ((2 * cy) // 16) * 8
+            if not (square_ok) and W == H:
+                # halfw is a multiple of 8, so this division is exact; staying
+                # in integers avoids handing float coordinates to PIL.
+                halfh = 3 * halfw // 4
+            img = img.crop((cx - halfw, cy - halfh, cx + halfw, cy + halfh))
+
+        W2, H2 = img.size
+        if verbose:
+            print(f" - adding {path} with resolution {W1}x{H1} --> {W2}x{H2}")
+        imgs.append(
+            dict(
+                img=ImgNorm(img)[None],
+                true_shape=np.int32([img.size[::-1]]),
+                idx=len(imgs),
+                instance=str(len(imgs)),
+            )
+        )
+
+    assert imgs, "no images foud at " + root
+    if verbose:
+        print(f" (Found {len(imgs)} images)")
+    return imgs
+
+
+def load_images_for_eval(
+    folder_or_list, size, square_ok=False, verbose=True, crop=True
+):
+    """Open and convert all images in a list or folder to proper input format for DUSt3R.
+
+    Args:
+        folder_or_list: a directory path (its sorted listing is used) or a
+            list of image paths. Entries without a supported image extension
+            are skipped.
+        size: target size. 224 selects the square path (short side resized to
+            224); any other value resizes the long side to `size` and targets
+            sides that are multiples of 16.
+        square_ok: when False, square images on the non-224 path are brought
+            to a 4:3 aspect ratio.
+        verbose: print progress messages.
+        crop: when True the target size is reached with a center crop,
+            otherwise the image is resized to it.
+
+    Returns:
+        A list of dicts with the keys "img" (ImgNorm output with a leading
+        batch axis), "true_shape" (int32 array [[H, W]]), "idx" and "instance".
+
+    Raises:
+        ValueError: if folder_or_list is neither a str nor a list.
+        AssertionError: if no supported image was found.
+    """
+    if isinstance(folder_or_list, str):
+        if verbose:
+            print(f">> Loading images from {folder_or_list}")
+        root, folder_content = folder_or_list, sorted(os.listdir(folder_or_list))
+
+    elif isinstance(folder_or_list, list):
+        if verbose:
+            print(f">> Loading a list of {len(folder_or_list)} images")
+        root, folder_content = "", folder_or_list
+
+    else:
+        raise ValueError(f"bad {folder_or_list=} ({type(folder_or_list)})")
+
+    supported_images_extensions = [".jpg", ".jpeg", ".png"]
+    if heif_support_enabled:
+        supported_images_extensions += [".heic", ".heif"]
+    supported_images_extensions = tuple(supported_images_extensions)
+
+    imgs = []
+    for path in folder_content:
+        if not path.lower().endswith(supported_images_extensions):
+            continue
+        # Decode inside a context manager so that the file handle is released
+        # right away instead of lingering until garbage collection.
+        with PIL.Image.open(os.path.join(root, path)) as raw:
+            img = exif_transpose(raw).convert("RGB")
+        W1, H1 = img.size
+        if size == 224:
+            # resize short side to 224 (then crop)
+            img = _resize_pil_image(img, round(size * max(W1 / H1, H1 / W1)))
+        else:
+            # resize long side to 512
+            img = _resize_pil_image(img, size)
+        W, H = img.size
+        cx, cy = W // 2, H // 2
+        if size == 224:
+            half = min(cx, cy)
+            if crop:
+                img = img.crop((cx - half, cy - half, cx + half, cy + half))
+            else:  # resize
+                img = img.resize((2 * half, 2 * half), PIL.Image.LANCZOS)
+        else:
+            # Half extents are multiples of 8, i.e. target sides are multiples of 16.
+            halfw, halfh = ((2 * cx) // 16) * 8, ((2 * cy) // 16) * 8
+            if not (square_ok) and W == H:
+                # halfw is a multiple of 8, so this division is exact. It must
+                # stay an int: the resize below rejects a float size.
+                halfh = 3 * halfw // 4
+            if crop:
+                img = img.crop((cx - halfw, cy - halfh, cx + halfw, cy + halfh))
+            else:  # resize
+                img = img.resize((2 * halfw, 2 * halfh), PIL.Image.LANCZOS)
+        W2, H2 = img.size
+        if verbose:
+            print(f" - adding {path} with resolution {W1}x{H1} --> {W2}x{H2}")
+        imgs.append(
+            dict(
+                img=ImgNorm(img)[None],
+                true_shape=np.int32([img.size[::-1]]),
+                idx=len(imgs),
+                instance=str(len(imgs)),
+            )
+        )
+
+    assert imgs, "no images foud at " + root
+    if verbose:
+        print(f" (Found {len(imgs)} images)")
+    return imgs
+
+
+def load_images_512(folder_or_list, size, square_ok=False, verbose=True):
+    """Open and convert all images in a list or folder to proper input format for DUSt3R.
+
+    Same pipeline as `load_images`, except that every image is first resized
+    to 512x384 (aspect ratio not preserved) before the size-dependent
+    resize/crop; the "original" resolution that gets printed is therefore
+    always 512x384.
+
+    Args:
+        folder_or_list: a directory path (its sorted listing is used) or a
+            list of image paths. Entries without a supported image extension
+            are skipped.
+        size: target size. 224 selects the square path (short side resized to
+            224, then center crop); any other value resizes the long side to
+            `size` and center-crops to sides that are multiples of 16.
+        square_ok: when False, square images on the non-224 path are cropped
+            to a 4:3 aspect ratio.
+        verbose: print progress messages.
+
+    Returns:
+        A list of dicts with the keys "img" (ImgNorm output with a leading
+        batch axis), "true_shape" (int32 array [[H, W]]), "idx" and "instance".
+
+    Raises:
+        ValueError: if folder_or_list is neither a str nor a list.
+        AssertionError: if no supported image was found.
+    """
+    if isinstance(folder_or_list, str):
+        if verbose:
+            print(f">> Loading images from {folder_or_list}")
+        root, folder_content = folder_or_list, sorted(os.listdir(folder_or_list))
+
+    elif isinstance(folder_or_list, list):
+        if verbose:
+            print(f">> Loading a list of {len(folder_or_list)} images")
+        root, folder_content = "", folder_or_list
+
+    else:
+        raise ValueError(f"bad {folder_or_list=} ({type(folder_or_list)})")
+
+    supported_images_extensions = [".jpg", ".jpeg", ".png", ".bmp"]
+    if heif_support_enabled:
+        supported_images_extensions += [".heic", ".heif"]
+    supported_images_extensions = tuple(supported_images_extensions)
+
+    imgs = []
+    for path in folder_content:
+        if not path.lower().endswith(supported_images_extensions):
+            continue
+        # Decode inside a context manager so that the file handle is released
+        # right away instead of lingering until garbage collection.
+        with PIL.Image.open(os.path.join(root, path)) as raw:
+            img = exif_transpose(raw).convert("RGB")
+        img = img.resize((512, 384))
+        W1, H1 = img.size
+        if size == 224:
+            # resize short side to 224 (then crop)
+            img = _resize_pil_image(img, round(size * max(W1 / H1, H1 / W1)))
+        else:
+            # resize long side to `size`
+            img = _resize_pil_image(img, size)
+        W, H = img.size
+        cx, cy = W // 2, H // 2
+        if size == 224:
+            half = min(cx, cy)
+            img = img.crop((cx - half, cy - half, cx + half, cy + half))
+        else:
+            # Half extents are multiples of 8, i.e. crop sides are multiples of 16.
+            halfw, halfh = ((2 * cx) // 16) * 8, ((2 * cy) // 16) * 8
+            if not (square_ok) and W == H:
+                # halfw is a multiple of 8, so this division is exact; staying
+                # in integers avoids handing float coordinates to PIL.
+                halfh = 3 * halfw // 4
+            img = img.crop((cx - halfw, cy - halfh, cx + halfw, cy + halfh))
+
+        W2, H2 = img.size
+        if verbose:
+            print(f" - adding {path} with resolution {W1}x{H1} --> {W2}x{H2}")
+        imgs.append(
+            dict(
+                img=ImgNorm(img)[None],
+                true_shape=np.int32([img.size[::-1]]),
+                idx=len(imgs),
+                instance=str(len(imgs)),
+            )
+        )
+
+    assert imgs, "no images foud at " + root
+    if verbose:
+        print(f" (Found {len(imgs)} images)")
+    return imgs
diff --git a/extern/CUT3R/src/dust3r/utils/misc.py b/extern/CUT3R/src/dust3r/utils/misc.py
new file mode 100755
index 0000000000000000000000000000000000000000..fbb3f225ba3b0a007541eb81362cd58e1c54d916
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/utils/misc.py
@@ -0,0 +1,127 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# modified from DUSt3R
+
+import torch
+
+
+def fill_default_args(kwargs, func):
+    """Add to `kwargs` (in place) the default value of every defaulted parameter of `func` that is not already a key.
+
+    Returns the same `kwargs` dict.
+    """
+    import inspect  # a bit hacky but it works reliably
+
+    no_default = inspect.Parameter.empty
+    for name, param in inspect.signature(func).parameters.items():
+        if param.default is not no_default:
+            kwargs.setdefault(name, param.default)
+
+    return kwargs
+
+
+def freeze_all_params(modules):
+    """Disable gradients for every item of `modules`.
+
+    Items exposing `named_parameters()` get `requires_grad = False` on each
+    parameter; when an AttributeError is raised instead, the flag is set on
+    the item itself.
+    """
+    for item in modules:
+        try:
+            for _, parameter in item.named_parameters():
+                parameter.requires_grad = False
+        except AttributeError:
+            # No named_parameters(): treat the item as a bare parameter/tensor.
+            item.requires_grad = False
+
+
+def is_symmetrized(gt1, gt2):
+    """Tell whether the "instance" lists of gt1 and gt2 are pairwise swapped.
+
+    Entries are inspected two at a time: for every even index i, gt1[i] must
+    equal gt2[i + 1] and gt1[i + 1] must equal gt2[i]. Two single-element
+    lists are never considered symmetrized.
+    """
+    first = gt1["instance"]
+    second = gt2["instance"]
+    if len(first) == len(second) and len(first) == 1:
+        return False  # special case of batchsize 1
+    for i in range(0, len(first), 2):
+        if not ((first[i] == second[i + 1]) and (first[i + 1] == second[i])):
+            return False
+    return True
+
+
+def flip(tensor):
+    """Swap neighbouring entries along dim 0, so that tensor[0::2] <=> tensor[1::2]."""
+    odd_items = tensor[1::2]
+    even_items = tensor[0::2]
+    paired = torch.stack((odd_items, even_items), dim=1)
+    return paired.flatten(0, 1)
+
+
+def interleave(tensor1, tensor2):
+    """Interleave two tensors along dim 0, in both orders.
+
+    Returns (a, b) where a alternates tensor1[0], tensor2[0], tensor1[1], ...
+    and b alternates tensor2[0], tensor1[0], tensor2[1], ...
+    """
+
+    def _weave(leading, trailing):
+        return torch.stack((leading, trailing), dim=1).flatten(0, 1)
+
+    return _weave(tensor1, tensor2), _weave(tensor2, tensor1)
+
+
+def transpose_to_landscape(head, activate=True):
+    """Predict in the correct aspect-ratio,
+    then transpose the result in landscape
+    and stack everything back together.
+
+    Args:
+        head: callable invoked as head(decout, (H, W), **kwargs) and returning
+            a dict of tensors.
+        activate: when False, the returned wrapper simply forwards to `head`
+            after checking that all true shapes are identical.
+
+    Returns:
+        A wrapper with the signature (decout, true_shape, **kwargs).
+    """
+
+    def wrapper_no(decout, true_shape, **kwargs):
+        assert true_shape[0:1].allclose(true_shape), "true_shape must be all identical"
+        H, W = true_shape[0].cpu().tolist()
+        res = head(decout, (H, W), **kwargs)
+        return res
+
+    def wrapper_yes(decout, true_shape, **kwargs):
+        B = len(true_shape)
+
+        # Short side / long side over the whole batch.
+        H, W = int(true_shape.min()), int(true_shape.max())
+
+        height, width = true_shape.T
+        is_landscape = width >= height
+        is_portrait = ~is_landscape
+
+        # Homogeneous batches need a single call to the head.
+        if is_landscape.all():
+            return head(decout, (H, W), **kwargs)
+        if is_portrait.all():
+            return transposed(head(decout, (W, H), **kwargs))
+
+        def selout(ar):
+            return [d[ar] for d in decout]
+
+        # Always build both kwargs sets; they used to be defined only when
+        # "pos" was present, leaving the names unbound otherwise.
+        kwargs_landscape = kwargs.copy()
+        kwargs_portrait = kwargs.copy()
+        if "pos" in kwargs:
+            kwargs_landscape["pos"] = kwargs["pos"][is_landscape]
+            kwargs_portrait["pos"] = kwargs["pos"][is_portrait]
+        l_result = head(selout(is_landscape), (H, W), **kwargs_landscape)
+        p_result = transposed(head(selout(is_portrait), (W, H), **kwargs_portrait))
+
+        # Scatter both partial results back to their position in the batch.
+        result = {}
+        for k in l_result | p_result:
+            x = l_result[k].new(B, *l_result[k].shape[1:])
+            x[is_landscape] = l_result[k]
+            x[is_portrait] = p_result[k]
+            result[k] = x
+
+        return result
+
+    return wrapper_yes if activate else wrapper_no
+
+
+def transposed(dic):
+    """Return a new dict in which every value with more than 2 dims has its axes 1 and 2 swapped."""
+    swapped = {}
+    for key, value in dic.items():
+        swapped[key] = value.swapaxes(1, 2) if value.ndim > 2 else value
+    return swapped
+
+
+def invalid_to_nans(arr, valid_mask, ndim=999):
+    """Return `arr` with the entries outside `valid_mask` set to NaN.
+
+    The input is cloned before being modified; without a mask it is used
+    as-is. When the result has more than `ndim` dims, the surplus dims located
+    just before the last one are flattened together.
+    """
+    if valid_mask is not None:
+        arr = arr.clone()
+        arr[~valid_mask] = float("nan")
+    surplus = arr.ndim - ndim
+    if surplus > 0:
+        arr = arr.flatten(-2 - surplus, -2)
+    return arr
+
+
+def invalid_to_zeros(arr, valid_mask, ndim=999):
+    """Return (`arr` with the entries outside `valid_mask` set to 0, count).
+
+    With a mask, the input is cloned and `count` is the number of True mask
+    entries per batch item. Without a mask, `count` is arr.numel() // len(arr)
+    (0 for an empty input).
+    NOTE(review): that fallback also counts the last dim of `arr`; confirm it
+    is the intended "number of points" for (..., 3) pointmaps.
+    When the result has more than `ndim` dims, the surplus dims located just
+    before the last one are flattened together.
+    """
+    if valid_mask is None:
+        nnz = arr.numel() // len(arr) if len(arr) else 0  # number of point per image
+    else:
+        arr = arr.clone()
+        arr[~valid_mask] = 0
+        nnz = valid_mask.view(len(valid_mask), -1).sum(1)
+    surplus = arr.ndim - ndim
+    if surplus > 0:
+        arr = arr.flatten(-2 - surplus, -2)
+    return arr, nnz
diff --git a/extern/CUT3R/src/dust3r/utils/parallel.py b/extern/CUT3R/src/dust3r/utils/parallel.py
new file mode 100755
index 0000000000000000000000000000000000000000..5082a85b8c66cdcddc7402c401c0c983c5f1078b
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/utils/parallel.py
@@ -0,0 +1,87 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# modified from DUSt3R
+
+from tqdm import tqdm
+from multiprocessing.dummy import Pool as ThreadPool
+from multiprocessing import cpu_count
+
+
+def parallel_threads(
+    function,
+    args,
+    workers=0,
+    star_args=False,
+    kw_args=False,
+    front_num=1,
+    Pool=ThreadPool,
+    **tqdm_kw
+):
+    """Map ``function`` over ``args`` with a worker pool and a tqdm progress bar.
+
+    The result is equivalent to
+      res = [ function(arg) # default
+              function(*arg) # if star_args is True
+              function(**arg) # if kw_args is True
+              for arg in args]
+
+    Note:
+        the first <front_num> elements of args are evaluated serially in the
+        calling thread (handy for debugging); with a single worker everything
+        is evaluated that way and no pool is created.
+    """
+    # a non-positive worker count is shifted up by the number of CPUs
+    while workers <= 0:
+        workers += cpu_count()
+    if workers == 1:
+        front_num = float("inf")
+
+    try:
+        n_args_parallel = len(args) - front_num
+    except TypeError:
+        # args has no len(): tqdm will run without a total
+        n_args_parallel = None
+    args = iter(args)
+
+    def call_serially(a):
+        if star_args:
+            return function(*a)
+        if kw_args:
+            return function(**a)
+        return function(a)
+
+    exhausted = object()
+    front = []
+    while len(front) < front_num:
+        a = next(args, exhausted)
+        if a is exhausted:
+            return front  # end of the iterable
+        front.append(call_serially(a))
+
+    with Pool(workers) as pool:
+        if star_args:
+            futures = pool.imap(starcall, [(function, a) for a in args])
+        elif kw_args:
+            futures = pool.imap(starstarcall, [(function, a) for a in args])
+        else:
+            futures = pool.imap(function, args)
+
+        out = list(tqdm(futures, total=n_args_parallel, **tqdm_kw))
+    return front + out
+
+
+def parallel_processes(*args, **kwargs):
+    """Same as parallel_threads, but the pool is always ``multiprocessing.Pool``.
+
+    Any ``Pool`` keyword supplied by the caller is overridden.
+    """
+    from multiprocessing import Pool as ProcessPool
+
+    kwargs["Pool"] = ProcessPool
+    return parallel_threads(*args, **kwargs)
+
+
+def starcall(args):
+    """Pool helper: unpack a ``(function, positional_args)`` pair and call it."""
+    fn, positional = args
+    return fn(*positional)
+
+
+def starstarcall(args):
+    """Pool helper: unpack a ``(function, keyword_args)`` pair and call it."""
+    fn, keywords = args
+    return fn(**keywords)
diff --git a/extern/CUT3R/src/dust3r/utils/path_to_croco.py b/extern/CUT3R/src/dust3r/utils/path_to_croco.py
new file mode 100755
index 0000000000000000000000000000000000000000..7e7ce2d9ffbe8a89a0ddc81e0c1c81f571608fc9
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/utils/path_to_croco.py
@@ -0,0 +1,21 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# modified from DUSt3R
+
+import sys
+import os.path as path
+
+# Locate the croco checkout relative to this file and make it importable.
+HERE_PATH = path.normpath(path.dirname(__file__))
+CROCO_REPO_PATH = path.normpath(path.join(HERE_PATH, "../../croco"))
+CROCO_MODELS_PATH = path.join(CROCO_REPO_PATH, "models")
+
+# Fail at import time when the croco "models" directory is missing.
+if not path.isdir(CROCO_MODELS_PATH):
+    raise ImportError(
+        f"croco is not initialized, could not find: {CROCO_MODELS_PATH}.\n "
+        "Did you forget to run 'git submodule update --init --recursive' ?"
+    )
+
+# Put the croco repository first on the module search path.
+sys.path.insert(0, CROCO_REPO_PATH)
diff --git a/extern/CUT3R/src/dust3r/utils/render.py b/extern/CUT3R/src/dust3r/utils/render.py
new file mode 100644
index 0000000000000000000000000000000000000000..95caf27b5c5d58287f391f1e901e816ff20939e7
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/utils/render.py
@@ -0,0 +1,75 @@
+import torch
+from gsplat import rasterization
+from dust3r.utils.geometry import inv, geotrf
+
+
+def render(
+    intrinsics: torch.Tensor,
+    pts3d: torch.Tensor,
+    rgbs: torch.Tensor | None = None,
+    scale: float = 0.002,
+    opacity: float = 0.95,
+):
+    """Rasterize per-pixel 3D points as small Gaussians with gsplat.
+
+    Every view of the batch is rendered separately through an identity view
+    matrix, so ``pts3d`` must already be expressed in the frame of the camera
+    it is rendered from.
+
+    Args:
+        intrinsics: one camera matrix per view; ``len(intrinsics)`` is the
+            batch size.
+        pts3d: point maps; dims 1 and 2 give the render height and width and
+            the whole tensor is reshaped to (batch, -1, 3).
+        rgbs: optional per-point colors with 3 channels on dim 1 (asserted);
+            all-ones colors are used when omitted.
+        scale: extent given to every Gaussian along all three axes.
+        opacity: opacity given to every Gaussian.
+
+    Returns:
+        ``(rendered_rgbs, rendered_depths, accs)``: ``rendered_rgbs`` and
+        ``accs`` are lists holding one entry per view (the first three
+        channels of the "RGB+D" render and the second value returned by the
+        rasterizer), ``rendered_depths`` is the last render channel of every
+        view concatenated along dim 0.
+    """
+
+    device = pts3d.device
+    batch_size = len(intrinsics)
+    img_size = pts3d.shape[1:3]
+    pts3d = pts3d.reshape(batch_size, -1, 3)
+    num_pts = pts3d.shape[1]
+    # The scales below are identical on the three axes, so the orientation of
+    # a Gaussian does not change its shape: any unit quaternion will do.
+    quats = torch.randn((num_pts, 4), device=device)
+    quats = quats / quats.norm(dim=-1, keepdim=True)
+    scales = scale * torch.ones((num_pts, 3), device=device)
+    opacities = opacity * torch.ones((num_pts), device=device)
+    if rgbs is not None:
+        assert rgbs.shape[1] == 3
+        rgbs = rgbs.reshape(batch_size, 3, -1).transpose(1, 2)
+    else:
+        rgbs = torch.ones_like(pts3d[:, :, :3])
+
+    rendered_rgbs = []
+    rendered_depths = []
+    accs = []
+    for i in range(batch_size):
+        rgbd, acc, _ = rasterization(
+            pts3d[i],
+            quats,
+            scales,
+            opacities,
+            rgbs[i],
+            torch.eye(4, device=device)[None],
+            intrinsics[[i]],
+            width=img_size[1],
+            height=img_size[0],
+            packed=False,
+            render_mode="RGB+D",
+        )
+
+        # These two lists used to stay empty although they are returned.
+        rendered_rgbs.append(rgbd[..., :3])
+        rendered_depths.append(rgbd[..., 3])
+        accs.append(acc)
+
+    rendered_depths = torch.cat(rendered_depths, dim=0)
+
+    return rendered_rgbs, rendered_depths, accs
+
+
+def get_render_results(gts, preds, self_view=False):
+    """Render predicted and ground-truth depth for every (gt, pred) pair.
+
+    With ``self_view`` each pair uses its own camera pose / intrinsics and the
+    prediction's ``"pts3d_in_self_view"``; otherwise the first ground-truth
+    view supplies the camera and ``"pts3d_in_other_view"`` is rendered.
+    Ground-truth points are moved into the chosen camera frame with the
+    inverse of its pose before rendering. Runs under ``torch.no_grad()``.
+
+    Returns:
+        ``(depths, gt_depths)``: two lists with one rendered depth per pair.
+    """
+    device = preds[0]["pts3d_in_self_view"].device
+    pts_key = "pts3d_in_self_view" if self_view else "pts3d_in_other_view"
+    depths, gt_depths = [], []
+    with torch.no_grad():
+        for gt, pred in zip(gts, preds):
+            reference = gt if self_view else gts[0]
+            camera = inv(reference["camera_pose"]).to(device)
+            intrinsics = reference["camera_intrinsics"].to(device)
+            pred_pts = pred[pts_key]
+            gt_img = gt["img"].to(device)
+            gt_pts3d = gt["pts3d"].to(device)
+
+            depth = render(intrinsics, pred_pts, gt_img)[1]
+            gt_depth = render(intrinsics, geotrf(camera, gt_pts3d), gt_img)[1]
+            depths.append(depth)
+            gt_depths.append(gt_depth)
+    return depths, gt_depths
diff --git a/extern/CUT3R/src/dust3r/viz.py b/extern/CUT3R/src/dust3r/viz.py
new file mode 100755
index 0000000000000000000000000000000000000000..f25aa80cca6226d34d9f6002bc927115d0e608ed
--- /dev/null
+++ b/extern/CUT3R/src/dust3r/viz.py
@@ -0,0 +1,1089 @@
+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# modified from DUSt3R
+
+import PIL.Image
+import numpy as np
+from scipy.spatial.transform import Rotation
+import torch
+import cv2
+import matplotlib as mpl
+import matplotlib.cm as cm
+import matplotlib.pyplot as plt
+from dust3r.utils.geometry import (
+    geotrf,
+    get_med_dist_between_poses,
+    depthmap_to_absolute_camera_coordinates,
+)
+from dust3r.utils.device import to_numpy
+from dust3r.utils.image import rgb, img_to_arr
+from matplotlib.backends.backend_agg import FigureCanvasAgg
+from matplotlib.figure import Figure
+
+try:
+    import trimesh
+except ImportError:
+    print("/!\\ module trimesh is not installed, cannot visualize results /!\\")
+
+
+def float2uint8(x):
+    """Scale ``x`` by 255 and cast it to ``np.uint8`` (no clipping is applied)."""
+    scaled = 255.0 * x
+    return scaled.astype(np.uint8)
+
+
+def uint82float(img):
+    """Return ``img`` as a contiguous array divided by 255."""
+    contiguous = np.ascontiguousarray(img)
+    return contiguous / 255.0
+
+
+def cat_3d(vecs):
+    """Concatenate one array/tensor, or a sequence of them, into one (N, 3) array.
+
+    A single ``np.ndarray`` / ``torch.Tensor`` is wrapped in a list first;
+    every item is converted with ``to_numpy`` and reshaped to (-1, 3).
+    """
+    is_single = isinstance(vecs, (np.ndarray, torch.Tensor))
+    items = [vecs] if is_single else vecs
+    flattened = [item.reshape(-1, 3) for item in to_numpy(items)]
+    return np.concatenate(flattened)
+
+
+def show_raw_pointcloud(pts3d, colors, point_size=2):
+    """Open a trimesh viewer on a single colored point cloud.
+
+    ``pts3d`` and ``colors`` are both flattened to (N, 3) with ``cat_3d``.
+    """
+    cloud = trimesh.PointCloud(cat_3d(pts3d), colors=cat_3d(colors))
+
+    scene = trimesh.Scene()
+    scene.add_geometry(cloud)
+    scene.show(line_settings={"point_size": point_size})
+
+
+def pts3d_to_trimesh(img, pts3d, valid=None):
+    """Triangulate an (H, W, 3) point map into a colored, double-sided mesh.
+
+    Each 2x2 pixel cell yields two triangles, and every triangle is emitted a
+    second time with reversed winding (cheap way to cancel face culling).
+    Faces of the first triangle are colored by the cell's top-left pixel, those
+    of the second by its bottom-right pixel. When ``valid`` (H, W) is given,
+    faces touching an invalid pixel are dropped.
+
+    Returns:
+        dict with ``vertices`` (H*W, 3), ``face_colors`` and ``faces``.
+    """
+    H, W, THREE = img.shape
+    assert THREE == 3
+    assert img.shape == pts3d.shape
+
+    vertices = pts3d.reshape(-1, 3)
+
+    # vertex index of every pixel, then the four corners of each 2x2 cell
+    grid = np.arange(len(vertices)).reshape(H, W)
+    top_left = grid[:-1, :-1].ravel()
+    top_right = grid[:-1, +1:].ravel()
+    bottom_left = grid[+1:, :-1].ravel()
+    bottom_right = grid[+1:, +1:].ravel()
+
+    upper = np.column_stack((top_left, top_right, bottom_left))
+    lower = np.column_stack((top_right, bottom_left, bottom_right))
+    # [:, ::-1] is the same triangle with the opposite winding
+    faces = np.concatenate((upper, upper[:, ::-1], lower, lower[:, ::-1]), axis=0)
+
+    upper_colors = img[:-1, :-1].reshape(-1, 3)
+    lower_colors = img[+1:, +1:].reshape(-1, 3)
+    face_colors = np.concatenate(
+        (upper_colors, upper_colors, lower_colors, lower_colors), axis=0
+    )
+
+    if valid is not None:
+        assert valid.shape == (H, W)
+        keep = valid.ravel()[faces].all(axis=-1)
+        faces = faces[keep]
+        face_colors = face_colors[keep]
+
+    assert len(faces) == len(face_colors)
+    return dict(vertices=vertices, face_colors=face_colors, faces=faces)
+
+
+def cat_meshes(meshes):
+    """Merge several mesh dicts into one.
+
+    Args:
+        meshes: sequence of dicts with ``"vertices"``, ``"faces"`` and
+            ``"face_colors"`` arrays (the layout built by ``pts3d_to_trimesh``).
+
+    Returns:
+        dict with the concatenated ``vertices``, ``face_colors`` and ``faces``;
+        the face indices of each mesh are shifted by the number of vertices
+        that precede it.
+
+    The input meshes are left untouched: the offsets are added out of place
+    (they used to be added in place, which corrupted the callers' face arrays
+    and double-shifted them when the same meshes were merged again).
+    """
+    vertices, faces, colors = zip(
+        *[(m["vertices"], m["faces"], m["face_colors"]) for m in meshes]
+    )
+    n_vertices = np.cumsum([0] + [len(v) for v in vertices])
+    shifted_faces = [f + offset for f, offset in zip(faces, n_vertices)]
+
+    vertices = np.concatenate(vertices)
+    colors = np.concatenate(colors)
+    faces = np.concatenate(shifted_faces)
+    return dict(vertices=vertices, face_colors=colors, faces=faces)
+
+
+def show_duster_pairs(view1, view2, pred1, pred2):
+    """Interactively browse image pairs with their confidence maps.
+
+    For every pair the two images (left column) and the two confidence maps
+    (right column, color range 1..30) are drawn in the current matplotlib
+    figure, a score (product of the two mean confidences) is printed, and the
+    user is asked on stdin whether to open the merged point cloud.
+    """
+    import matplotlib.pyplot as pl
+
+    pl.ion()
+
+    n_pairs = len(view1["instance"])
+    for e in range(n_pairs):
+        i, j = view1["idx"][e], view2["idx"][e]
+        img1, img2 = rgb(view1["img"][e]), rgb(view2["img"][e])
+        conf1 = pred1["conf"][e].squeeze()
+        conf2 = pred2["conf"][e].squeeze()
+        score = conf1.mean() * conf2.mean()
+        print(f">> Showing pair #{e} {i}-{j} {score=:g}")
+
+        pl.clf()
+        for cell, picture in ((221, img1), (223, img2)):
+            pl.subplot(cell).imshow(picture)
+        for cell, confidence in ((222, conf1), (224, conf2)):
+            pl.subplot(cell).imshow(confidence, vmin=1, vmax=30)
+        pts1 = pred1["pts3d"][e]
+        pts2 = pred2["pts3d_in_other_view"][e]
+        pl.subplots_adjust(0, 0, 1, 1, 0, 0)
+
+        wants_cloud = input("show pointcloud? (y/n) ") == "y"
+        if wants_cloud:
+            show_raw_pointcloud(cat(pts1, pts2), cat(img1, img2), point_size=5)
+
+
+def auto_cam_size(im_poses):
+    """Return one tenth of ``get_med_dist_between_poses(im_poses)``."""
+    median_dist = get_med_dist_between_poses(im_poses)
+    return 0.1 * median_dist
+
+
+class SceneViz:
+    """Fluent builder around a ``trimesh.Scene``.
+
+    The ``add_*`` methods append geometry to ``self.scene`` and return
+    ``self`` so that calls can be chained; ``show`` opens the viewer.
+    """
+
+    def __init__(self):
+        self.scene = trimesh.Scene()
+
+    def add_pointcloud(self, pts3d, color=(0, 0, 0), mask=None, denoise=False):
+        """Add a point cloud to the scene.
+
+        Args:
+            pts3d: one array/tensor of points, or a list of them.
+            color: a tuple (one color for every point, the default) or
+                per-point colors (array/tensor, or a list of them matching
+                ``pts3d``).
+            mask: optional boolean mask(s) selecting the points to keep.
+            denoise: if True, drop the points whose distance to the median
+                point is at or above the 99% quantile of those distances.
+
+        Returns:
+            self
+        """
+        pts3d = to_numpy(pts3d)
+        mask = to_numpy(mask)
+        if not isinstance(pts3d, list):
+            pts3d = [pts3d.reshape(-1, 3)]
+            if mask is not None:
+                mask = [mask.ravel()]
+        if not isinstance(color, (tuple, list)):
+            color = [color.reshape(-1, 3)]
+        if mask is None:
+            mask = [slice(None)] * len(pts3d)
+
+        pts = np.concatenate([p[m] for p, m in zip(pts3d, mask)])
+        pct = trimesh.PointCloud(pts)
+
+        if isinstance(color, (list, np.ndarray, torch.Tensor)):
+            color = to_numpy(color)
+            col = np.concatenate([p[m] for p, m in zip(color, mask)])
+            # the failure message used to call an undefined name
+            assert (
+                col.shape == pts.shape
+            ), f"color shape {col.shape} does not match points shape {pts.shape}"
+            pct.visual.vertex_colors = uint8(col.reshape(-1, 3))
+        else:
+            assert len(color) == 3
+            pct.visual.vertex_colors = np.broadcast_to(uint8(color), pts.shape)
+
+        if denoise:
+
+            centroid = np.median(pct.vertices, axis=0)
+            dist_to_centroid = np.linalg.norm(pct.vertices - centroid, axis=-1)
+            dist_thr = np.quantile(dist_to_centroid, 0.99)
+            valid = dist_to_centroid < dist_thr
+
+            # the keyword is ``colors`` (as in the other PointCloud calls of
+            # this module); ``color=`` left the filtered cloud uncolored
+            pct = trimesh.PointCloud(
+                pct.vertices[valid], colors=pct.visual.vertex_colors[valid]
+            )
+
+        self.scene.add_geometry(pct)
+        return self
+
+    def add_rgbd(
+        self, image, depth, intrinsics=None, cam2world=None, zfar=np.inf, mask=None
+    ):
+        """Back-project an RGB-D frame and add it as a colored point cloud.
+
+        When ``intrinsics`` is None a pinhole camera is assumed with
+        focal = max(H, W) and the principal point at the image center.
+        Points are kept where ``depthmap_to_absolute_camera_coordinates``
+        reports them valid, where ``depth < zfar`` and, if given, where
+        ``mask`` is set.
+
+        (An earlier definition of this method in the class body was shadowed
+        by this one and referenced an undefined helper; it has been removed.)
+
+        Returns:
+            self
+        """
+
+        if intrinsics is None:
+            H, W, THREE = image.shape
+            focal = max(H, W)
+            intrinsics = np.float32([[focal, 0, W / 2], [0, focal, H / 2], [0, 0, 1]])
+
+        pts3d, mask2 = depthmap_to_absolute_camera_coordinates(
+            depth, intrinsics, cam2world
+        )
+        mask2 &= depth < zfar
+
+        if mask is not None:
+            mask2 &= mask
+
+        return self.add_pointcloud(pts3d, image, mask=mask2)
+
+    def add_camera(
+        self,
+        pose_c2w,
+        focal=None,
+        color=(0, 0, 0),
+        image=None,
+        imsize=None,
+        cam_size=0.03,
+    ):
+        """Add one camera frustum to the scene.
+
+        ``focal`` may be a 3x3 intrinsics matrix: the focal length then
+        becomes the geometric mean of its two diagonal focal terms and, if
+        ``imsize`` is not given, the image size is taken as twice the
+        principal point.
+
+        Returns:
+            self
+        """
+        pose_c2w, focal, color, image = to_numpy((pose_c2w, focal, color, image))
+        image = img_to_arr(image)
+        if isinstance(focal, np.ndarray) and focal.shape == (3, 3):
+            intrinsics = focal
+            focal = (intrinsics[0, 0] * intrinsics[1, 1]) ** 0.5
+            if imsize is None:
+                imsize = (2 * intrinsics[0, 2], 2 * intrinsics[1, 2])
+
+        add_scene_cam(
+            self.scene,
+            pose_c2w,
+            color,
+            image,
+            focal,
+            imsize=imsize,
+            screen_width=cam_size,
+            marker=None,
+        )
+        return self
+
+    def add_cameras(
+        self, poses, focals=None, images=None, imsizes=None, colors=None, **kw
+    ):
+        """Call ``add_camera`` once per pose; a per-camera sequence may be None.
+
+        Returns:
+            self
+        """
+
+        def pick(seq, idx):
+            return None if seq is None else seq[idx]
+
+        for i, pose_c2w in enumerate(poses):
+            self.add_camera(
+                pose_c2w,
+                pick(focals, i),
+                image=pick(images, i),
+                color=pick(colors, i),
+                imsize=pick(imsizes, i),
+                **kw,
+            )
+        return self
+
+    def show(self, point_size=2):
+        """Open the trimesh viewer on the accumulated scene."""
+        self.scene.show(line_settings={"point_size": point_size})
+
+
+def show_raw_pointcloud_with_cams(
+    imgs, pts3d, mask, focals, cams2world, point_size=2, cam_size=0.05, cam_color=None
+):
+    """Show a masked, colored point cloud together with camera frusta.
+
+    imgs = (N, H, W, 3) or N-size list of [(H,W,3), ...]
+    pts3d = (N, H, W, 3) or N-size list of [(H,W,3), ...]
+    mask = one boolean mask per point map, used to index both pts3d and imgs
+    focals = (N,) or N-size list of [focal, ...]
+    cams2world = (N,4,4) or N-size list of [(4,4), ...]
+    cam_color = a list with one edge color per camera, a single color for all
+                cameras, or None/falsy to cycle through CAM_COLORS
+
+    There may be more cameras than images; such cameras are drawn untextured.
+    """
+    assert len(pts3d) == len(mask) <= len(imgs) <= len(cams2world) == len(focals)
+    pts3d = to_numpy(pts3d)
+    imgs = to_numpy(imgs)
+    focals = to_numpy(focals)
+    cams2world = to_numpy(cams2world)
+
+    scene = trimesh.Scene()
+
+    kept_points = [points[keep] for points, keep in zip(pts3d, mask)]
+    kept_colors = [image[keep] for image, keep in zip(imgs, mask)]
+    pts = np.concatenate(kept_points)
+    col = np.concatenate(kept_colors)
+    cloud = trimesh.PointCloud(pts.reshape(-1, 3), colors=col.reshape(-1, 3))
+    scene.add_geometry(cloud)
+
+    per_camera_colors = isinstance(cam_color, list)
+    n_images = len(imgs)
+    for i, pose_c2w in enumerate(cams2world):
+        if per_camera_colors:
+            camera_edge_color = cam_color[i]
+        else:
+            camera_edge_color = cam_color or CAM_COLORS[i % len(CAM_COLORS)]
+        texture = imgs[i] if i < n_images else None
+        add_scene_cam(
+            scene,
+            pose_c2w,
+            camera_edge_color,
+            texture,
+            focals[i],
+            screen_width=cam_size,
+        )
+
+    scene.show(line_settings={"point_size": point_size})
+
+
+def add_scene_cam(
+    scene,
+    pose_c2w,
+    edge_color,
+    image=None,
+    focal=None,
+    imsize=None,
+    screen_width=0.03,
+    marker=None,
+):
+    """Add a camera frustum (a 4-sided cone) to ``scene``.
+
+    Args:
+        scene: object exposing ``add_geometry`` (a ``trimesh.Scene`` in this
+            module).
+        pose_c2w: 4x4 camera pose; it is left-multiplied onto the cone
+            transform and its ``[:3, 3]`` column is used as the marker position.
+        edge_color: color written to the first three face-color channels.
+        image: optional (H, W, 3) image shown as a texture on the frustum;
+            non-uint8 images are multiplied by 255 and cast to uint8.
+        focal: focal length; an ndarray is reduced to its first element. When
+            falsy it defaults to ``1.1 * min(H, W)``.
+        imsize: optional ``(W, H)``, used only when ``image`` is None.
+        screen_width: sets the size of the frustum.
+        marker: if ``"o"``, an icosphere is also added at the camera position.
+    """
+    # Pick the frame size (H, W): from the image, else from imsize, else
+    # derived from the focal length, else 1x1.
+    if image is not None:
+        image = np.asarray(image)
+        H, W, THREE = image.shape
+        assert THREE == 3
+        if image.dtype != np.uint8:
+            image = np.uint8(255 * image)
+    elif imsize is not None:
+        W, H = imsize
+    elif focal is not None:
+        H = W = focal / 1.1
+    else:
+        H = W = 1
+
+    if isinstance(focal, np.ndarray):
+        focal = focal[0]
+    if not focal:
+        focal = min(H, W) * 1.1  # default value
+
+    # Cone dimensions; the height grows with focal / H but is never smaller
+    # than a tenth of screen_width.
+    height = max(screen_width / 10, focal * screen_width / H)
+    width = screen_width * 0.5**0.5
+    # 45 degree rotation about z, combined with a shift of -height along z
+    rot45 = np.eye(4)
+    rot45[:3, :3] = Rotation.from_euler("z", np.deg2rad(45)).as_matrix()
+    rot45[2, 3] = -height  # set the tip of the cone = optical center
+    # stretch x by the aspect ratio W / H
+    aspect_ratio = np.eye(4)
+    aspect_ratio[0, 0] = W / H
+    transform = pose_c2w @ OPENGL @ aspect_ratio @ rot45
+    cam = trimesh.creation.cone(width, height, sections=4)  # , transform=transform)
+
+    if image is not None:
+        # Textured quad made of two triangles, each also listed with the
+        # opposite winding.
+        # NOTE(review): vertices 4, 5, 1, 3 are presumably the four base
+        # corners of the cone in trimesh's vertex order - confirm.
+        vertices = geotrf(transform, cam.vertices[[4, 5, 1, 3]])
+        faces = np.array([[0, 1, 2], [0, 2, 3], [2, 1, 0], [3, 2, 0]])
+        img = trimesh.Trimesh(vertices=vertices, faces=faces)
+        uv_coords = np.float32([[0, 0], [1, 0], [1, 1], [0, 1]])
+        img.visual = trimesh.visual.TextureVisuals(
+            uv_coords, image=PIL.Image.fromarray(image)
+        )
+        scene.add_geometry(img)
+
+    # Three copies of the cone vertices are stacked: the original, one scaled
+    # by 0.95 and one rotated by 2 degrees about z. The triangles built below
+    # join each original face to its counterparts in the two copies.
+    # NOTE(review): presumably this draws the frustum edges as thin slivers
+    # rather than as filled faces - confirm.
+    rot2 = np.eye(4)
+    rot2[:3, :3] = Rotation.from_euler("z", np.deg2rad(2)).as_matrix()
+    vertices = np.r_[cam.vertices, 0.95 * cam.vertices, geotrf(rot2, cam.vertices)]
+    vertices = geotrf(transform, vertices)
+    faces = []
+    for face in cam.faces:
+        # faces that use vertex 0 are skipped
+        if 0 in face:
+            continue
+        a, b, c = face
+        # indices of the same vertices in the scaled and in the rotated copy
+        a2, b2, c2 = face + len(cam.vertices)
+        a3, b3, c3 = face + 2 * len(cam.vertices)
+
+        faces.append((a, b, b2))
+        faces.append((a, a2, c))
+        faces.append((c2, b, c))
+
+        faces.append((a, b, b3))
+        faces.append((a, a3, c))
+        faces.append((c3, b, c))
+
+    # add every triangle again with reversed winding
+    faces += [(c, b, a) for a, b, c in faces]
+
+    cam = trimesh.Trimesh(vertices=vertices, faces=faces)
+    cam.visual.face_colors[:, :3] = edge_color
+    scene.add_geometry(cam)
+
+    if marker == "o":
+        # sphere of radius screen_width / 4 translated to the camera position
+        marker = trimesh.creation.icosphere(3, radius=screen_width / 4)
+        marker.vertices += pose_c2w[:3, 3]
+        marker.visual.face_colors[:, :3] = edge_color
+        scene.add_geometry(marker)
+
+
+def cat(a, b):
+    """Reshape ``a`` and ``b`` to (-1, 3) and stack them along the first axis."""
+    flat_a = a.reshape(-1, 3)
+    flat_b = b.reshape(-1, 3)
+    return np.concatenate((flat_a, flat_b))
+
+
+# 4x4 homogeneous transform that negates the y and z axes.
+# NOTE(review): going by the name, this presumably converts between the
+# OpenCV-style and OpenGL-style camera axis conventions - confirm.
+OPENGL = np.array([[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, 1]])
+
+
+# Default camera edge colors; show_raw_pointcloud_with_cams cycles through
+# them by camera index (modulo the list length) when no color is supplied.
+# NOTE(review): the triplets look like 0-255 RGB values - confirm.
+CAM_COLORS = [
+    (255, 0, 0),
+    (0, 0, 255),
+    (0, 255, 0),
+    (255, 0, 255),
+    (255, 204, 0),
+    (0, 204, 204),
+    (128, 255, 255),
+    (255, 128, 255),
+    (255, 255, 128),
+    (0, 0, 0),
+    (128, 128, 128),
+]
+
+
+def uint8(colors):
+    """Convert colors to ``np.uint8``.
+
+    Floating-point input is multiplied by 255 first (it is expected to lie in
+    [0, 1]); integer input is converted as is. Anything array-like is accepted.
+
+    The input is never modified: the scaling is done out of place (it used to
+    be ``colors *= 255``, which also rescaled the caller's float array).
+
+    Raises:
+        AssertionError: if a value falls outside [0, 256) after scaling.
+    """
+    if not isinstance(colors, np.ndarray):
+        colors = np.array(colors)
+    if np.issubdtype(colors.dtype, np.floating):
+        colors = colors * 255
+    assert 0 <= colors.min() and colors.max() < 256
+    return np.uint8(colors)
+
+
+def segment_sky(image):
+    """Heuristic sky segmentation based on HSV thresholds.
+
+    Args:
+        image: 3-channel image (array or tensor); floating-point input is
+            clipped to [0, 1] and scaled to uint8.
+
+    Returns:
+        Boolean ``torch.Tensor`` mask, True on the pixels kept as sky.
+
+    Steps: threshold the HSV image (hue in [0, 30] with value >= 100, plus
+    three "low saturation and bright" rules), clean the mask with a 5x5 binary
+    opening, then keep the largest connected component together with every
+    component bigger than half of it.
+    """
+    from scipy import ndimage
+
+    image = to_numpy(image)
+    if np.issubdtype(image.dtype, np.floating):
+        image = np.uint8(255 * image.clip(min=0, max=1))
+    # NOTE(review): the conversion code is BGR2HSV; if callers pass RGB images
+    # the red/blue swap would map blue pixels into the low-hue range selected
+    # below - confirm the expected channel order.
+    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
+
+    lower_blue = np.array([0, 0, 100])
+    upper_blue = np.array([30, 255, 255])
+    mask = cv2.inRange(hsv, lower_blue, upper_blue).view(bool)
+
+    # also accept weakly saturated but bright pixels
+    mask |= (hsv[:, :, 1] < 10) & (hsv[:, :, 2] > 150)
+    mask |= (hsv[:, :, 1] < 30) & (hsv[:, :, 2] > 180)
+    mask |= (hsv[:, :, 1] < 50) & (hsv[:, :, 2] > 220)
+
+    kernel = np.ones((5, 5), np.uint8)
+    mask2 = ndimage.binary_opening(mask, structure=kernel)
+
+    _, labels, stats, _ = cv2.connectedComponentsWithStats(
+        mask2.view(np.uint8), connectivity=8
+    )
+    cc_sizes = stats[1:, cv2.CC_STAT_AREA]  # row 0 (label 0) is skipped
+    order = cc_sizes.argsort()[::-1]  # bigger first
+    i = 0
+    selection = []
+    while i < len(order) and cc_sizes[order[i]] > cc_sizes[order[0]] / 2:
+        selection.append(1 + order[i])
+        i += 1
+    # np.isin keeps the shape of ``labels`` and replaces the deprecated np.in1d
+    mask3 = np.isin(labels, selection)
+
+    return torch.from_numpy(mask3)
+
+
+def get_vertical_colorbar(h, vmin, vmax, cmap_name="jet", label=None, cbar_precision=2):
+    """
+    Render a vertical colorbar with matplotlib and return it as an image.
+
+    :param h: height of the returned image in pixels; the rendered bar is
+              resized (keeping its aspect ratio) when its height differs
+    :param vmin: min value
+    :param vmax: max value
+    :param cmap_name: name of the matplotlib colormap
+    :param label: optional label drawn next to the bar
+    :param cbar_precision: number of decimals of the 6 tick labels; with 0 the
+                           trailing ".0" is stripped
+    :return: float32 RGB image with values in [0, 1]
+    """
+    fig = Figure(figsize=(2, 8), dpi=100)
+    fig.subplots_adjust(right=1.5)
+    canvas = FigureCanvasAgg(fig)
+
+    ax = fig.add_subplot(111)
+    # matplotlib.cm.get_cmap was removed in Matplotlib 3.9: use the colormap
+    # registry when it exists and fall back to the old call otherwise.
+    if isinstance(cmap_name, str) and hasattr(mpl, "colormaps"):
+        cmap = mpl.colormaps[cmap_name]
+    else:
+        cmap = cm.get_cmap(cmap_name)
+    norm = mpl.colors.Normalize(vmin=vmin, vmax=vmax)
+
+    tick_cnt = 6
+    tick_loc = np.linspace(vmin, vmax, tick_cnt)
+    cb1 = mpl.colorbar.ColorbarBase(
+        ax, cmap=cmap, norm=norm, ticks=tick_loc, orientation="vertical"
+    )
+
+    tick_label = [str(np.round(x, cbar_precision)) for x in tick_loc]
+    if cbar_precision == 0:
+        # "12.0" -> "12"
+        tick_label = [x[:-2] for x in tick_label]
+
+    cb1.set_ticklabels(tick_label)
+
+    cb1.ax.tick_params(labelsize=18, rotation=0)
+    if label is not None:
+        cb1.set_label(label)
+
+    fig.tight_layout()
+
+    # rasterize the figure and read the RGBA buffer back
+    canvas.draw()
+    s, (width, height) = canvas.print_to_buffer()
+
+    im = np.frombuffer(s, np.uint8).reshape((height, width, 4))
+
+    im = im[:, :, :3].astype(np.float32) / 255.0  # drop alpha
+    if h != im.shape[0]:
+        w = int(im.shape[1] / im.shape[0] * h)
+        im = cv2.resize(im, (w, h), interpolation=cv2.INTER_AREA)
+
+    return im
+
+
+def colorize_np(
+    x,
+    cmap_name="jet",
+    mask=None,
+    range=None,
+    append_cbar=False,
+    cbar_in_image=False,
+    cbar_precision=2,
+):
+    """
+    turn a grayscale image into a color image
+    :param x: input grayscale, [H, W]; it is not modified
+    :param cmap_name: the colorization method
+    :param mask: the mask image, [H, W]; masked-out pixels are painted white
+    :param range: the range for scaling, automatic if None, [min, max].
+                  Automatic range: smallest non-zero and largest masked value
+                  when a mask is given, else the 1st..100th percentiles
+    :param append_cbar: if append the color bar
+    :param cbar_in_image: put the color bar inside the image to keep the output image the same size as the input image
+    :param cbar_precision: number of decimals of the color bar tick labels
+    :return: colorized image, [H, W, 3] (wider when the color bar is appended
+             next to the image)
+    """
+    if range is not None:
+        vmin, vmax = range
+    elif mask is not None:
+        masked_values = x[mask]
+        vmin = np.min(masked_values[np.nonzero(masked_values)])
+        vmax = np.max(masked_values)
+
+        # work on a copy: the fill below used to write into the caller's array
+        x = x.copy()
+        x[np.logical_not(mask)] = vmin
+
+    else:
+        vmin, vmax = np.percentile(x, (1, 100))
+        vmax += 1e-6
+
+    x = np.clip(x, vmin, vmax)
+    x = (x - vmin) / (vmax - vmin)
+
+    # matplotlib.cm.get_cmap was removed in Matplotlib 3.9: use the colormap
+    # registry when it exists and fall back to the old call otherwise.
+    if isinstance(cmap_name, str) and hasattr(mpl, "colormaps"):
+        cmap = mpl.colormaps[cmap_name]
+    else:
+        cmap = cm.get_cmap(cmap_name)
+    x_new = cmap(x)[:, :, :3]
+
+    if mask is not None:
+        mask = np.float32(mask[:, :, np.newaxis])
+        x_new = x_new * mask + np.ones_like(x_new) * (1.0 - mask)
+
+    if not append_cbar:
+        return x_new
+
+    # the color bar is a full matplotlib render: build it only when requested
+    cbar = get_vertical_colorbar(
+        h=x.shape[0],
+        vmin=vmin,
+        vmax=vmax,
+        cmap_name=cmap_name,
+        cbar_precision=cbar_precision,
+    )
+
+    if cbar_in_image:
+        x_new[:, -cbar.shape[1] :, :] = cbar
+    else:
+        # 5-pixel black separator between the image and the color bar
+        x_new = np.concatenate((x_new, np.zeros_like(x_new[:, :5, :]), cbar), axis=1)
+    return x_new
+
+
+def colorize(
+    x, cmap_name="jet", mask=None, range=None, append_cbar=False, cbar_in_image=False
+):
+    """
+    turn a grayscale image into a color image
+    :param x: torch.Tensor, grayscale image, [H, W] or [B, H, W]
+    :param cmap_name: name of the matplotlib colormap
+    :param mask: torch.Tensor or None, mask image, [H, W] or [B, H, W] or None;
+                 values above 0.99 count as valid and each mask is eroded with
+                 a 3x3 kernel before use. A single [H, W] mask is shared by all
+                 images of a batch
+    :param range: the range for scaling, automatic if None, [min, max]
+    :param append_cbar: if append the color bar
+    :param cbar_in_image: put the color bar inside the image
+    :return: float torch.Tensor on the device of ``x``, the per-image results
+             stacked and passed through ``squeeze(0)``
+    """
+
+    device = x.device
+    x = x.cpu().numpy()
+    if mask is not None:
+        mask = mask.cpu().numpy() > 0.99
+
+    if x.ndim == 2:
+        x = x[None]
+    if mask is not None and mask.ndim == 2:
+        mask = np.broadcast_to(mask[None], x.shape)
+
+    kernel = np.ones((3, 3), np.uint8)
+    out = []
+    for i, x_ in enumerate(x):
+        # Erode the mask of THIS image only. The whole batched mask used to be
+        # eroded again at every iteration and passed on with its batch axis.
+        mask_ = None
+        if mask is not None:
+            mask_ = cv2.erode(mask[i].astype(np.uint8), kernel, iterations=1)
+            mask_ = mask_.astype(bool)
+
+        x_ = colorize_np(x_, cmap_name, mask_, range, append_cbar, cbar_in_image)
+        out.append(torch.from_numpy(x_).to(device).float())
+    out = torch.stack(out).squeeze(0)
+    return out
+
+
+def draw_correspondences(
+    imgs1, imgs2, coords1, coords2, interval=10, color_by=0, radius=2
+):
+    """
+    draw correspondences between two batches of images as colored dots
+    :param imgs1: tensor [B, H, W, 3]
+    :param imgs2: tensor [B, H, W, 3]
+    :param coords1: tensor of 2D coordinates in image 1, one entry per batch
+                    item; each entry is subsampled with a stride of
+                    ``interval`` along its first two axes and then flattened
+                    to [N, 2]
+    :param coords2: same as coords1, for image 2
+    :param interval: int the interval between two points
+    :param color_by: specify the color based on image 1 or image 2, 0 or 1
+    :param radius: radius of the drawn dots
+    :return: [B, 2*H, W, 3]
+    """
+
+    def to_points(coords):
+        grid = coords.detach().cpu().numpy()
+        return grid[::interval, ::interval].reshape(-1, 2)
+
+    drawn = [
+        drawMatches(
+            imgs1[b].detach().cpu().numpy(),
+            imgs2[b].detach().cpu().numpy(),
+            to_points(coords1[b]),
+            to_points(coords2[b]),
+            radius=radius,
+            color_by=color_by,
+            row_cat=True,
+        )
+        for b in range(len(imgs1))
+    ]
+    return np.stack(drawn)
+
+
+def draw_correspondences_lines(
+    imgs1, imgs2, coords1, coords2, interval=10, color_by=0, radius=2
+):
+    """
+    draw correspondences between two batches of images as dots joined by lines
+    :param imgs1: tensor [B, H, W, 3]
+    :param imgs2: tensor [B, H, W, 3]
+    :param coords1: tensor of 2D coordinates in image 1, one entry per batch
+                    item; each entry is subsampled with a stride of
+                    ``interval`` along its first two axes and then flattened
+                    to [N, 2]
+    :param coords2: same as coords1, for image 2
+    :param interval: int the interval between two points
+    :param color_by: specify the color based on image 1 or image 2, 0 or 1
+    :param radius: radius of the drawn dots
+    :return: [B, 2*H, W, 3]
+    """
+
+    def to_points(coords):
+        grid = coords.detach().cpu().numpy()
+        return grid[::interval, ::interval].reshape(-1, 2)
+
+    drawn = [
+        drawMatches_lines(
+            imgs1[b].detach().cpu().numpy(),
+            imgs2[b].detach().cpu().numpy(),
+            to_points(coords1[b]),
+            to_points(coords2[b]),
+            radius=radius,
+            color_by=color_by,
+            row_cat=True,
+        )
+        for b in range(len(imgs1))
+    ]
+    return np.stack(drawn)
+
+
+def drawMatches(img1, img2, kp1, kp2, radius=2, mask=None, color_by=0, row_cat=False):
+    """Draw matching keypoints as colored dots on two images and join the images.
+
+    Args:
+        img1, img2: float images; they are converted with ``float2uint8``.
+        kp1, kp2: [N, 2] arrays of (x, y) keypoints; row i of both arrays is
+            one match.
+        radius: radius of the dots.
+        mask: optional per-match flags. Matches are reordered so that the
+            largest mask values come first, and flagged matches are drawn as
+            crosses instead of dots in image 2.
+        color_by: 0 to color a match by the angle of its point in image 1
+            around the median keypoint, 1 to use image 2.
+        row_cat: stack the two images vertically if True, else horizontally.
+
+    Returns:
+        uint8 image holding both annotated images.
+
+    Raises:
+        ValueError: if ``color_by`` is neither 0 nor 1 (this used to surface
+            as an unbound local variable inside the loop).
+    """
+    if color_by not in (0, 1):
+        raise ValueError(f"color_by must be 0 or 1, got {color_by!r}")
+
+    h1, w1 = img1.shape[:2]
+    h2, w2 = img2.shape[:2]
+
+    img1 = np.ascontiguousarray(float2uint8(img1))
+    img2 = np.ascontiguousarray(float2uint8(img2))
+
+    center1 = np.median(kp1, axis=0)
+    center2 = np.median(kp2, axis=0)
+
+    # 128 hues from matplotlib's "hsv" colormap, channel order reversed
+    # (presumably RGB -> BGR for OpenCV), as tuples of ints
+    n_colors = 128
+    palette = [
+        tuple(
+            (255 * np.array(plt.cm.hsv(i / float(n_colors)))[:3][::-1])
+            .astype(np.int32)
+            .tolist()
+        )
+        for i in range(n_colors)
+    ]
+
+    if mask is not None:
+        ind = np.argsort(mask)[::-1]
+        kp1 = kp1[ind]
+        kp2 = kp2[ind]
+        mask = mask[ind]
+
+    def inside(pt, width, height):
+        return 0 <= pt[0] <= width - 1 and 0 <= pt[1] <= height - 1
+
+    for i, (pt1, pt2) in enumerate(zip(kp1, kp2)):
+        ref, center = (pt1, center1) if color_by == 0 else (pt2, center2)
+        coord_angle = np.arctan2(ref[1] - center[1], ref[0] - center[0])
+        # map the angle in [-pi, pi] onto a palette index
+        color = palette[int(np.int32(64 * coord_angle / np.pi) % n_colors)]
+
+        if inside(pt1, w1, h1):
+            img1 = cv2.circle(
+                img1, (int(pt1[0]), int(pt1[1])), radius, color, -1, cv2.LINE_AA
+            )
+
+        if inside(pt2, w2, h2):
+            if mask is not None and mask[i]:
+                img2 = cv2.drawMarker(
+                    img2,
+                    (int(pt2[0]), int(pt2[1])),
+                    color,
+                    markerType=cv2.MARKER_CROSS,
+                    markerSize=int(5 * radius),
+                    # keep the thickness at 1 or more when radius < 2
+                    thickness=max(1, int(radius / 2)),
+                    line_type=cv2.LINE_AA,
+                )
+            else:
+                img2 = cv2.circle(
+                    img2, (int(pt2[0]), int(pt2[1])), radius, color, -1, cv2.LINE_AA
+                )
+
+    # axis 0 stacks the images vertically, axis 1 side by side
+    return np.concatenate([img1, img2], axis=0 if row_cat else 1)
+
+
+def drawMatches_lines(
+    img1, img2, kp1, kp2, radius=2, mask=None, color_by=0, row_cat=False
+):
+    """Join two images and draw a random subset of matches as dots linked by lines.
+
+    Args:
+        img1, img2: float images; they are converted with ``float2uint8``.
+        kp1, kp2: [N, 2] arrays of (x, y) keypoints; row i of both arrays is
+            one match.
+        radius: radius of the dots.
+        mask: optional per-match values, only used to reorder the matches
+            (largest first).
+        color_by: 0 to color a match by the angle of its point in image 1
+            around the median keypoint, 1 to use image 2.
+        row_cat: stack the two images vertically if True, else horizontally.
+
+    Returns:
+        uint8 image holding both images with the selected matches drawn.
+
+    Each match is drawn with probability 0.1 (one ``np.random.rand()`` draw
+    per match), and only when both of its points fall inside their image.
+
+    Raises:
+        ValueError: if ``color_by`` is neither 0 nor 1 (this used to surface
+            as an unbound local variable inside the loop).
+    """
+    if color_by not in (0, 1):
+        raise ValueError(f"color_by must be 0 or 1, got {color_by!r}")
+
+    h1, w1 = img1.shape[:2]
+    h2, w2 = img2.shape[:2]
+
+    img1 = np.ascontiguousarray(float2uint8(img1))
+    img2 = np.ascontiguousarray(float2uint8(img2))
+
+    center1 = np.median(kp1, axis=0)
+    center2 = np.median(kp2, axis=0)
+
+    # 128 hues from matplotlib's "hsv" colormap, channel order reversed
+    # (presumably RGB -> BGR for OpenCV), as tuples of ints
+    n_colors = 128
+    palette = [
+        tuple(
+            (255 * np.array(plt.cm.hsv(i / float(n_colors)))[:3][::-1])
+            .astype(np.int32)
+            .tolist()
+        )
+        for i in range(n_colors)
+    ]
+
+    if mask is not None:
+        ind = np.argsort(mask)[::-1]
+        kp1 = kp1[ind]
+        kp2 = kp2[ind]
+        mask = mask[ind]
+
+    # image 2 sits below image 1 (row_cat) or to its right
+    whole_img = np.concatenate([img1, img2], axis=0 if row_cat else 1)
+    shift_x, shift_y = (0, h1) if row_cat else (w1, 0)
+
+    def inside(pt, width, height):
+        return 0 <= pt[0] <= width - 1 and 0 <= pt[1] <= height - 1
+
+    for pt1, pt2 in zip(kp1, kp2):
+        ref, center = (pt1, center1) if color_by == 0 else (pt2, center2)
+        coord_angle = np.arctan2(ref[1] - center[1], ref[0] - center[0])
+        # map the angle in [-pi, pi] onto a palette index
+        color = palette[int(np.int32(64 * coord_angle / np.pi) % n_colors)]
+
+        # keep roughly one match out of ten
+        if np.random.rand() >= 0.1:
+            continue
+        if not (inside(pt1, w1, h1) and inside(pt2, w2, h2)):
+            continue
+
+        start = (int(pt1[0]), int(pt1[1]))
+        end = (int(pt2[0] + shift_x), int(pt2[1] + shift_y))
+        whole_img = cv2.circle(whole_img, start, radius, color, -1, cv2.LINE_AA)
+        whole_img = cv2.circle(whole_img, end, radius, color, -1, cv2.LINE_AA)
+        cv2.line(whole_img, start, end, color, 1, cv2.LINE_AA)
+    return whole_img
+
+
+import torch
+import os
+import time
+import viser
+
+
+def rotation_matrix_to_quaternion(R):
+    """
+    Convert a rotation matrix to a quaternion in (w, x, y, z) order.
+
+    The conversion branches on the largest of ``trace(R)`` and the diagonal
+    entries, so the divisor is never close to zero. The plain trace formula
+    divides by ``4 * w`` and produces NaN/inf for rotations of (nearly)
+    180 degrees, where ``1 + trace(R)`` is zero or slightly negative.
+
+    The result is sign-normalised so that ``w >= 0``; for ``trace(R) > 0`` it
+    is numerically identical to the plain trace formula.
+
+    :param R: [3, 3] rotation matrix (assumed to be a proper rotation)
+    :return: [4] quaternion (w, x, y, z)
+    """
+    tr = np.trace(R)
+    Rxx = R[0, 0]
+    Ryy = R[1, 1]
+    Rzz = R[2, 2]
+    q = np.zeros(4)
+    if tr > 0:
+        s = 2.0 * np.sqrt(1.0 + tr)  # s == 4 * w
+        q[0] = 0.25 * s
+        q[1] = (R[2, 1] - R[1, 2]) / s
+        q[2] = (R[0, 2] - R[2, 0]) / s
+        q[3] = (R[1, 0] - R[0, 1]) / s
+    elif Rxx > Ryy and Rxx > Rzz:
+        s = 2.0 * np.sqrt(1.0 + Rxx - Ryy - Rzz)  # s == 4 * x
+        q[0] = (R[2, 1] - R[1, 2]) / s
+        q[1] = 0.25 * s
+        q[2] = (R[0, 1] + R[1, 0]) / s
+        q[3] = (R[0, 2] + R[2, 0]) / s
+    elif Ryy > Rzz:
+        s = 2.0 * np.sqrt(1.0 + Ryy - Rxx - Rzz)  # s == 4 * y
+        q[0] = (R[0, 2] - R[2, 0]) / s
+        q[1] = (R[0, 1] + R[1, 0]) / s
+        q[2] = 0.25 * s
+        q[3] = (R[1, 2] + R[2, 1]) / s
+    else:
+        s = 2.0 * np.sqrt(1.0 + Rzz - Rxx - Ryy)  # s == 4 * z
+        q[0] = (R[1, 0] - R[0, 1]) / s
+        q[1] = (R[0, 2] + R[2, 0]) / s
+        q[2] = (R[1, 2] + R[2, 1]) / s
+        q[3] = 0.25 * s
+    # q and -q encode the same rotation; keep the w >= 0 representative so
+    # results match the previous trace-based formula wherever it was valid.
+    if q[0] < 0:
+        q = -q
+    return q
+
+
+class PointCloudViewer:
+    """
+    Viser-based viewer that steps through point clouds saved on disk.
+
+    Every entry of ``pc_dir`` is loaded with ``np.load``. The step number of a
+    file is the integer after the last ``_`` in its name (taken before the
+    first ``.``), and files are played back in increasing step order.
+    """
+
+    def __init__(self, pc_dir, device="cpu"):
+        """
+        Start the viser server, load all point clouds and create the GUI slider.
+
+        :param pc_dir: directory containing one point-cloud file per step
+        :param device: torch device used by the ``self.tt`` helper
+        """
+        self.server = viser.ViserServer()
+        self.server.set_up_direction("-y")
+        self.device = device
+        # Helper: numpy array -> float32 torch tensor on ``device``.
+        self.tt = lambda x: torch.from_numpy(x).float().to(device)
+        self.pc_dir = pc_dir
+        self.pcs, self.all_steps = self.read_data()
+        self.num_frames = len(self.all_steps)
+
+        # NOTE(review): ``fix_camera`` is never set to True in this class, and
+        # ``add_pc`` raises NotImplementedError while it is False.
+        self.fix_camera = False
+        self.camera_scale = self.server.add_gui_slider(
+            "camera_scale",
+            min=0.01,
+            max=1.0,
+            step=0.01,
+            initial_value=0.1,
+        )
+
+        # Frustum handles whose ``scale`` is refreshed in the playback loop.
+        self.camera_handles = []
+
+    def read_data(self):
+        """
+        Load every file in ``self.pc_dir``.
+
+        :return: ``(pcs, step_list)`` where ``pcs`` maps step -> ``{"pc": data}``
+                 and ``step_list`` holds the steps in ascending order
+        """
+        pc_list = os.listdir(self.pc_dir)
+        # Sort numerically by step; a name without a trailing integer raises
+        # ValueError here.
+        pc_list.sort(key=lambda x: int(x.split(".")[0].split("_")[-1]))
+        pcs = {}
+        step_list = []
+        for pc_name in pc_list:
+            pc = np.load(os.path.join(self.pc_dir, pc_name))
+            step = int(pc_name.split(".")[0].split("_")[-1])
+            pcs.update({step: {"pc": pc}})
+            step_list.append(step)
+        return pcs, step_list
+
+    def parse_pc_data(self, pc, batch_idx=-1):
+        """
+        Extract flattened points and colors for one batch element.
+
+        :param pc: mapping with ``pts3d_{k}`` / ``colors_{k}`` entries, k = 1..K
+                   (assumes the mapping holds exactly these 2*K keys — TODO confirm)
+        :param batch_idx: index into the leading axis of each entry (default: last)
+        :return: dict with ``pred_pts_{k}`` and ``color_{k}``, each reshaped to [N, 3]
+        """
+        idx = batch_idx
+        ret_dict = {}
+        # Half of the keys are points and half are colors, hence ``// 2``.
+        for i in range(len(pc.keys()) // 2):
+            pred_pts = pc[f"pts3d_{i+1}"][idx].reshape(-1, 3)  # [N, 3]
+            color = pc[f"colors_{i+1}"][idx].reshape(-1, 3)  # [N, 3]
+            ret_dict.update({f"pred_pts_{i+1}": pred_pts, f"color_{i+1}": color})
+        return ret_dict
+
+    def add_pc(self, step):
+        """
+        Add the point clouds of ``step`` to the scene under ``/frames/{step}``.
+
+        :param step: key into ``self.pcs``
+        :raises NotImplementedError: whenever ``self.fix_camera`` is False
+        """
+        pc = self.pcs[step]["pc"]
+        pc_dict = self.parse_pc_data(pc)
+
+        for i in range(len(pc_dict.keys()) // 2):
+            self.server.add_point_cloud(
+                name=f"/frames/{step}/pred_pts_{i+1}_{step}",
+                points=pc_dict[f"pred_pts_{i+1}"],
+                colors=pc_dict[f"color_{i+1}"],
+                point_size=0.002,
+            )
+
+        if not self.fix_camera:
+            raise NotImplementedError
+
+            # NOTE(review): everything below is unreachable (it follows the
+            # raise). It reads keys such as "pred_pts1_2" / "color1_1" that
+            # ``parse_pc_data`` does not produce, and calls
+            # ``find_rigid_alignment_batched``, which is not defined in the
+            # visible part of this file — looks like a stale draft; confirm
+            # before reviving.
+            R21, T21 = find_rigid_alignment_batched(
+                torch.from_numpy(pc_dict["pred_pts1_2"][None]),
+                torch.from_numpy(pc_dict["pred_pts1_1"][None]),
+            )
+            R12, T12 = find_rigid_alignment_batched(
+                torch.from_numpy(pc_dict["pred_pts2_1"][None]),
+                torch.from_numpy(pc_dict["pred_pts2_2"][None]),
+            )
+            R21 = R21[0].numpy()
+            T21 = T21.numpy()
+            R12 = R12[0].numpy()
+            T12 = T12.numpy()
+            pred_pts1_2 = pc_dict["pred_pts1_2"] @ R21.T + T21
+            pred_pts2_1 = pc_dict["pred_pts2_1"] @ R12.T + T12
+            self.server.add_point_cloud(
+                name=f"/frames/{step}/pred_pts1_2_{step}",
+                points=pred_pts1_2,
+                colors=pc_dict["color1_2"],
+                point_size=0.002,
+            )
+
+            self.server.add_point_cloud(
+                name=f"/frames/{step}/pred_pts2_1_{step}",
+                points=pred_pts2_1,
+                colors=pc_dict["color2_1"],
+                point_size=0.002,
+            )
+            img1 = pc_dict["color1_1"].reshape(224, 224, 3)
+            img2 = pc_dict["color2_2"].reshape(224, 224, 3)
+            self.camera_handles.append(
+                self.server.add_camera_frustum(
+                    name=f"/frames/{step}/camera1_{step}",
+                    fov=2.0 * np.arctan(224.0 / 490.0),
+                    aspect=1.0,
+                    scale=self.camera_scale.value,
+                    color=(1.0, 0, 0),
+                    image=img1,
+                )
+            )
+            self.camera_handles.append(
+                self.server.add_camera_frustum(
+                    name=f"/frames/{step}/camera2_{step}",
+                    fov=2.0 * np.arctan(224.0 / 490.0),
+                    aspect=1.0,
+                    scale=self.camera_scale.value,
+                    color=(0, 0, 1.0),
+                    wxyz=rotation_matrix_to_quaternion(R21),
+                    position=T21,
+                    image=img2,
+                )
+            )
+
+    def animate(self):
+        """
+        Build the playback GUI, add one scene frame per step and loop forever.
+
+        The final ``while True`` loop never exits, so this method does not
+        return normally.
+        """
+        with self.server.add_gui_folder("Playback"):
+            gui_timestep = self.server.add_gui_slider(
+                "Train Step",
+                min=0,
+                max=self.num_frames - 1,
+                step=1,
+                initial_value=0,
+                disabled=True,
+            )
+            gui_next_frame = self.server.add_gui_button("Next Step", disabled=True)
+            gui_prev_frame = self.server.add_gui_button("Prev Step", disabled=True)
+            gui_playing = self.server.add_gui_checkbox("Playing", False)
+            gui_framerate = self.server.add_gui_slider(
+                "FPS", min=1, max=60, step=0.1, initial_value=1
+            )
+            gui_framerate_options = self.server.add_gui_button_group(
+                "FPS options", ("10", "20", "30", "60")
+            )
+
+        # Step forward / backward, wrapping around at the ends.
+        @gui_next_frame.on_click
+        def _(_) -> None:
+            gui_timestep.value = (gui_timestep.value + 1) % self.num_frames
+
+        @gui_prev_frame.on_click
+        def _(_) -> None:
+            gui_timestep.value = (gui_timestep.value - 1) % self.num_frames
+
+        # The manual controls mirror the "Playing" checkbox: they are disabled
+        # when it is checked and enabled when it is unchecked.
+        @gui_playing.on_update
+        def _(_) -> None:
+            gui_timestep.disabled = gui_playing.value
+            gui_next_frame.disabled = gui_playing.value
+            gui_prev_frame.disabled = gui_playing.value
+
+        @gui_framerate_options.on_click
+        def _(_) -> None:
+            gui_framerate.value = int(gui_framerate_options.value)
+
+        prev_timestep = gui_timestep.value
+
+        # Show the newly selected frame and hide the previous one.
+        # ``frame_nodes`` is defined further down; the closure resolves it when
+        # the callback actually runs.
+        # NOTE(review): if the callback fires with an unchanged value, the
+        # current frame is shown and then immediately hidden again.
+        @gui_timestep.on_update
+        def _(_) -> None:
+            nonlocal prev_timestep
+            current_timestep = gui_timestep.value
+            with self.server.atomic():
+                frame_nodes[current_timestep].visible = True
+                frame_nodes[prev_timestep].visible = False
+            prev_timestep = current_timestep
+            self.server.flush()  # Optional!
+
+        self.server.add_frame(
+            "/frames",
+            show_axes=False,
+        )
+        frame_nodes = []
+        for i in range(self.num_frames):
+            step = self.all_steps[i]
+            frame_nodes.append(
+                self.server.add_frame(
+                    f"/frames/{step}",
+                    show_axes=False,
+                )
+            )
+            # NOTE(review): raises NotImplementedError while fix_camera is False.
+            self.add_pc(step)
+
+        # Only the frame selected by the slider starts out visible.
+        for i, frame_node in enumerate(frame_nodes):
+
+            frame_node.visible = i == gui_timestep.value
+
+        prev_timestep = gui_timestep.value
+        while True:
+            if gui_playing.value:
+                gui_timestep.value = (gui_timestep.value + 1) % self.num_frames
+            # Keep frustum sizes in sync with the "camera_scale" slider.
+            for handle in self.camera_handles:
+                handle.scale = self.camera_scale.value
+            time.sleep(1.0 / gui_framerate.value)
+
+    def run(self):
+        """Start playback via ``animate``."""
+        self.animate()
+        # NOTE(review): ``animate`` loops forever, so this loop is only reached
+        # if ``animate`` is changed to return.
+        while True:
+            time.sleep(10.0)
+
+
+from sklearn.decomposition import PCA
+
+
+def colorize_feature_map(x):
+    """
+    Colorize a batch of feature maps, one PCA projection per batch element.
+
+    Args:
+        x: torch.Tensor, [B, H, W, D]
+    Returns:
+        torch.Tensor, [B, H, W, 3] on the device of ``x``; the leading axis is
+        squeezed away when B == 1, giving [H, W, 3].
+    """
+    target_device = x.device
+    feature_maps = x.cpu().numpy()
+
+    # Each element is colorized independently, then moved back to the
+    # original device before stacking.
+    colored = [
+        torch.from_numpy(colorize_feature_map_np(feature_map)).to(target_device)
+        for feature_map in feature_maps
+    ]
+    return torch.stack(colored).squeeze(0)
+
+
+def colorize_feature_map_np(x):
+    """
+    Project a feature map onto its first three principal components and
+    rescale the result to [0, 1] with a single global min/max.
+
+    Args:
+        x: np.ndarray, [H, W, D]
+    Returns:
+        np.ndarray, [H, W, 3]. If the projected features are constant (zero
+        value range) an all-zero map is returned instead of NaNs.
+    """
+    pca = PCA(n_components=3)
+    pca_features = pca.fit_transform(x.reshape(-1, x.shape[-1]))
+
+    feat_min = pca_features.min()
+    value_range = pca_features.max() - feat_min
+    if value_range == 0:
+        # Constant projection: min-max scaling would compute 0 / 0.
+        pca_features = np.zeros_like(pca_features)
+    else:
+        pca_features = (pca_features - feat_min) / value_range
+    pca_features = pca_features.reshape(x.shape[0], x.shape[1], 3)
+    return pca_features
diff --git a/extern/CUT3R/src/train.py b/extern/CUT3R/src/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..1252b78008487cd6a9bdc78d1ce2e02d005f6280
--- /dev/null
+++ b/extern/CUT3R/src/train.py
@@ -0,0 +1,916 @@
+# --------------------------------------------------------
+# training code for CUT3R
+# --------------------------------------------------------
+# References:
+# DUSt3R: https://github.com/naver/dust3r
+# --------------------------------------------------------
+import argparse
+import datetime
+import json
+import numpy as np
+import os
+import sys
+import time
+import math
+from collections import defaultdict
+from pathlib import Path
+from typing import Sized
+
+import torch
+import torch.backends.cudnn as cudnn
+import torch.nn.functional as F
+from torch.utils.tensorboard import SummaryWriter
+
+torch.backends.cuda.matmul.allow_tf32 = True  # for gpu >= Ampere and pytorch >= 1.12
+
+from dust3r.model import (
+    PreTrainedModel,
+    ARCroco3DStereo,
+    ARCroco3DStereoConfig,
+    inf,
+    strip_module,
+)  # noqa: F401, needed when loading the model
+from dust3r.datasets import get_data_loader
+from dust3r.losses import *  # noqa: F401, needed when loading the model
+from dust3r.inference import loss_of_one_batch, loss_of_one_batch_tbptt  # noqa
+from dust3r.viz import colorize
+from dust3r.utils.render import get_render_results
+import dust3r.utils.path_to_croco  # noqa: F401
+import croco.utils.misc as misc  # noqa
+from croco.utils.misc import NativeScalerWithGradNormCount as NativeScaler  # noqa
+
+import hydra
+from omegaconf import OmegaConf
+import logging
+import pathlib
+from tqdm import tqdm
+import random
+import builtins
+import shutil
+
+from accelerate import Accelerator
+from accelerate import DistributedDataParallelKwargs, InitProcessGroupKwargs
+from accelerate.logging import get_logger
+from datetime import timedelta
+import torch.multiprocessing
+
+torch.multiprocessing.set_sharing_strategy("file_system")
+
+printer = get_logger(__name__, log_level="DEBUG")
+
+
+def setup_for_distributed(accelerator: Accelerator):
+    """
+    Replace ``builtins.print`` with a timestamping wrapper that stays silent
+    on non-main processes.
+
+    A call is let through when the process is the main one, when the caller
+    passes ``force=True``, or when more than 8 processes are running.
+    """
+    original_print = builtins.print
+
+    def timestamped_print(*args, **kwargs):
+        # ``force`` is consumed here; it is not forwarded to the real print.
+        forced = kwargs.pop("force", False) or (accelerator.num_processes > 8)
+        if not (accelerator.is_main_process or forced):
+            return
+        stamp = datetime.datetime.now().time()
+        original_print(f"[{stamp}] ", end="")  # print with time stamp
+        original_print(*args, **kwargs)
+
+    builtins.print = timestamped_print
+
+
+def save_current_code(outdir):
+    """
+    Snapshot the current working directory into
+    ``<outdir>/code/<MM_DD-HH:MM:SS>`` and return that destination path.
+
+    Entries matching the ignore patterns below are left out of the copy.
+    """
+    skip = shutil.ignore_patterns(
+        ".vscode*",
+        "assets*",
+        "example*",
+        "checkpoints*",
+        "OLD*",
+        "logs*",
+        "out*",
+        "runs*",
+        "*.png",
+        "*.mp4",
+        "*__pycache__*",
+        "*.git*",
+        "*.idea*",
+        "*.zip",
+        "*.jpg",
+    )
+    # Timestamp of the snapshot, used as the sub-directory name.
+    stamp = datetime.datetime.now().strftime("%m_%d-%H:%M:%S")
+    snapshot_dir = os.path.join(outdir, "code", stamp)
+    shutil.copytree(".", snapshot_dir, ignore=skip, dirs_exist_ok=True)
+    return snapshot_dir
+
+
+def train(args):
+    """
+    Full training entry point: set up Accelerate, data, model, criteria and
+    optimizer, then alternate evaluation, checkpointing and training epochs.
+
+    ``args`` is the run configuration; the attributes read here include
+    ``output_dir``, ``resume``, ``seed``, ``train_dataset``, ``test_dataset``,
+    ``model``, ``train_criterion``, ``test_criterion``, ``pretrained``,
+    ``lr``, ``weight_decay``, ``epochs``, ``start_epoch``, ``save_freq``,
+    ``keep_freq`` and ``eval_freq``.
+
+    A final checkpoint is written through ``save_final_model`` once the epoch
+    loop finishes.
+    """
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.accum_iter,
+        mixed_precision="bf16",
+        kwargs_handlers=[
+            DistributedDataParallelKwargs(find_unused_parameters=True),
+            InitProcessGroupKwargs(timeout=timedelta(seconds=6000)),
+        ],
+    )
+    device = accelerator.device
+
+    setup_for_distributed(accelerator)
+
+    printer.info("output_dir: " + args.output_dir)
+    if args.output_dir:
+        Path(args.output_dir).mkdir(parents=True, exist_ok=True)
+
+    if accelerator.is_main_process:
+        dst_dir = save_current_code(outdir=args.output_dir)
+        printer.info(f"Saving current code to {dst_dir}")
+
+    # auto resume: pick up checkpoint-last.pth from the output dir if present
+    if not args.resume:
+        last_ckpt_fname = os.path.join(args.output_dir, "checkpoint-last.pth")
+        args.resume = last_ckpt_fname if os.path.isfile(last_ckpt_fname) else None
+
+    printer.info("job dir: {}".format(os.path.dirname(os.path.realpath(__file__))))
+
+    # fix the seed (offset by process index so each process gets its own stream)
+    seed = args.seed + accelerator.state.process_index
+    printer.info(
+        f"Setting seed to {seed} for process {accelerator.state.process_index}"
+    )
+    torch.manual_seed(seed)
+    np.random.seed(seed)
+    random.seed(seed)
+    cudnn.benchmark = args.benchmark
+
+    # training dataset and loader
+    printer.info("Building train dataset %s", args.train_dataset)
+    #  dataset and loader
+    data_loader_train = build_dataset(
+        args.train_dataset,
+        args.batch_size,
+        args.num_workers,
+        accelerator=accelerator,
+        test=False,
+        fixed_length=args.fixed_length
+    )
+    printer.info("Building test dataset %s", args.test_dataset)
+    # One test loader per "+"-separated dataset spec, keyed by the text before
+    # the first "(".
+    data_loader_test = {
+        dataset.split("(")[0]: build_dataset(
+            dataset,
+            args.batch_size,
+            args.num_workers,
+            accelerator=accelerator,
+            test=True,
+            fixed_length=True
+        )
+        for dataset in args.test_dataset.split("+")
+    }
+
+    # model
+    # NOTE: the model and criterion specs are Python expressions evaluated with
+    # eval(); only run this with trusted configuration.
+    printer.info("Loading model: %s", args.model)
+    model: PreTrainedModel = eval(args.model)
+    printer.info(f"All model parameters: {sum(p.numel() for p in model.parameters())}")
+    printer.info(
+        f"Encoder parameters: {sum(p.numel() for p in model.enc_blocks.parameters())}"
+    )
+    printer.info(
+        f"Decoder parameters: {sum(p.numel() for p in model.dec_blocks.parameters())}"
+    )
+
+    printer.info(f">> Creating train criterion = {args.train_criterion}")
+    train_criterion = eval(args.train_criterion).to(device)
+    printer.info(
+        f">> Creating test criterion = {args.test_criterion or args.train_criterion}"
+    )
+    # Fall back to the train criterion (as the log line above announces) when
+    # no dedicated test criterion is configured.
+    test_criterion = eval(args.test_criterion or args.train_criterion).to(device)
+
+    model.to(device)
+
+    if args.gradient_checkpointing:
+        model.gradient_checkpointing_enable()
+    if args.long_context:
+        model.fixed_input_length = False
+
+    if args.pretrained and not args.resume:
+        printer.info(f"Loading pretrained: {args.pretrained}")
+        ckpt = torch.load(args.pretrained, map_location=device)
+        load_only_encoder = getattr(args, "load_only_encoder", False)
+        if load_only_encoder:
+            # Keep only encoder blocks and the patch embedding.
+            filtered_state_dict = {
+                k: v
+                for k, v in ckpt["model"].items()
+                if "enc_blocks" in k or "patch_embed" in k
+            }
+            printer.info(
+                model.load_state_dict(strip_module(filtered_state_dict), strict=False)
+            )
+        else:
+            printer.info(
+                model.load_state_dict(strip_module(ckpt["model"]), strict=False)
+            )
+        del ckpt  # in case it occupies memory
+
+    # # following timm: set wd as 0 for bias and norm layers
+    param_groups = misc.get_parameter_groups(model, args.weight_decay)
+    optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95))
+    # print(optimizer)
+    loss_scaler = NativeScaler(accelerator=accelerator)
+
+    accelerator.even_batches = False
+    optimizer, model, data_loader_train = accelerator.prepare(
+        optimizer, model, data_loader_train
+    )
+
+    def write_log_stats(epoch, train_stats, test_stats):
+        """Append one JSON line of train/test stats to log.txt (main process only)."""
+        if accelerator.is_main_process:
+            # ``log_writer`` is bound further down, before the first call.
+            if log_writer is not None:
+                log_writer.flush()
+
+            log_stats = dict(
+                epoch=epoch, **{f"train_{k}": v for k, v in train_stats.items()}
+            )
+            for test_name in data_loader_test:
+                if test_name not in test_stats:
+                    continue
+                log_stats.update(
+                    {test_name + "_" + k: v for k, v in test_stats[test_name].items()}
+                )
+
+            with open(
+                os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8"
+            ) as f:
+                f.write(json.dumps(log_stats) + "\n")
+
+    def save_model(epoch, fname, best_so_far):
+        """Delegate checkpoint writing to ``misc.save_model`` with the current state."""
+        misc.save_model(
+            accelerator=accelerator,
+            args=args,
+            model_without_ddp=model,
+            optimizer=optimizer,
+            loss_scaler=loss_scaler,
+            epoch=epoch,
+            fname=fname,
+            best_so_far=best_so_far,
+        )
+
+    best_so_far = misc.load_model(
+        args=args, model_without_ddp=model, optimizer=optimizer, loss_scaler=loss_scaler
+    )
+    if best_so_far is None:
+        best_so_far = float("inf")
+    log_writer = (
+        SummaryWriter(log_dir=args.output_dir) if accelerator.is_main_process else None
+    )
+
+    printer.info(f"Start training for {args.epochs} epochs")
+    start_time = time.time()
+    train_stats = test_stats = {}
+
+    # The range runs one past ``args.epochs`` so that the last trained epoch is
+    # still saved/evaluated/logged before the ``break`` below.
+    for epoch in range(args.start_epoch, args.epochs + 1):
+
+        # Save immediately the last checkpoint
+        if epoch > args.start_epoch:
+            if (
+                args.save_freq
+                and np.allclose(epoch / args.save_freq, int(epoch / args.save_freq))
+                or epoch == args.epochs
+            ):
+                save_model(epoch - 1, "last", best_so_far)
+
+        # Test on multiple datasets
+        new_best = False
+        if epoch > 0 and args.eval_freq > 0 and epoch % args.eval_freq == 0:
+            test_stats = {}
+            for test_name, testset in data_loader_test.items():
+                stats = test_one_epoch(
+                    model,
+                    test_criterion,
+                    testset,
+                    accelerator,
+                    device,
+                    epoch,
+                    log_writer=log_writer,
+                    args=args,
+                    prefix=test_name,
+                )
+                test_stats[test_name] = stats
+
+                # Save best of all
+                if stats["loss_med"] < best_so_far:
+                    best_so_far = stats["loss_med"]
+                    new_best = True
+        # Save more stuff
+        write_log_stats(epoch, train_stats, test_stats)
+
+        if epoch > args.start_epoch:
+            if args.keep_freq and epoch % args.keep_freq == 0:
+                save_model(epoch - 1, str(epoch), best_so_far)
+            if new_best:
+                save_model(epoch - 1, "best", best_so_far)
+        if epoch >= args.epochs:
+            break  # exit after writing last test to disk
+
+        # Train
+        train_stats = train_one_epoch(
+            model,
+            train_criterion,
+            data_loader_train,
+            optimizer,
+            accelerator,
+            epoch,
+            loss_scaler,
+            log_writer=log_writer,
+            args=args,
+        )
+
+    total_time = time.time() - start_time
+    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+    printer.info("Training time {}".format(total_time_str))
+
+    save_final_model(accelerator, args, args.epochs, model, best_so_far=best_so_far)
+
+
+def save_final_model(accelerator, args, epoch, model_without_ddp, best_so_far=None):
+    """
+    Write ``checkpoint-final.pth`` into ``args.output_dir``.
+
+    ``model_without_ddp`` may already be a state dict; otherwise the model is
+    moved to CPU and its ``state_dict()`` is stored. ``best_so_far`` is only
+    included when it is not None. Saving goes through ``misc.save_on_master``.
+    """
+    checkpoint_path = Path(args.output_dir) / "checkpoint-final.pth"
+
+    if isinstance(model_without_ddp, dict):
+        weights = model_without_ddp
+    else:
+        weights = model_without_ddp.cpu().state_dict()
+
+    payload = {"args": args, "model": weights, "epoch": epoch}
+    if best_so_far is not None:
+        payload["best_so_far"] = best_so_far
+
+    printer.info(f">> Saving model to {checkpoint_path} ...")
+    misc.save_on_master(accelerator, payload, checkpoint_path)
+
+
+def build_dataset(dataset, batch_size, num_workers, accelerator, test=False, fixed_length=False):
+    """
+    Create a data loader for ``dataset`` through ``get_data_loader``.
+
+    Training loaders (``test=False``) shuffle and drop the last incomplete
+    batch; test loaders do neither. Memory pinning is always enabled.
+    """
+    split_name = ("Train", "Test")[test]
+    printer.info(f"Building {split_name} Data loader for dataset: {dataset}")
+    is_train = not test
+    return get_data_loader(
+        dataset,
+        batch_size=batch_size,
+        num_workers=num_workers,
+        pin_mem=True,
+        shuffle=is_train,
+        drop_last=is_train,
+        accelerator=accelerator,
+        fixed_length=fixed_length,
+    )
+
+
+def train_one_epoch(
+    model: torch.nn.Module,
+    criterion: torch.nn.Module,
+    data_loader: Sized,
+    optimizer: torch.optim.Optimizer,
+    accelerator: Accelerator,
+    epoch: int,
+    loss_scaler,
+    args,
+    log_writer=None,
+):
+    """
+    Train ``model`` for one epoch and return the averaged metrics.
+
+    For every batch the learning rate is adjusted per iteration, the loss is
+    computed (``loss_of_one_batch``, or ``loss_of_one_batch_tbptt`` when
+    ``args.long_context`` is set), and an optimizer step is taken through
+    ``loss_scaler`` unless the loss helper reports ``already_backprop``.
+    Scalars and, every ``args.print_img_freq`` steps, depth visualisations
+    are written to ``log_writer`` when one is given. A "last" checkpoint is
+    additionally written every ``args.save_freq`` fraction of the epoch.
+
+    The process exits with status 1 if the loss becomes non-finite.
+
+    :return: dict mapping each meter name to its ``global_avg``
+    """
+    assert torch.backends.cuda.matmul.allow_tf32 == True
+
+    model.train(True)
+    metric_logger = misc.MetricLogger(delimiter="  ")
+    metric_logger.add_meter("lr", misc.SmoothedValue(window_size=1, fmt="{value:.6f}"))
+    header = "Epoch: [{}]".format(epoch)
+    accum_iter = args.accum_iter
+
+    def save_model(epoch, fname, best_so_far):
+        """Delegate checkpoint writing to ``misc.save_model`` with the current state."""
+        misc.save_model(
+            accelerator=accelerator,
+            args=args,
+            model_without_ddp=model,
+            optimizer=optimizer,
+            loss_scaler=loss_scaler,
+            epoch=epoch,
+            fname=fname,
+            best_so_far=best_so_far,
+        )
+
+    if log_writer is not None:
+        printer.info("log_dir: {}".format(log_writer.log_dir))
+
+    # Propagate the epoch to the dataset / nested batch sampler when they
+    # expose ``set_epoch``.
+    if hasattr(data_loader, "dataset") and hasattr(data_loader.dataset, "set_epoch"):
+        data_loader.dataset.set_epoch(epoch)
+    if (
+        hasattr(data_loader, "batch_sampler")
+        and hasattr(data_loader.batch_sampler, "batch_sampler")
+        and hasattr(data_loader.batch_sampler.batch_sampler, "set_epoch")
+    ):
+        data_loader.batch_sampler.batch_sampler.set_epoch(epoch)
+
+    optimizer.zero_grad()
+
+    for data_iter_step, batch in enumerate(
+        metric_logger.log_every(data_loader, args.print_freq, accelerator, header)
+    ):
+        with accelerator.accumulate(model):
+            # Fractional epoch and the derived global step index.
+            epoch_f = epoch + data_iter_step / len(data_loader)
+            step = int(epoch_f * len(data_loader))
+            # we use a per iteration (instead of per epoch) lr scheduler
+            if data_iter_step % accum_iter == 0:
+                misc.adjust_learning_rate(optimizer, epoch_f, args)
+            if not args.long_context:
+                result = loss_of_one_batch(
+                    batch,
+                    model,
+                    criterion,
+                    accelerator,
+                    symmetrize_batch=False,
+                    use_amp=bool(args.amp),
+                )
+            else:
+                result = loss_of_one_batch_tbptt(
+                    batch,
+                    model,
+                    criterion,
+                    chunk_size=4,
+                    loss_scaler=loss_scaler,
+                    optimizer=optimizer,
+                    accelerator=accelerator,
+                    symmetrize_batch=False,
+                    use_amp=bool(args.amp),
+                )
+            loss, loss_details = result["loss"]  # criterion returns two values
+
+            loss_value = float(loss)
+
+            if not math.isfinite(loss_value):
+                print(
+                    f"Loss is {loss_value}, stopping training, loss details: {loss_details}"
+                )
+                sys.exit(1)
+            # The TBPTT helper may already have run backward/step itself.
+            if not result.get("already_backprop", False):
+                loss_scaler(
+                    loss,
+                    optimizer,
+                    parameters=model.parameters(),
+                    update_grad=True,
+                    clip_grad=1.0,
+                )
+                optimizer.zero_grad()
+
+            is_metric = batch[0]["is_metric"]
+            curr_num_view = len(batch)
+
+            del loss
+            # Images are logged only on accumulation boundaries, every
+            # ``print_img_freq`` global steps; otherwise the batch is released.
+            tb_vis_img = (data_iter_step + 1) % accum_iter == 0 and (
+                (step + 1) % (args.print_img_freq)
+            ) == 0
+            if not tb_vis_img:
+                del batch
+            else:
+                torch.cuda.empty_cache()
+
+            lr = optimizer.param_groups[0]["lr"]
+            metric_logger.update(epoch=epoch_f)
+            metric_logger.update(lr=lr)
+            metric_logger.update(step=step)
+
+            metric_logger.update(loss=loss_value, **loss_details)
+
+            if (data_iter_step + 1) % accum_iter == 0 and (
+                (data_iter_step + 1) % (accum_iter * args.print_freq)
+            ) == 0:
+                loss_value_reduce = accelerator.gather(
+                    torch.tensor(loss_value).to(accelerator.device)
+                ).mean()  # MUST BE EXECUTED BY ALL NODES
+
+                # NOTE(review): this ``continue`` (and the one below) also
+                # skips the mid-epoch checkpoint check at the end of the loop
+                # body on processes without a log writer — confirm intended.
+                if log_writer is None:
+                    continue
+                """ We use epoch_1000x as the x-axis in tensorboard.
+                This calibrates different curves when batch size changes.
+                """
+                epoch_1000x = int(epoch_f * 1000)
+                log_writer.add_scalar("train_loss", loss_value_reduce, step)
+                log_writer.add_scalar("train_lr", lr, step)
+                log_writer.add_scalar("train_iter", epoch_1000x, step)
+                # Only scalar entries are logged; tensors with dimensions and
+                # nested dicts are skipped.
+                for name, val in loss_details.items():
+                    if isinstance(val, torch.Tensor):
+                        if val.ndim > 0:
+                            continue
+                    if isinstance(val, dict):
+                        continue
+                    log_writer.add_scalar("train_" + name, val, step)
+
+            if tb_vis_img:
+                if log_writer is None:
+                    continue
+                with torch.no_grad():
+                    depths_self, gt_depths_self = get_render_results(
+                        batch, result["pred"], self_view=True
+                    )
+                    depths_cross, gt_depths_cross = get_render_results(
+                        batch, result["pred"], self_view=False
+                    )
+                    # Attach the rendered depths to ``loss_details`` so the
+                    # visualisation helper can pick them up by key.
+                    for k in range(len(batch)):
+                        loss_details[f"self_pred_depth_{k+1}"] = (
+                            depths_self[k].detach().cpu()
+                        )
+                        loss_details[f"self_gt_depth_{k+1}"] = (
+                            gt_depths_self[k].detach().cpu()
+                        )
+                        loss_details[f"pred_depth_{k+1}"] = (
+                            depths_cross[k].detach().cpu()
+                        )
+                        loss_details[f"gt_depth_{k+1}"] = (
+                            gt_depths_cross[k].detach().cpu()
+                        )
+
+                imgs_stacked_dict = get_vis_imgs_new(
+                    loss_details, args.num_imgs_vis, curr_num_view, is_metric=is_metric
+                )
+                for name, imgs_stacked in imgs_stacked_dict.items():
+                    log_writer.add_images(
+                        "train" + "/" + name, imgs_stacked, step, dataformats="HWC"
+                    )
+                del batch
+
+        # Mid-epoch checkpointing: ``args.save_freq`` is used here as a
+        # fraction of an epoch. The interval is guarded so that a zero/falsy
+        # ``save_freq`` or a very short loader (interval truncating to 0) does
+        # not raise ZeroDivisionError in the modulo below.
+        save_interval = int(args.save_freq * len(data_loader)) if args.save_freq else 0
+        if (
+            save_interval > 0
+            and data_iter_step % save_interval == 0
+            and data_iter_step != 0
+            and data_iter_step != len(data_loader) - 1
+        ):
+            print("saving at step", data_iter_step)
+            save_model(epoch - 1, "last", float("inf"))
+
+    # gather the stats from all processes
+    metric_logger.synchronize_between_processes(accelerator)
+    printer.info("Averaged stats: %s", metric_logger)
+    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
+
+
+@torch.no_grad()
+def test_one_epoch(
+    model: torch.nn.Module,
+    criterion: torch.nn.Module,
+    data_loader: Sized,
+    accelerator: Accelerator,
+    device: torch.device,
+    epoch: int,
+    args,
+    log_writer=None,
+    prefix="test",
+):
+    """
+    Evaluate ``model`` on ``data_loader`` and return aggregated metrics.
+
+    Every batch is scored with ``loss_of_one_batch``; each meter is then
+    reported twice, as ``<name>_avg`` (``global_avg``) and ``<name>_med``
+    (``median``). When ``log_writer`` is given, scalar results and depth
+    visualisations of the LAST batch are logged at x-position ``1000 * epoch``
+    under ``prefix``.
+
+    NOTE(review): ``batch``, ``result``, ``loss_details`` and ``loss_value``
+    are taken from the last loop iteration; with an empty ``data_loader`` the
+    code after the loop raises NameError.
+
+    :return: dict of ``{"<meter>_avg": ..., "<meter>_med": ...}``
+    """
+
+    model.eval()
+    metric_logger = misc.MetricLogger(delimiter="  ")
+    # Very large window — presumably so the median covers every batch; confirm
+    # against ``misc.SmoothedValue``.
+    metric_logger.meters = defaultdict(lambda: misc.SmoothedValue(window_size=9**9))
+    header = "Test Epoch: [{}]".format(epoch)
+
+    if log_writer is not None:
+        printer.info("log_dir: {}".format(log_writer.log_dir))
+
+    # The epoch is always reset to 0 here (unlike training, which passes the
+    # real epoch) — presumably to keep the evaluation sampling fixed; confirm.
+    if hasattr(data_loader, "dataset") and hasattr(data_loader.dataset, "set_epoch"):
+        data_loader.dataset.set_epoch(0)
+    if (
+        hasattr(data_loader, "batch_sampler")
+        and hasattr(data_loader.batch_sampler, "batch_sampler")
+        and hasattr(data_loader.batch_sampler.batch_sampler, "set_epoch")
+    ):
+        data_loader.batch_sampler.batch_sampler.set_epoch(0)
+
+    for _, batch in enumerate(
+        metric_logger.log_every(data_loader, args.print_freq, accelerator, header)
+    ):
+        result = loss_of_one_batch(
+            batch,
+            model,
+            criterion,
+            accelerator,
+            symmetrize_batch=False,
+            use_amp=bool(args.amp),
+        )
+
+        loss_value, loss_details = result["loss"]  # criterion returns two values
+        metric_logger.update(loss=float(loss_value), **loss_details)
+
+    printer.info("Averaged stats: %s", metric_logger)
+
+    # (suffix, SmoothedValue attribute) pairs reported for every meter.
+    aggs = [("avg", "global_avg"), ("med", "median")]
+    results = {
+        f"{k}_{tag}": getattr(meter, attr)
+        for k, meter in metric_logger.meters.items()
+        for tag, attr in aggs
+    }
+
+    if log_writer is not None:
+        # Only scalar entries are logged; tensors with dimensions and dicts
+        # are skipped.
+        for name, val in results.items():
+            if isinstance(val, torch.Tensor):
+                if val.ndim > 0:
+                    continue
+            if isinstance(val, dict):
+                continue
+            log_writer.add_scalar(prefix + "_" + name, val, 1000 * epoch)
+
+        # Visualise the last batch only.
+        depths_self, gt_depths_self = get_render_results(
+            batch, result["pred"], self_view=True
+        )
+        depths_cross, gt_depths_cross = get_render_results(
+            batch, result["pred"], self_view=False
+        )
+        for k in range(len(batch)):
+            loss_details[f"self_pred_depth_{k+1}"] = depths_self[k].detach().cpu()
+            loss_details[f"self_gt_depth_{k+1}"] = gt_depths_self[k].detach().cpu()
+            loss_details[f"pred_depth_{k+1}"] = depths_cross[k].detach().cpu()
+            loss_details[f"gt_depth_{k+1}"] = gt_depths_cross[k].detach().cpu()
+
+        # NOTE(review): the view count comes from ``args.num_test_views`` here,
+        # whereas training uses ``len(batch)`` — confirm they always agree.
+        imgs_stacked_dict = get_vis_imgs_new(
+            loss_details,
+            args.num_imgs_vis,
+            args.num_test_views,
+            is_metric=batch[0]["is_metric"],
+        )
+        for name, imgs_stacked in imgs_stacked_dict.items():
+            log_writer.add_images(
+                prefix + "/" + name, imgs_stacked, 1000 * epoch, dataformats="HWC"
+            )
+
+    # Drop references to the last batch before emptying the CUDA cache.
+    del loss_details, loss_value, batch
+    torch.cuda.empty_cache()
+
+    return results
+
+
+def batch_append(original_list, new_list):
+    """
+    Append the i-th item of ``new_list`` to the i-th sub-list of
+    ``original_list`` (in place) and return ``original_list``.
+
+    Pairing stops at the shorter of the two inputs.
+    """
+    paired = zip(original_list, new_list)
+    for bucket, item in paired:
+        bucket.append(item)
+    return original_list
+
+
+def gen_mask_indicator(img_mask_list, ray_mask_list, num_views, h, w):
+    """Render one gray-level indicator strip per sample describing its view inputs.
+
+    Each strip is an ``(h, w * num_views, 3)`` tensor made of ``num_views``
+    blocks of width ``w``. Block ``i`` is filled with
+
+    * ``0``   when only the image flag of view ``i`` is set,
+    * ``1``   when only the ray flag of view ``i`` is set,
+    * ``0.5`` otherwise (both set or both unset).
+
+    Returns:
+        list: One strip per ``(img_mask, ray_mask)`` pair.
+    """
+
+    def _level(has_img, has_ray):
+        if has_img and not has_ray:
+            return 0
+        if not has_img and has_ray:
+            return 1
+        return 0.5
+
+    strips = []
+    for img_mask, ray_mask in zip(img_mask_list, ray_mask_list):
+        strip = torch.zeros((h, w * num_views, 3))
+        for view in range(num_views):
+            left = view * w
+            strip[:, left : left + w] += _level(img_mask[view], ray_mask[view])
+        strips.append(strip)
+    return strips
+
+
+def _colorize_depth_pair(gt_depths, pred_depths, is_metric):
+    """Colorize a ground-truth/predicted depth pair using 1%-99% quantile ranges.
+
+    When ``is_metric`` is truthy both maps share a single range (the union of
+    the two quantile ranges); otherwise each map is colorized with its own
+    quantile range.
+
+    Returns:
+        tuple: ``(gt_vis, pred_vis)`` as returned by ``colorize`` with
+        ``append_cbar=True``.
+    """
+    gt_min = torch.quantile(gt_depths, 0.01).item()
+    gt_max = torch.quantile(gt_depths, 0.99).item()
+    pred_min = torch.quantile(pred_depths, 0.01).item()
+    pred_max = torch.quantile(pred_depths, 0.99).item()
+    if is_metric:
+        gt_range = pred_range = (min(gt_min, pred_min), max(gt_max, pred_max))
+    else:
+        gt_range, pred_range = (gt_min, gt_max), (pred_min, pred_max)
+    gt_vis = colorize(gt_depths, range=gt_range, append_cbar=True)
+    pred_vis = colorize(pred_depths, range=pred_range, append_cbar=True)
+    return gt_vis, pred_vis
+
+
+def vis_and_cat(
+    gt_imgs,
+    pred_imgs,
+    cross_gt_depths,
+    cross_pred_depths,
+    self_gt_depths,
+    self_pred_depths,
+    cross_conf,
+    self_conf,
+    ray_indicator,
+    is_metric,
+):
+    """Stack all visualizations of one sample vertically into a single image.
+
+    Rows, top to bottom: ray indicator, GT images, predicted images, self-view
+    GT depth, self-view predicted depth, self-view confidence, cross-view GT
+    depth, cross-view predicted depth, cross-view confidence.
+
+    ``cross_conf`` / ``self_conf`` may be empty (``len(...) == 0``); the
+    corresponding row is then omitted. Previously an empty confidence input
+    raised ``UnboundLocalError`` because the row was referenced unconditionally.
+
+    Args:
+        gt_imgs, pred_imgs: Images of all views concatenated along the width
+            (assumed ``[H, V * W, 3]`` -- TODO confirm against caller).
+        cross_gt_depths, cross_pred_depths: Cross-view depth maps.
+        self_gt_depths, self_pred_depths: Self-view depth maps.
+        cross_conf, self_conf: Confidence maps, or an empty sequence.
+        ray_indicator: Indicator strip (see ``gen_mask_indicator``).
+        is_metric: Truthy when GT and prediction should share one depth range.
+
+    Returns:
+        torch.Tensor: All rows concatenated along dim 0.
+    """
+    cross_gt_depths_vis, cross_pred_depths_vis = _colorize_depth_pair(
+        cross_gt_depths, cross_pred_depths, is_metric
+    )
+    self_gt_depths_vis, self_pred_depths_vis = _colorize_depth_pair(
+        self_gt_depths, self_pred_depths, is_metric
+    )
+
+    # Confidence maps are optional; keep ``None`` so the row can be skipped.
+    cross_conf_vis = (
+        colorize(cross_conf, append_cbar=True) if len(cross_conf) > 0 else None
+    )
+    self_conf_vis = (
+        colorize(self_conf, append_cbar=True) if len(self_conf) > 0 else None
+    )
+
+    # Paste the RGB strips into the top-left corner of a canvas that has the
+    # same size as the colorized depth rows so every row shares one width.
+    gt_imgs_vis = torch.zeros_like(cross_gt_depths_vis)
+    gt_imgs_vis[: gt_imgs.shape[0], : gt_imgs.shape[1]] = gt_imgs
+    pred_imgs_vis = torch.zeros_like(cross_gt_depths_vis)
+    pred_imgs_vis[: pred_imgs.shape[0], : pred_imgs.shape[1]] = pred_imgs
+
+    # Zero-pad the indicator strip on the right up to the common row width.
+    ray_indicator_vis = torch.cat(
+        [
+            ray_indicator,
+            torch.zeros(
+                ray_indicator.shape[0],
+                cross_pred_depths_vis.shape[1] - ray_indicator.shape[1],
+                3,
+            ),
+        ],
+        dim=1,
+    )
+
+    rows = [
+        ray_indicator_vis,
+        gt_imgs_vis,
+        pred_imgs_vis,
+        self_gt_depths_vis,
+        self_pred_depths_vis,
+    ]
+    if self_conf_vis is not None:
+        rows.append(self_conf_vis)
+    rows.extend([cross_gt_depths_vis, cross_pred_depths_vis])
+    if cross_conf_vis is not None:
+        rows.append(cross_conf_vis)
+    return torch.cat(rows, dim=0)
+
+
+def get_vis_imgs_new(loss_details, num_imgs_vis, num_views, is_metric):
+    """Build one stacked visualization image per sample from logged tensors.
+
+    For every visualized view ``v`` (1-based) the following keys are read from
+    ``loss_details``: ``gt_img{v}``, ``pred_rgb_{v}``, ``pred_depth_{v}``,
+    ``gt_depth_{v}``, ``self_gt_depth_{v}``, ``self_pred_depth_{v}``,
+    ``img_mask_{v}``, ``ray_mask_{v}`` and, when present, ``conf_{v}`` /
+    ``self_conf_{v}``. Only the first ``num_imgs_vis`` samples of each tensor
+    are used.
+
+    Confidence maps are optional: when no ``conf_*`` (or ``self_conf_*``) key
+    exists, an empty list is forwarded to ``vis_and_cat`` instead of indexing
+    an empty list, which previously raised ``IndexError``.
+
+    Args:
+        loss_details (dict): Logged tensors, keyed as described above.
+        num_imgs_vis (int): Number of samples to visualize.
+        num_views (int): Number of views per sample.
+        is_metric: Indexable per-sample flags forwarded to ``vis_and_cat``.
+
+    Returns:
+        dict: ``{"imgs_<i>": stacked image}`` for ``i`` in ``range(num_imgs_vis)``.
+    """
+
+    def _buckets():
+        # One independent list per visualized sample.
+        return [[] for _ in range(num_imgs_vis)]
+
+    def _head(key):
+        # First ``num_imgs_vis`` samples of a logged tensor, detached on CPU.
+        return loss_details[key][:num_imgs_vis].detach().cpu()
+
+    def _cat_views(buckets):
+        # Concatenate the per-view tensors of every sample along dim 1.
+        return [torch.cat(per_view, dim=1) for per_view in buckets]
+
+    gt_img_list, pred_img_list = _buckets(), _buckets()
+    cross_gt_depth_list, cross_pred_depth_list = _buckets(), _buckets()
+    self_gt_depth_list, self_pred_depth_list = _buckets(), _buckets()
+    cross_view_conf_list, self_view_conf_list = _buckets(), _buckets()
+    img_mask_list, ray_mask_list = _buckets(), _buckets()
+    cross_view_conf_exists = False
+    self_view_conf_exists = False
+
+    # Visualize only every ``stride``-th view for long sequences (presumably
+    # to bound the width of the resulting figure).
+    if num_views > 30:
+        stride = 5
+    elif num_views > 20:
+        stride = 3
+    elif num_views > 10:
+        stride = 2
+    else:
+        stride = 1
+
+    for i in range(0, num_views, stride):
+        view = i + 1  # keys in ``loss_details`` are 1-based
+
+        # 0.5 * (x + 1) maps values from [-1, 1] to [0, 1].
+        gt_imgs = 0.5 * (_head(f"gt_img{view}") + 1)
+        width = gt_imgs.shape[2]
+        pred_imgs = 0.5 * (_head(f"pred_rgb_{view}") + 1)
+        batch_append(gt_img_list, gt_imgs.unbind(dim=0))
+        batch_append(pred_img_list, pred_imgs.unbind(dim=0))
+
+        batch_append(
+            cross_pred_depth_list, _head(f"pred_depth_{view}").unbind(dim=0)
+        )
+        batch_append(cross_gt_depth_list, _head(f"gt_depth_{view}").unbind(dim=0))
+        batch_append(
+            self_gt_depth_list, _head(f"self_gt_depth_{view}").unbind(dim=0)
+        )
+        batch_append(
+            self_pred_depth_list, _head(f"self_pred_depth_{view}").unbind(dim=0)
+        )
+
+        if f"conf_{view}" in loss_details:
+            batch_append(cross_view_conf_list, _head(f"conf_{view}").unbind(dim=0))
+            cross_view_conf_exists = True
+
+        if f"self_conf_{view}" in loss_details:
+            batch_append(
+                self_view_conf_list, _head(f"self_conf_{view}").unbind(dim=0)
+            )
+            self_view_conf_exists = True
+
+        batch_append(img_mask_list, _head(f"img_mask_{view}").unbind(dim=0))
+        batch_append(ray_mask_list, _head(f"ray_mask_{view}").unbind(dim=0))
+
+    # Each element is [H, num_views * W, (3)]; each list has num_imgs_vis elements.
+    gt_img_list = _cat_views(gt_img_list)
+    pred_img_list = _cat_views(pred_img_list)
+    cross_pred_depth_list = _cat_views(cross_pred_depth_list)
+    cross_gt_depth_list = _cat_views(cross_gt_depth_list)
+    self_gt_depth_list = _cat_views(self_gt_depth_list)
+    self_pred_depth_list = _cat_views(self_pred_depth_list)
+    cross_view_conf_list = (
+        _cat_views(cross_view_conf_list) if cross_view_conf_exists else None
+    )
+    self_view_conf_list = (
+        _cat_views(self_view_conf_list) if self_view_conf_exists else None
+    )
+
+    # Each element is [num_views,]; each list has num_imgs_vis elements.
+    img_mask_list = [torch.stack(flags, dim=0) for flags in img_mask_list]
+    ray_mask_list = [torch.stack(flags, dim=0) for flags in ray_mask_list]
+
+    # The indicator strip is 30 pixels high and one ``width`` block per view.
+    ray_indicator = gen_mask_indicator(
+        img_mask_list, ray_mask_list, len(img_mask_list[0]), 30, width
+    )
+
+    ret_dict = {}
+    for i in range(num_imgs_vis):
+        ret_dict[f"imgs_{i}"] = vis_and_cat(
+            gt_img_list[i],
+            pred_img_list[i],
+            cross_gt_depth_list[i],
+            cross_pred_depth_list[i],
+            self_gt_depth_list[i],
+            self_pred_depth_list[i],
+            # Forward an empty list when no confidence maps were logged.
+            cross_view_conf_list[i] if cross_view_conf_list is not None else [],
+            self_view_conf_list[i] if self_view_conf_list is not None else [],
+            ray_indicator[i],
+            is_metric[i],
+        )
+    return ret_dict
+
+
+@hydra.main(
+    version_base=None,
+    config_path=str(os.path.dirname(os.path.abspath(__file__))) + "/../config",
+    config_name="train.yaml",
+)
+def run(cfg: OmegaConf):
+    """Hydra entry point: resolve the config, create the log directory, train.
+
+    The configuration is loaded from ``train.yaml`` in the ``config`` directory
+    that sits next to this file's parent directory.
+    """
+    # Resolve interpolations in place before the config is used.
+    OmegaConf.resolve(cfg)
+    # Make sure the log directory exists before training starts.
+    pathlib.Path(cfg.logdir).mkdir(parents=True, exist_ok=True)
+    train(cfg)
+
+
+# Script entry point. ``run`` is wrapped by ``hydra.main``, which supplies ``cfg``,
+# so it is called without arguments here.
+if __name__ == "__main__":
+    run()
diff --git a/extern/CUT3R/surfel_inference.py b/extern/CUT3R/surfel_inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fafd6cfb210e6efa42f153f6dfd6bb3ba0091de
--- /dev/null
+++ b/extern/CUT3R/surfel_inference.py
@@ -0,0 +1,530 @@
+#!/usr/bin/env python3
+"""
+3D Point Cloud Inference and Visualization Script
+
+This script performs inference using the ARCroco3DStereo model and visualizes the
+resulting 3D point clouds with the PointCloudViewer. Use the command-line arguments
+to adjust parameters such as the model checkpoint path, image sequence directory,
+image size, device, etc.
+
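+NOTE: the usage below refers to ``demo_ga.py``. This file has no ``__main__``
+block, so import it and call ``run_inference_from_pil`` rather than running it
+as a script.
+
+Usage: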
+    python demo_ga.py [--model_path MODEL_PATH] [--seq_path SEQ_PATH] [--size IMG_SIZE]
+                            [--device DEVICE] [--vis_threshold VIS_THRESHOLD] [--output_dir OUT_DIR]
+
+Example:
+    python demo_ga.py --model_path src/cut3r_512_dpt_4_64.pth \
+        --seq_path examples/001 --device cuda --size 512
+"""
+
+import os
+import numpy as np
+import torch
+import time
+import glob
+import random
+import cv2
+import argparse
+import tempfile
+import shutil
+from copy import deepcopy
+from add_ckpt_path import add_path_to_dust3r
+import imageio.v2 as iio
+
+# Seed Python's built-in ``random`` module at import time for reproducibility.
+# NOTE(review): only ``random`` is seeded here; NumPy and torch are not.
+random.seed(42)
+
+
+
+def listify(elems):
+    """Flatten one nesting level: chain the items of every element of ``elems``."""
+    flat = []
+    for group in elems:
+        flat.extend(group)
+    return flat
+
+
+def collate_with_cat(whatever, lists=False):
+    """Recursively collate nested containers, concatenating tensors/arrays.
+
+    * ``dict``: every value is collated recursively.
+    * ``tuple`` / ``list``: dispatch on the type of the first element --
+      ``None`` gives ``None``; scalars and strings return the sequence as is;
+      tuples are transposed with ``zip`` and collated per position; dicts are
+      collated per key; tensors are concatenated with ``torch.cat`` (NumPy
+      arrays are converted to tensors first) or flattened with ``listify``
+      when ``lists`` is true; anything else is chained with ``sum``.
+    * any other input: ``None`` is returned.
+    """
+    if isinstance(whatever, dict):
+        return {
+            key: collate_with_cat(val, lists=lists) for key, val in whatever.items()
+        }
+
+    if not isinstance(whatever, (tuple, list)):
+        # Inputs that are neither dict, tuple nor list are not collated.
+        return None
+
+    if len(whatever) == 0:
+        return whatever
+
+    head = whatever[0]
+    container = type(whatever)
+
+    if head is None:
+        return None
+    if isinstance(head, (bool, float, int, str)):
+        return whatever
+    if isinstance(head, tuple):
+        return container(
+            collate_with_cat(column, lists=lists) for column in zip(*whatever)
+        )
+    if isinstance(head, dict):
+        collated = {}
+        for key in head:
+            collated[key] = collate_with_cat(
+                [item[key] for item in whatever], lists=lists
+            )
+        return collated
+
+    if isinstance(head, torch.Tensor):
+        if lists:
+            return listify(whatever)
+        return torch.cat(whatever)
+    if isinstance(head, np.ndarray):
+        if lists:
+            return listify(whatever)
+        return torch.cat([torch.from_numpy(arr) for arr in whatever])
+
+    # Fallback: the elements are sequences of the container's type; chain them.
+    return sum(whatever, container())
+
+
+def parse_args():
+    """Parse command-line arguments.
+
+    Returns:
+        argparse.Namespace: With attributes ``model_path``, ``seq_path``,
+        ``device``, ``size`` (int), ``vis_threshold`` (float) and ``output_dir``.
+    """
+    parser = argparse.ArgumentParser(
+        description="Run 3D point cloud inference and visualization using ARCroco3DStereo."
+    )
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        default="src/cut3r_512_dpt_4_64.pth",
+        help="Path to the pretrained model checkpoint.",
+    )
+    parser.add_argument(
+        "--seq_path",
+        type=str,
+        default="",
+        help="Path to the directory containing the image sequence.",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda",
+        help="Device to run inference on (e.g., 'cuda' or 'cpu').",
+    )
+    parser.add_argument(
+        "--size",
+        type=int,
+        # Use an int default so it matches ``type=int`` without relying on
+        # argparse converting string defaults.
+        default=512,
+        help="Shape that input images will be rescaled to; if using 224+linear model, choose 224 otherwise 512",
+    )
+    parser.add_argument(
+        "--vis_threshold",
+        type=float,
+        default=1.5,
+        help="Visualization threshold for the point cloud viewer. Ranging from 1 to INF",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="./demo_tmp",
+        help="value for tempfile.tempdir",
+    )
+
+    return parser.parse_args()
+
+
+def prepare_input(
+    img_paths, img_mask, size, raymaps=None, raymap_mask=None, revisit=1, update=True
+):
+    """
+    Build the list of view dictionaries for inference from image file paths.
+
+    Every view marks its image as valid (``img_mask`` True), carries a
+    NaN-filled 6-channel ray map flagged as invalid (``ray_mask`` False), an
+    identity camera pose, ``update`` True and ``reset`` False.
+
+    Args:
+        img_paths (list): List of image file paths.
+        img_mask: Accepted but not used by this function.
+        size (int): Target image size passed to ``load_images``.
+        raymaps, raymap_mask, revisit, update: Accepted but not used by this
+            function.
+
+    Returns:
+        list: A list of view dictionaries, one per loaded image.
+    """
+    # Import image loader (delayed import needed after adding ckpt path).
+    from src.dust3r.utils.image import load_images
+
+    def _flag(value):
+        # Boolean flag as a 1-element tensor.
+        return torch.tensor(value).unsqueeze(0)
+
+    images = load_images(img_paths, size=size)
+
+    views = []
+    for idx, loaded in enumerate(images):
+        img = loaded["img"]
+        # Placeholder ray map: same leading and spatial dims as the image.
+        ray_map = torch.full(
+            (img.shape[0], 6, img.shape[-2], img.shape[-1]),
+            torch.nan,
+        )
+        identity_pose = torch.from_numpy(np.eye(4).astype(np.float32)).unsqueeze(0)
+        views.append(
+            {
+                "img": img,
+                "ray_map": ray_map,
+                "true_shape": torch.from_numpy(loaded["true_shape"]),
+                "idx": idx,
+                "instance": str(idx),
+                "camera_pose": identity_pose,
+                "img_mask": _flag(True),
+                "ray_mask": _flag(False),
+                "update": _flag(True),
+                "reset": _flag(False),
+            }
+        )
+    return views
+
+
+def prepare_output(output, poses, depths, lr, niter, outdir, device, save_flag=False):
+    """Run global alignment on pairwise predictions and collect the results.
+
+    Args:
+        output: Pairwise views/predictions passed to ``global_aligner``.
+        poses: Optional poses handed to ``scene.preset_pose``; skipped if None.
+        depths: Optional depths handed to ``scene.preset_depth``; skipped if None.
+        lr: Learning rate for ``compute_global_alignment``.
+        niter: Number of iterations for ``compute_global_alignment``.
+        outdir: Root directory for saved results (only used if ``save_flag``).
+        device: Device passed to ``global_aligner``.
+        save_flag: When true, write per-frame depth, confidence, color and
+            camera files under ``outdir``.
+
+    Returns:
+        tuple: ``(pts3ds_other, colors, depths, confs, cam_dict)`` where the
+        first four are lists with one CPU tensor per frame (each with a
+        leading dimension of size 1) and ``cam_dict`` holds NumPy arrays under
+        the keys ``"focal"``, ``"pp"``, ``"R"`` and ``"t"``.
+    """
+    from cloud_opt.dust3r_opt import global_aligner, GlobalAlignerMode
+
+    # Gradients are enabled explicitly for the alignment optimization.
+    with torch.enable_grad():
+        mode = GlobalAlignerMode.PointCloudOptimizer
+        
+        scene = global_aligner(
+            output,
+            device=device,
+            mode=mode,
+            verbose=True,
+        )
+        if depths is not None:
+            scene.preset_depth(depths)
+        if poses is not None:
+            scene.preset_pose(poses)
+        
+        # NOTE(review): the returned loss is not used afterwards.
+        loss = scene.compute_global_alignment(
+            init="mst",
+            niter=niter,
+            schedule="linear",
+            lr=lr,
+        )
+    # From here on ``depths`` and ``poses`` hold the optimized values, not the
+    # presets passed in by the caller.
+    scene.clean_pointcloud()
+    pts3d = scene.get_pts3d()
+    depths = scene.get_depthmaps()
+    poses = scene.get_im_poses()
+    focals = scene.get_focals()
+    pps = scene.get_principal_points()
+    confs = scene.get_conf(mode="none")
+
+    # Detach to CPU and add a leading dimension of size 1 to every frame.
+    pts3ds_other = [pts.detach().cpu().unsqueeze(0) for pts in pts3d]
+    depths = [d.detach().cpu().unsqueeze(0) for d in depths]
+    colors = [torch.from_numpy(img).unsqueeze(0) for img in scene.imgs]
+    confs = [conf.detach().cpu().unsqueeze(0) for conf in confs]
+    cam_dict = {
+        "focal": focals.detach().cpu().numpy(),
+        "pp": pps.detach().cpu().numpy(),
+        "R": poses.detach().cpu().numpy()[..., :3, :3],
+        "t": poses.detach().cpu().numpy()[..., :3, 3],
+    }
+    if save_flag:
+        depths_tosave = torch.cat(depths)  # B, H, W
+        pts3ds_other_tosave = torch.cat(pts3ds_other)  # B, H, W, 3
+        conf_self_tosave = torch.cat(confs)  # B, H, W
+        colors_tosave = torch.cat(colors)  # [B, H, W, 3]
+        cam2world_tosave = poses.detach().cpu()  # B, 4, 4
+        intrinsics_tosave = (
+            torch.eye(3).unsqueeze(0).repeat(cam2world_tosave.shape[0], 1, 1)
+        )  # B, 3, 3
+        # The same focal value (column 0 of ``focals``) is written to fx and fy.
+        intrinsics_tosave[:, 0, 0] = focals[:, 0].detach().cpu()
+        intrinsics_tosave[:, 1, 1] = focals[:, 0].detach().cpu()
+        intrinsics_tosave[:, 0, 2] = pps[:, 0].detach().cpu()
+        intrinsics_tosave[:, 1, 2] = pps[:, 1].detach().cpu()
+
+    
+        os.makedirs(os.path.join(outdir, "depth"), exist_ok=True)
+        os.makedirs(os.path.join(outdir, "conf"), exist_ok=True)
+        os.makedirs(os.path.join(outdir, "color"), exist_ok=True)
+        os.makedirs(os.path.join(outdir, "camera"), exist_ok=True)
+        
+        # One file per frame and modality, named by the zero-padded frame index.
+        for f_id in range(len(depths_tosave)):
+            depth = depths_tosave[f_id].cpu().numpy()
+            conf = conf_self_tosave[f_id].cpu().numpy()
+            color = colors_tosave[f_id].cpu().numpy()
+            c2w = cam2world_tosave[f_id].cpu().numpy()
+            intrins = intrinsics_tosave[f_id].cpu().numpy()
+            np.save(os.path.join(outdir, "depth", f"{f_id:06d}.npy"), depth)
+            np.save(os.path.join(outdir, "conf", f"{f_id:06d}.npy"), conf)
+            # assumes colors are floats in [0, 1] -- TODO confirm
+            iio.imwrite(
+                os.path.join(outdir, "color", f"{f_id:06d}.png"),
+                (color * 255).astype(np.uint8),
+            )
+            np.savez(
+                os.path.join(outdir, "camera", f"{f_id:06d}.npz"),
+                pose=c2w,
+                intrinsics=intrins,
+            )
+
+    return pts3ds_other, colors, depths, confs, cam_dict
+
+
+def parse_seq_path(p):
+    """Resolve a sequence path into a list of image file paths.
+
+    If ``p`` is a directory, every entry matching ``{p}/*`` is returned in
+    sorted order and no temporary directory is created. Otherwise ``p`` is
+    opened as a video and its frames are written as JPEG files into a fresh
+    temporary directory.
+
+    The video capture is always released, and the temporary directory is
+    removed again if frame extraction fails.
+
+    Args:
+        p (str): Directory of images or path to a video file.
+
+    Returns:
+        tuple: ``(img_paths, tmpdirname)``. ``tmpdirname`` is ``None`` for a
+        directory input; otherwise it is the temporary directory holding the
+        extracted frames, which the caller is responsible for removing.
+
+    Raises:
+        ValueError: If the video cannot be opened or reports an FPS of 0.
+        OSError: If an extracted frame cannot be written to disk.
+    """
+    if os.path.isdir(p):
+        return sorted(glob.glob(f"{p}/*")), None
+
+    cap = cv2.VideoCapture(p)
+    tmpdirname = None
+    try:
+        if not cap.isOpened():
+            raise ValueError(f"Error opening video file {p}")
+        video_fps = cap.get(cv2.CAP_PROP_FPS)
+        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        if video_fps == 0:
+            raise ValueError(f"Error: Video FPS is 0 for {p}")
+        frame_interval = 1
+        frame_indices = list(range(0, total_frames, frame_interval))
+        print(
+            f" - Video FPS: {video_fps}, Frame Interval: {frame_interval}, Total Frames to Read: {len(frame_indices)}"
+        )
+        img_paths = []
+        tmpdirname = tempfile.mkdtemp()
+        for i in frame_indices:
+            cap.set(cv2.CAP_PROP_POS_FRAMES, i)
+            ret, frame = cap.read()
+            if not ret:
+                # Stop at the first frame that cannot be decoded.
+                break
+            frame_path = os.path.join(tmpdirname, f"frame_{i}.jpg")
+            # ``cv2.imwrite`` reports failure through its return value.
+            if not cv2.imwrite(frame_path, frame):
+                raise OSError(f"Failed to write frame to {frame_path}")
+            img_paths.append(frame_path)
+    except Exception:
+        # Do not leave a half-filled temporary directory behind.
+        if tmpdirname is not None:
+            shutil.rmtree(tmpdirname, ignore_errors=True)
+        raise
+    finally:
+        cap.release()
+    return img_paths, tmpdirname
+
+
+
+
+def run_inference_from_pil(
+    pil_images,
+    model,
+    poses=None,
+    depths=None,
+    lr=0.01,
+    niter=300,
+    device="cuda",
+    size=512,
+    output_dir="./demo_tmp",
+    visualize=False,
+    vis_threshold=1.5,
+    save_flag=False,
+):
+    """
+    Run 3D reconstruction from a list of PIL images.
+
+    The images are turned into views, passed through ``inference`` and paired
+    as (first view, k-th view) edges, which are then globally aligned by
+    ``prepare_output``.
+
+    Args:
+        pil_images (list): List of PIL image objects.
+        model: Model object passed to ``inference`` (not a checkpoint path).
+        poses: Optional poses forwarded to ``prepare_output``.
+        depths: Optional depths forwarded to ``prepare_output``.
+        lr (float): Learning rate forwarded to ``prepare_output``.
+        niter (int): Iteration count forwarded to ``prepare_output``.
+        device (str): Device to run inference on ('cuda' or 'cpu'); falls back
+            to 'cpu' when CUDA is requested but unavailable.
+        size (int): Target image size for processing.
+        output_dir (str): Directory to save outputs.
+        visualize (bool): Whether to launch the point cloud viewer.
+        vis_threshold (float): Visualization threshold for point cloud viewer.
+        save_flag (bool): Forwarded to ``prepare_output``.
+
+    Returns:
+        dict: A dictionary containing the reconstruction results:
+            - point_clouds: List of point cloud tensors
+            - colors: List of color tensors
+            - depths: List of depth tensors
+            - confidences: List of confidence tensors
+            - camera_info: Camera parameters dictionary
+    """
+    # Set up the computation device
+    if device == "cuda":
+        if not torch.cuda.is_available():
+            print("CUDA not available. Switching to CPU.")
+            device = "cpu"
+
+    # Import model and inference functions after adding the ckpt path
+    from src.dust3r.inference import inference, inference_recurrent
+
+    # Prepare input views directly from PIL images
+    print(f"Processing {len(pil_images)} images...")
+    views = prepare_input_from_pil(
+        pil_images=pil_images, size=size, revisit=1, update=True
+    )
+
+    # Run inference
+    print("Running inference...")
+    start_time = time.time()
+    outputs, state_args = inference(views, model, device)
+
+    # Pair the first view with every other view: one edge per later view.
+    edges = []
+    records = []  # (view1, view2, pred1, pred2), aligned with ``edges``
+    for view_id in range(1, len(outputs["views"])):
+        first, other = outputs["views"][0], outputs["views"][view_id]
+        records.append((first, other, outputs["pred"][0], outputs["pred"][view_id]))
+        edges.append((first["idx"], other["idx"]))
+
+    def _edge_key(pos):
+        # Forward edges (src <= dst) come before backward ones; within each
+        # group edges are ordered by (smaller index, larger index).
+        src, dst = edges[pos]
+        backward = src > dst
+        if backward:
+            return (backward, dst, src)
+        return (backward, src, dst)
+
+    order = sorted(range(len(edges)), key=_edge_key)
+
+    output = {}
+    for column, field in enumerate(("view1", "view2", "pred1", "pred2")):
+        output[field] = collate_with_cat([records[j][column] for j in order])
+
+    total_time = time.time() - start_time
+    per_frame_time = total_time / len(views)
+    print(f"Inference completed in {total_time:.2f} seconds (average {per_frame_time:.2f} s per frame).")
+
+    # Process outputs
+    print("Processing reconstruction output...")
+    pts3ds_other, colors, depths, conf, cam_dict = prepare_output(
+        output, poses, depths, lr, niter, output_dir, device, save_flag
+    )
+
+    # Create result dictionary
+    result = dict(
+        point_clouds=pts3ds_other,
+        colors=colors,
+        depths=depths,
+        confidences=conf,
+        camera_info=cam_dict,
+    )
+
+    if not visualize:
+        return result
+
+    # Visualization was requested.
+    from viser_utils import PointCloudViewer
+
+    # Convert tensors to numpy arrays for visualization
+    pts3ds_to_vis = [p.cpu().numpy() for p in pts3ds_other]
+    colors_to_vis = [c.cpu().numpy() for c in colors]
+    edge_colors = [None] * len(pts3ds_to_vis)
+
+    # Create and run the point cloud viewer
+    print("Launching point cloud viewer...")
+    viewer = PointCloudViewer(
+        model,
+        state_args,
+        pts3ds_to_vis,
+        colors_to_vis,
+        conf,
+        cam_dict,
+        device=device,
+        edge_color_list=edge_colors,
+        show_camera=True,
+        vis_threshold=vis_threshold,
+        size=size,
+    )
+    viewer.run()
+
+    return result
+
+
+def prepare_input_from_pil(
+    pil_images, size, square_ok=False, raymaps=None, raymap_mask=None, revisit=1, update=True
+):
+    """
+    Build the list of view dictionaries for inference from PIL images.
+
+    Each image is EXIF-transposed, converted to RGB, resized and center
+    cropped, then normalized with ``ImgNorm``. For ``size == 224`` the crop is
+    square; otherwise both crop half-extents are multiples of 8, and a square
+    image is cropped to a 4:3 aspect ratio unless ``square_ok`` is true.
+
+    Args:
+        pil_images (list): List of PIL image objects.
+        size (int): Target image size.
+        square_ok (bool): Keep square images square when ``size != 224``.
+        raymaps, raymap_mask, revisit, update: Accepted but not used by this
+            function.
+
+    Returns:
+        list: A list of view dictionaries, one per input image.
+    """
+    # Import needed utilities (delayed import needed after adding ckpt path)
+    from src.dust3r.utils.image import _resize_pil_image, ImgNorm, exif_transpose
+    import PIL
+
+    def _flag(value):
+        # Boolean flag as a 1-element tensor.
+        return torch.tensor(value).unsqueeze(0)
+
+    views = []
+    for i, img in enumerate(pil_images):
+        # Convert to RGB to ensure consistency
+        img = exif_transpose(img).convert("RGB")
+        W1, H1 = img.size
+        target = round(size * max(W1 / H1, H1 / W1)) if size == 224 else size
+        img = _resize_pil_image(img, target)
+
+        W, H = img.size
+        cx, cy = W // 2, H // 2
+        if size == 224:
+            half = min(cx, cy)
+            box = (cx - half, cy - half, cx + half, cy + half)
+        else:
+            halfw, halfh = ((2 * cx) // 16) * 8, ((2 * cy) // 16) * 8
+            if not (square_ok) and W == H:
+                halfh = 3 * halfw / 4
+            box = (cx - halfw, cy - halfh, cx + halfw, cy + halfh)
+        img = img.crop(box)
+
+        tensor = ImgNorm(img)[None]  # Using ImgNorm for normalization
+        true_shape = np.int32([img.size[::-1]])
+
+        # Placeholder ray map: same leading and spatial dims as the image.
+        ray_map = torch.full(
+            (tensor.shape[0], 6, tensor.shape[-2], tensor.shape[-1]),
+            torch.nan,
+        )
+        views.append(
+            {
+                "img": tensor,
+                "ray_map": ray_map,
+                "true_shape": torch.from_numpy(true_shape),
+                "idx": i,
+                "instance": str(i),
+                "camera_pose": torch.from_numpy(
+                    np.eye(4).astype(np.float32)
+                ).unsqueeze(0),
+                "img_mask": _flag(True),
+                "ray_mask": _flag(False),
+                "update": _flag(True),
+                "reset": _flag(False),
+            }
+        )
+
+    return views
+
+
+
+
diff --git a/extern/CUT3R/viser_utils.py b/extern/CUT3R/viser_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..22713a9d57be034c2db2da5c640ca774a65aec52
--- /dev/null
+++ b/extern/CUT3R/viser_utils.py
@@ -0,0 +1,777 @@
+import torch
+import os
+from matplotlib.figure import Figure
+import matplotlib.pyplot as plt
+from matplotlib.backends.backend_agg import FigureCanvasAgg
+import matplotlib as mpl
+import cv2
+import numpy as np
+import matplotlib.cm as cm
+import viser
+import viser.transforms as tf
+import time
+import trimesh
+import dataclasses
+from scipy.spatial.transform import Rotation
+from src.dust3r.viz import (
+    add_scene_cam,
+    CAM_COLORS,
+    OPENGL,
+    pts3d_to_trimesh,
+    cat_meshes,
+)
+
+
+def todevice(batch, device, callback=None, non_blocking=False):
+    """Recursively move tensors/arrays to another device, or convert to numpy.
+
+    batch: a tensor, numpy array, None, any other object, or an arbitrarily
+        nested dict / list / tuple of those.
+    device: anything accepted by ``Tensor.to``, or the string 'numpy' to turn
+        tensors into numpy arrays.
+    callback: optional function applied to ``batch`` and to every nested
+        sub-element before it is processed; its return value replaces the
+        element.
+    non_blocking: forwarded to ``Tensor.to`` for every tensor that is moved.
+
+    Returns an object with the same dict / list / tuple structure as
+    ``batch``. Objects that are neither tensors nor numpy arrays are returned
+    unchanged.
+    """
+    if callback:
+        batch = callback(batch)
+
+    # Forward callback and non_blocking on every recursive call so that nested
+    # elements are treated exactly like top-level ones.
+    if isinstance(batch, dict):
+        return {
+            k: todevice(v, device, callback, non_blocking) for k, v in batch.items()
+        }
+
+    if isinstance(batch, (tuple, list)):
+        return type(batch)(
+            todevice(x, device, callback, non_blocking) for x in batch
+        )
+
+    x = batch
+    if device == "numpy":
+        if isinstance(x, torch.Tensor):
+            x = x.detach().cpu().numpy()
+    elif x is not None:
+        # numpy arrays are first wrapped as tensors, then moved like any tensor
+        if isinstance(x, np.ndarray):
+            x = torch.from_numpy(x)
+        if torch.is_tensor(x):
+            x = x.to(device, non_blocking=non_blocking)
+    return x
+
+
+to_device = todevice  # alias
+
+
+def to_numpy(x):
+    """Convert every tensor found in ``x`` to a numpy array via ``todevice``."""
+    numpy_target = "numpy"
+    return todevice(x, numpy_target)
+
+
+def segment_sky(image):
+    """Heuristically segment sky-like pixels of an image with HSV thresholds.
+
+    image: tensor or array image; float inputs are clipped to [0, 1] and
+        scaled to uint8 before the colour conversion.
+        NOTE(review): the conversion uses COLOR_BGR2HSV while the hue range
+        kept below is 0-30; presumably the input is RGB so that blue lands
+        in that range - confirm against callers.
+
+    Returns a boolean torch tensor with the same height/width as the image,
+    True on the largest connected candidate regions.
+    """
+    from scipy import ndimage
+
+    # Convert to HSV
+    image = to_numpy(image)
+    if np.issubdtype(image.dtype, np.floating):
+        image = np.uint8(255 * image.clip(min=0, max=1))
+    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
+
+    # Define range for blue color and create mask
+    lower_blue = np.array([0, 0, 100])
+    upper_blue = np.array([30, 255, 255])
+    mask = cv2.inRange(hsv, lower_blue, upper_blue).view(bool)
+
+    # add luminous gray: low saturation combined with high value
+    mask |= (hsv[:, :, 1] < 10) & (hsv[:, :, 2] > 150)
+    mask |= (hsv[:, :, 1] < 30) & (hsv[:, :, 2] > 180)
+    mask |= (hsv[:, :, 1] < 50) & (hsv[:, :, 2] > 220)
+
+    # Morphological opening with a 5x5 structuring element
+    kernel = np.ones((5, 5), np.uint8)
+    mask2 = ndimage.binary_opening(mask, structure=kernel)
+
+    # keep only the largest connected components: every component whose area
+    # exceeds half the area of the biggest one
+    _, labels, stats, _ = cv2.connectedComponentsWithStats(
+        mask2.view(np.uint8), connectivity=8
+    )
+    cc_sizes = stats[1:, cv2.CC_STAT_AREA]  # label 0 (background) is skipped
+    order = cc_sizes.argsort()[::-1]  # bigger first
+    i = 0
+    selection = []
+    while i < len(order) and cc_sizes[order[i]] > cc_sizes[order[0]] / 2:
+        selection.append(1 + order[i])  # +1 undoes the background offset
+        i += 1
+    # np.isin replaces the deprecated np.in1d and keeps the shape of `labels`
+    mask3 = np.isin(labels, selection)
+
+    # Apply mask
+    return torch.from_numpy(mask3)
+
+
+def convert_scene_output_to_glb(
+    outdir,
+    imgs,
+    pts3d,
+    mask,
+    focals,
+    cams2world,
+    cam_size=0.05,
+    show_cam=True,
+    cam_color=None,
+    as_pointcloud=False,
+    transparent_cams=False,
+    silent=False,
+    save_name=None,
+):
+    """Export predicted geometry and cameras as a ``.glb`` scene file.
+
+    outdir: directory the file is written into.
+    imgs, pts3d, mask: per-view images, point maps and boolean validity masks.
+    focals, cams2world: per-camera focal lengths and camera-to-world poses.
+    cam_size: forwarded to ``add_scene_cam`` as ``screen_width``.
+    show_cam: when False no cameras are added to the scene.
+    cam_color: a list with one colour per camera, a single colour for all
+        cameras, or None to cycle through ``CAM_COLORS``.
+    as_pointcloud: export one merged point cloud instead of a mesh.
+    transparent_cams: when True the camera images are not attached.
+    silent: suppress the progress message.
+    save_name: file name without extension; defaults to "scene".
+
+    Returns the path of the written file.
+    """
+    assert len(pts3d) == len(mask) <= len(imgs) <= len(cams2world) == len(focals)
+    pts3d = to_numpy(pts3d)
+    imgs = to_numpy(imgs)
+    focals = to_numpy(focals)
+    cams2world = to_numpy(cams2world)
+
+    scene = trimesh.Scene()
+
+    # full pointcloud
+    if as_pointcloud:
+        pts = np.concatenate([p[m] for p, m in zip(pts3d, mask)])
+        col = np.concatenate([p[m] for p, m in zip(imgs, mask)])
+        pct = trimesh.PointCloud(pts.reshape(-1, 3), colors=col.reshape(-1, 3))
+        scene.add_geometry(pct)
+    else:
+        # zip stops at the shortest sequence, so having fewer point maps than
+        # images (which the assert above allows) no longer raises IndexError;
+        # this mirrors the pairing used by the point-cloud branch.
+        meshes = [
+            pts3d_to_trimesh(img, pts, valid)
+            for img, pts, valid in zip(imgs, pts3d, mask)
+        ]
+        mesh = trimesh.Trimesh(**cat_meshes(meshes))
+        scene.add_geometry(mesh)
+
+    # add each camera
+    if show_cam:
+        for i, pose_c2w in enumerate(cams2world):
+            if isinstance(cam_color, list):
+                camera_edge_color = cam_color[i]
+            else:
+                camera_edge_color = cam_color or CAM_COLORS[i % len(CAM_COLORS)]
+            add_scene_cam(
+                scene,
+                pose_c2w,
+                camera_edge_color,
+                None if transparent_cams else imgs[i],
+                focals[i],
+                imsize=imgs[i].shape[1::-1],  # (width, height)
+                screen_width=cam_size,
+            )
+
+    # Re-express the scene relative to the first camera, composed with the
+    # OPENGL matrix and a 180 degree rotation about the y axis.
+    rot = np.eye(4)
+    rot[:3, :3] = Rotation.from_euler("y", np.deg2rad(180)).as_matrix()
+    scene.apply_transform(np.linalg.inv(cams2world[0] @ OPENGL @ rot))
+    if save_name is None:
+        save_name = "scene"
+    outfile = os.path.join(outdir, save_name + ".glb")
+    if not silent:
+        print("(exporting 3D scene to", outfile, ")")
+    scene.export(file_obj=outfile)
+    return outfile
+
+
+@dataclasses.dataclass
+class CameraState:
+    """Snapshot of a viewer camera: field of view, aspect ratio and pose."""
+
+    # full field-of-view angle; get_K feeds it to np.tan together with the
+    # image height, i.e. it is treated as a vertical angle in radians
+    fov: float
+    # aspect ratio as reported by the viewer camera
+    aspect: float
+    # camera-to-world matrix
+    c2w: np.ndarray
+
+    def get_K(self, img_wh):
+        """Build a 3x3 pinhole intrinsics matrix for an image of size (W, H).
+
+        The focal length is derived from the image height and ``fov`` and is
+        used for both axes; the principal point is the image centre.
+        """
+        width, height = img_wh
+        half_height = height / 2.0
+        focal = half_height / np.tan(self.fov / 2.0)
+        rows = [
+            [focal, 0.0, width / 2.0],
+            [0.0, focal, half_height],
+            [0.0, 0.0, 1.0],
+        ]
+        return np.array(rows)
+
+
+def get_vertical_colorbar(h, vmin, vmax, cmap_name="jet", label=None, cbar_precision=2):
+    """
+    Render a vertical colorbar into an RGB image.
+
+    :param h: output height in pixels; the rendered bar is resized to this
+        height (width scaled proportionally) when it differs
+    :param vmin: min value
+    :param vmax: max value
+    :param cmap_name: name of the matplotlib colormap
+    :param label: optional label attached to the colorbar
+    :param cbar_precision: number of decimals kept in the tick labels
+    :return: float32 image [h, w, 3] with values in [0, 1]
+    """
+    fig = Figure(figsize=(2, 8), dpi=100)
+    fig.subplots_adjust(right=1.5)
+    canvas = FigureCanvasAgg(fig)
+
+    ax = fig.add_subplot(111)
+    # matplotlib.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap
+    # performs the same name lookup and is available in old and new releases.
+    cmap = plt.get_cmap(cmap_name)
+    norm = mpl.colors.Normalize(vmin=vmin, vmax=vmax)
+
+    tick_cnt = 6
+    tick_loc = np.linspace(vmin, vmax, tick_cnt)
+    cb1 = mpl.colorbar.ColorbarBase(
+        ax, cmap=cmap, norm=norm, ticks=tick_loc, orientation="vertical"
+    )
+
+    tick_label = [str(np.round(x, cbar_precision)) for x in tick_loc]
+    if cbar_precision == 0:
+        # drop the trailing ".0" left by str() on rounded floats
+        tick_label = [x[:-2] for x in tick_label]
+
+    cb1.set_ticklabels(tick_label)
+
+    cb1.ax.tick_params(labelsize=18, rotation=0)
+    if label is not None:
+        cb1.set_label(label)
+
+    # Rasterize the figure and read the RGBA buffer back as an array
+    canvas.draw()
+    s, (width, height) = canvas.print_to_buffer()
+
+    im = np.frombuffer(s, np.uint8).reshape((height, width, 4))
+
+    # drop alpha and rescale to [0, 1]
+    im = im[:, :, :3].astype(np.float32) / 255.0
+    if h != im.shape[0]:
+        w = int(im.shape[1] / im.shape[0] * h)
+        im = cv2.resize(im, (w, h), interpolation=cv2.INTER_AREA)
+
+    return im
+
+
+def colorize_np(
+    x,
+    cmap_name="jet",
+    mask=None,
+    range=None,
+    append_cbar=False,
+    cbar_in_image=False,
+    cbar_precision=2,
+):
+    """
+    turn a grayscale image into a color image
+    :param x: input grayscale, [H, W]; it is not modified
+    :param cmap_name: the colorization method
+    :param mask: the mask image, [H, W]; pixels outside the mask are drawn white
+    :param range: the range for scaling, automatic if None, [min, max]
+    :param append_cbar: if append the color bar
+    :param cbar_in_image: put the color bar inside the image to keep the output image the same size as the input image
+    :param cbar_precision: number of decimals used for the color bar ticks
+    :return: colorized image, [H, W, 3] (wider when the color bar is appended
+        next to the image)
+    """
+    if range is not None:
+        vmin, vmax = range
+    elif mask is not None:
+        # scale between the smallest non-zero and the largest masked value
+        masked = x[mask]
+        vmin = np.min(masked[np.nonzero(masked)])
+        vmax = np.max(masked)
+
+        # Build a new array instead of writing into `x`: the input may share
+        # memory with a caller-owned array or tensor.
+        x = np.where(mask, x, vmin)
+    else:
+        vmin, vmax = np.percentile(x, (1, 100))
+        vmax += 1e-6
+
+    x = np.clip(x, vmin, vmax)
+    x = (x - vmin) / (vmax - vmin)
+
+    # matplotlib.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap
+    # performs the same name lookup and is available in old and new releases.
+    cmap = plt.get_cmap(cmap_name)
+    x_new = cmap(x)[:, :, :3]
+
+    if mask is not None:
+        mask = np.float32(mask[:, :, np.newaxis])
+        x_new = x_new * mask + np.ones_like(x_new) * (1.0 - mask)
+
+    if not append_cbar:
+        return x_new
+
+    # Only render the color bar when it is actually requested.
+    cbar = get_vertical_colorbar(
+        h=x.shape[0],
+        vmin=vmin,
+        vmax=vmax,
+        cmap_name=cmap_name,
+        cbar_precision=cbar_precision,
+    )
+
+    if cbar_in_image:
+        # overwrite the right-most columns so the output keeps its size
+        x_new[:, -cbar.shape[1] :, :] = cbar
+        return x_new
+
+    # image, 5-pixel black spacer, color bar
+    return np.concatenate((x_new, np.zeros_like(x_new[:, :5, :]), cbar), axis=1)
+
+
+def colorize(
+    x, cmap_name="jet", mask=None, range=None, append_cbar=False, cbar_in_image=False
+):
+    """
+    turn a grayscale image into a color image
+    :param x: torch.Tensor, grayscale image, [H, W] or [B, H, W]
+    :param cmap_name: name of the matplotlib colormap
+    :param mask: torch.Tensor or None, mask image, [H, W] or [B, H, W] or None;
+        values above 0.99 count as valid and each mask is eroded with a 3x3
+        kernel before use
+    :param range: the range for scaling, automatic if None, [min, max]
+    :param append_cbar: if append the color bar
+    :param cbar_in_image: put the color bar inside the image
+    :return: float torch.Tensor on the device of ``x``; the batch dimension is
+        squeezed away when it has size 1
+    """
+
+    device = x.device
+    x = x.detach().cpu().numpy()
+    if mask is not None:
+        mask = mask.detach().cpu().numpy() > 0.99
+
+    if x.ndim == 2:
+        x = x[None]
+        if mask is not None:
+            mask = mask[None]
+
+    kernel = np.ones((3, 3), np.uint8)
+    out = []
+    for i, x_ in enumerate(x):
+        # Erode and pass only the mask of the current image; the batched mask
+        # must stay untouched for the following iterations.
+        mask_ = None
+        if mask is not None:
+            mask_ = cv2.erode(
+                mask[i].astype(np.uint8), kernel, iterations=1
+            ).astype(bool)
+
+        x_ = colorize_np(x_, cmap_name, mask_, range, append_cbar, cbar_in_image)
+        out.append(torch.from_numpy(x_).to(device).float())
+    out = torch.stack(out).squeeze(0)
+    return out
+
+
+class PointCloudViewer:
+    """Interactive viser viewer for per-frame point clouds and camera frustums.
+
+    Besides playback of the supplied frames, every connected client gets a
+    button that runs the model on a ray map built from the client's current
+    camera and adds the predicted point cloud to the scene.
+    """
+
+    def __init__(
+        self,
+        model,
+        state_args,
+        pc_list,
+        color_list,
+        conf_list,
+        cam_dict,
+        image_mask=None,
+        edge_color_list=None,
+        device="cpu",
+        port=8080,
+        show_camera=True,
+        vis_threshold=1,
+        size=512
+    ):
+        """Start the viser server and build the global GUI controls.
+
+        model: model handed to ``inference_step`` for ray-map queries.
+        state_args: sequence whose last element is handed to
+            ``inference_step`` as the state.
+        pc_list, color_list, conf_list: per-frame points, colours and
+            confidences.
+        cam_dict: dict with per-frame entries under "focal", "pp", "R", "t".
+        image_mask: stored on the instance as-is.
+        edge_color_list: optional per-frame border colours (entries may be
+            None); None disables the coloured border for every frame.
+        device: device used for inference.
+        port: port of the viser server.
+        show_camera: whether ``animate`` adds a frustum per frame.
+        vis_threshold: points with confidence above this value are shown.
+        size: 512 selects a 384x512 ray map, any other value 224x224.
+        """
+        self.model = model
+        self.size = size
+        self.state_args = state_args
+        self.server = viser.ViserServer(port=port)
+        self.server.set_up_direction("-y")
+        self.device = device
+        self.conf_list = conf_list
+        self.vis_threshold = vis_threshold
+        self.tt = lambda x: torch.from_numpy(x).float().to(device)
+        self.pcs, self.all_steps = self.read_data(
+            pc_list, color_list, conf_list, edge_color_list
+        )
+        self.cam_dict = cam_dict
+        self.num_frames = len(self.all_steps)
+        self.image_mask = image_mask
+        self.show_camera = show_camera
+        self.on_replay = False
+        self.vis_pts_list = []
+        self.traj_list = []
+        self.orig_img_list = [x[0] for x in color_list]
+        self.via_points = []
+
+        gui_reset_up = self.server.gui.add_button(
+            "Reset up direction",
+            hint="Set the camera control 'up' direction to the current camera's 'up'.",
+        )
+
+        @gui_reset_up.on_click
+        def _(event: viser.GuiEvent) -> None:
+            client = event.client
+            assert client is not None
+            client.camera.up_direction = tf.SO3(client.camera.wxyz) @ np.array(
+                [0.0, -1.0, 0.0]
+            )
+
+        button3 = self.server.gui.add_button("4D (Only Show Current Frame)")
+        button4 = self.server.gui.add_button("3D (Show All Frames)")
+        self.is_render = False
+        # fourd == True: only the current frame is visible during playback
+        self.fourd = False
+
+        @button3.on_click
+        def _(event: viser.GuiEvent) -> None:
+            self.fourd = True
+
+        @button4.on_click
+        def _(event: viser.GuiEvent) -> None:
+            self.fourd = False
+
+        self.focal_slider = self.server.add_gui_slider(
+            "Focal Length",
+            min=0.1,
+            max=99999,
+            step=1,
+            initial_value=533,
+        )
+
+        self.psize_slider = self.server.add_gui_slider(
+            "Point Size",
+            min=0.0001,
+            max=0.1,
+            step=0.0001,
+            initial_value=0.0005,
+        )
+        self.camsize_slider = self.server.add_gui_slider(
+            "Camera Size",
+            min=0.01,
+            max=0.5,
+            step=0.01,
+            initial_value=0.1,
+        )
+
+        self.pc_handles = []
+        self.cam_handles = []
+
+        @self.psize_slider.on_update
+        def _(_) -> None:
+            for handle in self.pc_handles:
+                handle.point_size = self.psize_slider.value
+
+        @self.camsize_slider.on_update
+        def _(_) -> None:
+            for handle in self.cam_handles:
+                handle.scale = self.camsize_slider.value
+                handle.line_thickness = 0.03 * handle.scale
+
+        self.server.on_client_connect(self._connect_client)
+
+    def get_camera_state(self, client: viser.ClientHandle) -> CameraState:
+        """Return the client's camera as a ``CameraState`` with a 4x4 c2w."""
+        camera = client.camera
+        # stack [R | t] on top of the homogeneous row [0, 0, 0, 1]
+        c2w = np.concatenate(
+            [
+                np.concatenate(
+                    [tf.SO3(camera.wxyz).as_matrix(), camera.position[:, None]], 1
+                ),
+                [[0, 0, 0, 1]],
+            ],
+            0,
+        )
+        return CameraState(
+            fov=camera.fov,
+            aspect=camera.aspect,
+            c2w=c2w,
+        )
+
+    @staticmethod
+    def generate_pseudo_intrinsics(h, w):
+        """Return float32 3x3 intrinsics with the focal length set to the image
+        diagonal and the principal point at the (integer) image centre."""
+        focal = (h**2 + w**2) ** 0.5
+        return np.array([[focal, 0, w // 2], [0, focal, h // 2], [0, 0, 1]]).astype(
+            np.float32
+        )
+
+    def get_ray_map(self, c2w, h, w, intrinsics=None):
+        """Build an [h, w, 6] ray map for the camera pose ``c2w``.
+
+        The first three channels hold the camera position broadcast to every
+        pixel; the last three hold the unprojected pixel, transformed by
+        ``c2w`` and normalised to unit length. Pseudo intrinsics are used when
+        ``intrinsics`` is None.
+        """
+        if intrinsics is None:
+            intrinsics = self.generate_pseudo_intrinsics(h, w)
+        i, j = np.meshgrid(np.arange(w), np.arange(h), indexing="xy")
+        grid = np.stack([i, j, np.ones_like(i)], axis=-1)
+        ro = c2w[:3, 3]
+        rd = np.linalg.inv(intrinsics) @ grid.reshape(-1, 3).T
+        rd = (c2w @ np.vstack([rd, np.ones_like(rd[0])])).T[:, :3].reshape(h, w, 3)
+        rd = rd / np.linalg.norm(rd, axis=-1, keepdims=True)
+        ro = np.broadcast_to(ro, (h, w, 3))
+        ray_map = np.concatenate([ro, rd], axis=-1)
+        return ray_map
+
+    @staticmethod
+    def set_camera_loc(camera, pose, K):
+        """
+        Apply a pose and intrinsics to a viser camera handle.
+
+        Declared as a static method: the function never used an instance, and
+        without the decorator it could not be called through one.
+
+        pose: 4x4 matrix
+        K: 3x3 matrix
+        """
+        fx = K[0, 0]
+        cx, cy = K[0, 2], K[1, 2]
+        aspect = float(cx) / float(cy)
+        # NOTE(review): this evaluates 2 * atan(2 * cx / fx); with a centred
+        # principal point the usual full-angle formula is 2 * atan(cx / fx) -
+        # confirm which one is intended before relying on the value.
+        fov = 2 * np.arctan(2 * cx / fx)
+        wxyz_xyz = tf.SE3.from_matrix(pose).wxyz_xyz
+        wxyz = wxyz_xyz[:4]
+        xyz = wxyz_xyz[4:]
+        camera.wxyz = wxyz
+        camera.position = xyz
+        camera.fov = fov
+        camera.aspect = aspect
+
+    def _connect_client(self, client: viser.ClientHandle):
+        """Create the per-client panels and the ray-map inference button."""
+        from src.dust3r.inference import inference_step
+        from src.dust3r.utils.geometry import geotrf
+
+        wxyz_panel = client.gui.add_text("wxyz:", f"{client.camera.wxyz}")
+        position_panel = client.gui.add_text("position:", f"{client.camera.position}")
+        fov_panel = client.gui.add_text(
+            "fov:", f"{2 * np.arctan(self.size/self.focal_slider.value) * 180 / np.pi}"
+        )
+        aspect_panel = client.gui.add_text("aspect:", "1.0")
+
+        @client.camera.on_update
+        def _(_: viser.CameraHandle):
+            # refresh all panels in one atomic update
+            with self.server.atomic():
+                wxyz_panel.value = f"{client.camera.wxyz}"
+                position_panel.value = f"{client.camera.position}"
+                fov_panel.value = (
+                    f"{2 * np.arctan(self.size/self.focal_slider.value) * 180 / np.pi}"
+                )
+                aspect_panel.value = "1.0"
+
+        gui_set_current_camera = client.gui.add_button(
+            "Set Current Camera to Infer Raymap"
+        )
+
+        @gui_set_current_camera.on_click
+        def _(_) -> None:
+            # Best effort: any failure is printed and the viewer keeps running.
+            try:
+                cam = self.get_camera_state(client)
+                cam.fov = 2 * np.arctan(self.size / self.focal_slider.value)
+                cam.aspect = (512 / 384) if self.size==512 else 1.0
+                pose = cam.c2w
+                if self.size == 512:
+                    intrins = self.generate_pseudo_intrinsics(384, 512)
+                    raymap = torch.from_numpy(self.get_ray_map(pose, 384, 512, intrins))[
+                        None
+                    ].float()
+                else:
+                    intrins = self.generate_pseudo_intrinsics(224, 224)
+                    raymap = torch.from_numpy(self.get_ray_map(pose, 224, 224, intrins))[
+                        None
+                    ].float()
+
+                # Ray-only query: the image is all NaN and masked out, the
+                # ray map is flagged as valid, and "update" is False.
+                view = {
+                    "img": torch.full((1, 3, 384, 512), torch.nan) if self.size==512 else torch.full((1, 3, 224, 224), torch.nan),
+                    "ray_map": raymap,
+                    "true_shape": torch.from_numpy(np.int32([raymap.shape[1:-1]])),
+                    "idx": self.num_frames + 1,
+                    "instance": str(self.num_frames + 1),
+                    "camera_pose": torch.from_numpy(np.eye(4).astype(np.float32)).unsqueeze(
+                        0
+                    ),
+                    "img_mask": torch.tensor(False).unsqueeze(0),
+                    "ray_mask": torch.tensor(True).unsqueeze(0),
+                    "update": torch.tensor(False).unsqueeze(0),
+                    "reset": torch.tensor(False).unsqueeze(0),
+                }
+                print("Start Inference Raymap")
+                output = inference_step(
+                    view, self.state_args[-1], self.model, device=self.device
+                )
+                print("Finish Inference Raymap")
+                pts3ds = output["pred"]["pts3d_in_self_view"].cpu().numpy()
+                # move the self-view points with the client camera pose
+                pts3ds = geotrf(pose[None], pts3ds)
+                # rgb is rescaled from [-1, 1] to [0, 1]
+                colors = 0.5 * (output["pred"]["rgb"].cpu().numpy() + 1.0)
+                conf = output["pred"]["conf"].cpu().numpy()
+                pts3ds, colors = self.parse_pc_data(pts3ds, colors, set_border_color=True)
+                mask = (conf > 1.0).reshape(-1)
+                self.num_frames += 1
+                self.pc_handles.append(
+                    self.server.add_point_cloud(
+                        name=f"/frames/{self.num_frames-1}/pred_pts",
+                        points=pts3ds[mask],
+                        colors=colors[mask],
+                        point_size=0.005,
+                    )
+                )
+
+                self.server.add_camera_frustum(
+                    name=f"/frames/{self.num_frames-1}/camera",
+                    fov=cam.fov,
+                    aspect=cam.aspect,
+                    wxyz=client.camera.wxyz,
+                    position=client.camera.position,
+                    scale=0.1,
+                    color=[64, 179, 230],
+                )
+                print("Adding new pointcloud: ", pts3ds.shape)
+            except Exception as e:
+                print(e)
+
+    @staticmethod
+    def set_color_border(image, border_width=5, color=[1, 0, 0]):
+        """Paint a border of ``border_width`` pixels onto ``image`` in place
+        (first three channels only) and return the same array."""
+
+        image[:border_width, :, 0] = color[0]  # Red channel
+        image[:border_width, :, 1] = color[1]  # Green channel
+        image[:border_width, :, 2] = color[2]  # Blue channel
+        image[-border_width:, :, 0] = color[0]
+        image[-border_width:, :, 1] = color[1]
+        image[-border_width:, :, 2] = color[2]
+
+        image[:, :border_width, 0] = color[0]
+        image[:, :border_width, 1] = color[1]
+        image[:, :border_width, 2] = color[2]
+        image[:, -border_width:, 0] = color[0]
+        image[:, -border_width:, 1] = color[1]
+        image[:, -border_width:, 2] = color[2]
+
+        return image
+
+    def read_data(self, pc_list, color_list, conf_list, edge_color_list=None):
+        """Index the per-frame inputs by step and assign camera colours.
+
+        Returns ``(pcs, step_list)`` where ``pcs[step]`` is a dict with the
+        keys "pc", "color", "conf" and "edge_color", and ``step_list`` is
+        ``[0, 1, ..., len(pc_list) - 1]``. Also sets ``self.camera_colors`` to
+        one viridis colour per frame.
+        """
+        pcs = {}
+        step_list = []
+        for i, pc in enumerate(pc_list):
+            step = i
+            pcs[step] = {
+                "pc": pc,
+                "color": color_list[i],
+                "conf": conf_list[i],
+                # edge_color_list defaults to None and must not be subscripted
+                "edge_color": (
+                    None if edge_color_list is None else edge_color_list[i]
+                ),
+            }
+            step_list.append(step)
+        # i / (n - 1) for n > 1; the max() guard avoids 0 / 0 for a single
+        # frame and a failing reduction for an empty list.
+        frame_indices = np.arange(len(pc_list))
+        normalized_indices = frame_indices / max(len(pc_list) - 1, 1)
+        cmap = cm.viridis
+        self.camera_colors = cmap(normalized_indices)
+        return pcs, step_list
+
+    def parse_pc_data(
+        self,
+        pc,
+        color,
+        conf=None,
+        edge_color=[0.251, 0.702, 0.902],
+        set_border_color=False,
+    ):
+        """Flatten points and colours to [N, 3], optionally filtered by conf.
+
+        When ``set_border_color`` is set and ``edge_color`` is not None, a
+        border is painted onto ``color[0]`` (in place). Colours containing NaN
+        are replaced by (0, 0, 1) for every point. With ``conf`` given, only
+        points whose confidence exceeds ``self.vis_threshold`` are kept.
+        """
+
+        pred_pts = pc.reshape(-1, 3)  # [N, 3]
+
+        if set_border_color and edge_color is not None:
+            color = self.set_color_border(color[0], color=edge_color)
+        if np.isnan(color).any():
+
+            color = np.zeros((pred_pts.shape[0], 3))
+            color[:, 2] = 1
+        else:
+            color = color.reshape(-1, 3)
+        if conf is not None:
+            conf = conf[0].reshape(-1)
+            pred_pts = pred_pts[conf > self.vis_threshold]
+            color = color[conf > self.vis_threshold]
+        return pred_pts, color
+
+    def add_pc(self, step):
+        """Add the point cloud of ``step`` to the scene under /frames/{step}."""
+        pc = self.pcs[step]["pc"]
+        color = self.pcs[step]["color"]
+        conf = self.pcs[step]["conf"]
+        edge_color = self.pcs[step].get("edge_color", None)
+
+        pred_pts, color = self.parse_pc_data(
+            pc, color, conf, edge_color, set_border_color=True
+        )
+
+        self.vis_pts_list.append(pred_pts)
+        self.pc_handles.append(
+            self.server.add_point_cloud(
+                name=f"/frames/{step}/pred_pts",
+                points=pred_pts,
+                colors=color,
+                point_size=0.0005,
+            )
+        )
+
+    def add_camera(self, step):
+        """Add the camera frustum of ``step`` to the scene under /frames/{step}."""
+        cam = self.cam_dict
+        focal = cam["focal"][step]
+        pp = cam["pp"][step]
+        R = cam["R"][step]
+        t = cam["t"][step]
+
+        q = tf.SO3.from_matrix(R).wxyz
+        fov = 2 * np.arctan(pp[0] / focal)
+        aspect = pp[0] / pp[1]
+        self.traj_list.append((q, t))
+        self.cam_handles.append(
+            self.server.add_camera_frustum(
+                name=f"/frames/{step}/camera",
+                fov=fov,
+                aspect=aspect,
+                wxyz=q,
+                position=t,
+                scale=0.1,
+                color=(50, 205, 50),
+            )
+        )
+
+    def animate(self):
+        """Build the playback GUI, add all frames and loop forever.
+
+        This method does not return: it ends in an endless playback loop.
+        """
+        # Playback only covers the frames supplied at construction time.
+        # self.num_frames is incremented by the ray-map button and therefore
+        # cannot be used to index self.frame_nodes / self.all_steps.
+        num_steps = len(self.all_steps)
+
+        with self.server.add_gui_folder("Playback"):
+            gui_timestep = self.server.add_gui_slider(
+                "Train Step",
+                min=0,
+                max=num_steps - 1,
+                step=1,
+                initial_value=0,
+                disabled=False,
+            )
+            gui_next_frame = self.server.add_gui_button("Next Step", disabled=False)
+            gui_prev_frame = self.server.add_gui_button("Prev Step", disabled=False)
+            gui_playing = self.server.add_gui_checkbox("Playing", False)
+            gui_framerate = self.server.add_gui_slider(
+                "FPS", min=1, max=60, step=0.1, initial_value=1
+            )
+            gui_framerate_options = self.server.add_gui_button_group(
+                "FPS options", ("10", "20", "30", "60")
+            )
+
+        @gui_next_frame.on_click
+        def _(_) -> None:
+            gui_timestep.value = (gui_timestep.value + 1) % num_steps
+
+        @gui_prev_frame.on_click
+        def _(_) -> None:
+            gui_timestep.value = (gui_timestep.value - 1) % num_steps
+
+        @gui_playing.on_update
+        def _(_) -> None:
+            # manual stepping is disabled while playing
+            gui_timestep.disabled = gui_playing.value
+            gui_next_frame.disabled = gui_playing.value
+            gui_prev_frame.disabled = gui_playing.value
+
+        @gui_framerate_options.on_click
+        def _(_) -> None:
+            gui_framerate.value = int(gui_framerate_options.value)
+
+        prev_timestep = gui_timestep.value
+
+        @gui_timestep.on_update
+        def _(_) -> None:
+            nonlocal prev_timestep
+            current_timestep = gui_timestep.value
+            with self.server.atomic():
+                self.frame_nodes[current_timestep].visible = True
+                self.frame_nodes[prev_timestep].visible = False
+            prev_timestep = current_timestep
+            self.server.flush()  # Optional!
+
+        self.server.add_frame(
+            "/frames",
+            show_axes=False,
+        )
+        self.frame_nodes = []
+        for i in range(num_steps):
+            step = self.all_steps[i]
+            self.frame_nodes.append(
+                self.server.add_frame(
+                    f"/frames/{step}",
+                    show_axes=False,
+                )
+            )
+            self.add_pc(step)
+            if self.show_camera:
+                self.add_camera(step)
+
+        prev_timestep = gui_timestep.value
+        while True:
+            if not self.on_replay:
+                if gui_playing.value:
+                    gui_timestep.value = (gui_timestep.value + 1) % num_steps
+
+                # 3D mode shows every frame up to the current one, 4D mode
+                # only the current frame
+                for i, frame_node in enumerate(self.frame_nodes):
+                    frame_node.visible = (
+                        i <= gui_timestep.value
+                        if not self.fourd
+                        else i == gui_timestep.value
+                    )
+
+            time.sleep(1.0 / gui_framerate.value)
+
+    def run(self):
+        """Run the viewer; blocks forever."""
+        self.animate()
+        # animate() loops forever, so this loop is only a safeguard
+        while True:
+            time.sleep(10.0)
diff --git a/modeling/__init__.py b/modeling/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..31ae9b25690d66f94ea05d7697ecb1b0fdd0ad17
--- /dev/null
+++ b/modeling/__init__.py
@@ -0,0 +1,2 @@
+from .network import VMemWrapper, VMemModel, VMemModelParams
+from .pipeline import VMemPipeline
diff --git a/modeling/__pycache__/__init__.cpython-310.pyc b/modeling/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..81777b01e800379054e83b8f75b92820b45be747
Binary files /dev/null and b/modeling/__pycache__/__init__.cpython-310.pyc differ
diff --git a/modeling/__pycache__/__init__.cpython-39.pyc b/modeling/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4d7422a86674fd6aec105cce2e68b70d84672fdf
Binary files /dev/null and b/modeling/__pycache__/__init__.cpython-39.pyc differ
diff --git a/modeling/__pycache__/metrics.cpython-310.pyc b/modeling/__pycache__/metrics.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8f8ec912ca223526fc33d95ee8b8eed5a977856c
Binary files /dev/null and b/modeling/__pycache__/metrics.cpython-310.pyc differ
diff --git a/modeling/__pycache__/metrics.cpython-39.pyc b/modeling/__pycache__/metrics.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cad1b3eb5f16dddd79ebc0fb78bcd3d13f5f022d
Binary files /dev/null and b/modeling/__pycache__/metrics.cpython-39.pyc differ
diff --git a/modeling/__pycache__/network.cpython-310.pyc b/modeling/__pycache__/network.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1e24ba59db526ed096396bbdec7231924af14080
Binary files /dev/null and b/modeling/__pycache__/network.cpython-310.pyc differ
diff --git a/modeling/__pycache__/network.cpython-39.pyc b/modeling/__pycache__/network.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b0dbc222345defb5bcfc4b2b25f2436a28bbb62e
Binary files /dev/null and b/modeling/__pycache__/network.cpython-39.pyc differ
diff --git a/modeling/__pycache__/pipeline.cpython-310.pyc b/modeling/__pycache__/pipeline.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..06fbf50264bfec53238e42eeeffbbe93ebc72309
Binary files /dev/null and b/modeling/__pycache__/pipeline.cpython-310.pyc differ
diff --git a/modeling/__pycache__/pipeline.cpython-39.pyc b/modeling/__pycache__/pipeline.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3b6f448a85aa3681679b6a0220e05991327304e2
Binary files /dev/null and b/modeling/__pycache__/pipeline.cpython-39.pyc differ
diff --git a/modeling/__pycache__/sampling.cpython-310.pyc b/modeling/__pycache__/sampling.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3cbc4ce9d6be98aa8c91ca5e89a7b70a5aa73b65
Binary files /dev/null and b/modeling/__pycache__/sampling.cpython-310.pyc differ
diff --git a/modeling/__pycache__/sampling.cpython-39.pyc b/modeling/__pycache__/sampling.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cd43fa38580534f6c3529b97598859e04e057c5e
Binary files /dev/null and b/modeling/__pycache__/sampling.cpython-39.pyc differ
diff --git a/modeling/modules/__init__.py b/modeling/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/modeling/modules/__pycache__/__init__.cpython-310.pyc b/modeling/modules/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..26d1a67ae75b5fc828f74a222441e37ef31c5c49
Binary files /dev/null and b/modeling/modules/__pycache__/__init__.cpython-310.pyc differ
diff --git a/modeling/modules/__pycache__/__init__.cpython-39.pyc b/modeling/modules/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..efd5e8224015e51a22c022aa0c00bf89f45fa5c0
Binary files /dev/null and b/modeling/modules/__pycache__/__init__.cpython-39.pyc differ
diff --git a/modeling/modules/__pycache__/autoencoder.cpython-310.pyc b/modeling/modules/__pycache__/autoencoder.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..87778f69249fb5e35307f1855ad733f77fe83ab7
Binary files /dev/null and b/modeling/modules/__pycache__/autoencoder.cpython-310.pyc differ
diff --git a/modeling/modules/__pycache__/autoencoder.cpython-39.pyc b/modeling/modules/__pycache__/autoencoder.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..60d49145c3a9166b44d1d4ba1444f01ef86cd0f0
Binary files /dev/null and b/modeling/modules/__pycache__/autoencoder.cpython-39.pyc differ
diff --git a/modeling/modules/__pycache__/conditioner.cpython-310.pyc b/modeling/modules/__pycache__/conditioner.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eb41ca7fb58be6e6259e01bcd5e8f5111eca0907
Binary files /dev/null and b/modeling/modules/__pycache__/conditioner.cpython-310.pyc differ
diff --git a/modeling/modules/__pycache__/conditioner.cpython-39.pyc b/modeling/modules/__pycache__/conditioner.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..96556f864510732dcf0da9b0dfd49f92ab92e445
Binary files /dev/null and b/modeling/modules/__pycache__/conditioner.cpython-39.pyc differ
diff --git a/modeling/modules/__pycache__/layers.cpython-310.pyc b/modeling/modules/__pycache__/layers.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3cf2ed931e59d19d34bf3c949063a9a226172da7
Binary files /dev/null and b/modeling/modules/__pycache__/layers.cpython-310.pyc differ
diff --git a/modeling/modules/__pycache__/layers.cpython-39.pyc b/modeling/modules/__pycache__/layers.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3311172c3a35a44ad3de8fb64194fa8e17ee560b
Binary files /dev/null and b/modeling/modules/__pycache__/layers.cpython-39.pyc differ
diff --git a/modeling/modules/__pycache__/transformer.cpython-310.pyc b/modeling/modules/__pycache__/transformer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7ef794d66edc3dc9e95f34267f90abf5da0b3899
Binary files /dev/null and b/modeling/modules/__pycache__/transformer.cpython-310.pyc differ
diff --git a/modeling/modules/__pycache__/transformer.cpython-39.pyc b/modeling/modules/__pycache__/transformer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..89aa02981a9f87c1f44c33e817b0856f68dfc50f
Binary files /dev/null and b/modeling/modules/__pycache__/transformer.cpython-39.pyc differ
diff --git a/modeling/modules/autoencoder.py b/modeling/modules/autoencoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd2d6292e3bc134bea45d2007fac3522eefaaf59
--- /dev/null
+++ b/modeling/modules/autoencoder.py
@@ -0,0 +1,51 @@
+import torch
+from diffusers.models import AutoencoderKL  # type: ignore
+from torch import nn
+
+
+class AutoEncoder(nn.Module):
+    """Frozen Stable Diffusion 2.1 VAE with optional batch chunking.
+
+    Encoding returns the mean of the latent distribution multiplied by
+    ``scale_factor``; decoding divides by ``scale_factor`` before running the
+    VAE decoder. Inputs are split along dim 0 into pieces of ``chunk_size``
+    and processed one piece at a time.
+    """
+
+    scale_factor: float = 0.18215
+    downsample: int = 8
+
+    def __init__(self, chunk_size: int):
+        """Load the pretrained VAE, freeze it and remember ``chunk_size``."""
+        super().__init__()
+        vae = AutoencoderKL.from_pretrained(
+            "stabilityai/stable-diffusion-2-1-base",
+            subfolder="vae",
+            force_download=False,
+            low_cpu_mem_usage=False,
+        )
+        # Inference only: eval mode, no gradients for the VAE weights.
+        vae.eval().requires_grad_(False)  # type: ignore
+        self.module = vae
+        self.chunk_size = chunk_size
+
+    def _encode(self, x: torch.Tensor) -> torch.Tensor:
+        """Encode one piece; uses the distribution mean, scaled."""
+        posterior = self.module.encode(x).latent_dist  # type: ignore
+        return posterior.mean * self.scale_factor
+
+    def encode(self, x: torch.Tensor, chunk_size=None) -> torch.Tensor:
+        """Encode ``x``; a falsy ``chunk_size`` falls back to ``self.chunk_size``."""
+        size = chunk_size or self.chunk_size
+        if size is None:
+            return self._encode(x)
+        latents = [self._encode(piece) for piece in x.split(size)]
+        return torch.cat(latents, dim=0)
+
+    def _decode(self, z: torch.Tensor) -> torch.Tensor:
+        """Decode one piece after undoing the latent scaling."""
+        unscaled = z / self.scale_factor
+        return self.module.decode(unscaled).sample  # type: ignore
+
+    def decode(self, z: torch.Tensor, chunk_size=None) -> torch.Tensor:
+        """Decode ``z``; a falsy ``chunk_size`` falls back to ``self.chunk_size``."""
+        size = chunk_size or self.chunk_size
+        if size is None:
+            return self._decode(z)
+        images = [self._decode(piece) for piece in z.split(size)]
+        return torch.cat(images, dim=0)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Round-trip ``x`` through encode and decode."""
+        latents = self.encode(x)
+        return self.decode(latents)
diff --git a/modeling/modules/conditioner.py b/modeling/modules/conditioner.py
new file mode 100644
index 0000000000000000000000000000000000000000..31915d778c2ca0b118ba424bcb201fe35bf15e09
--- /dev/null
+++ b/modeling/modules/conditioner.py
@@ -0,0 +1,39 @@
+import kornia
+import open_clip
+import torch
+from torch import nn
+
+
+class CLIPConditioner(nn.Module):
+    """Frozen OpenCLIP ViT-H-14 image encoder with built-in preprocessing."""
+
+    mean: torch.Tensor
+    std: torch.Tensor
+
+    def __init__(self):
+        """Load the pretrained CLIP model and register normalization stats."""
+        super().__init__()
+        # create_model_and_transforms returns several items; only the model
+        # (index 0) is kept.
+        created = open_clip.create_model_and_transforms(
+            "ViT-H-14", pretrained="laion2b_s32b_b79k"
+        )
+        self.module = created[0]
+        self.module.eval().requires_grad_(False)  # type: ignore
+        # Non-persistent buffers: they follow the module across devices but
+        # are not written to the state dict.
+        channel_mean = torch.Tensor([0.48145466, 0.4578275, 0.40821073])
+        self.register_buffer("mean", channel_mean, persistent=False)
+        channel_std = torch.Tensor([0.26862954, 0.26130258, 0.27577711])
+        self.register_buffer("std", channel_std, persistent=False)
+
+    def preprocess(self, x: torch.Tensor) -> torch.Tensor:
+        """Resize to 224x224, shift/scale via ``(x + 1) / 2`` and normalize.
+
+        NOTE(review): the ``(x + 1) / 2`` step suggests inputs are expected in
+        [-1, 1] -- confirm against callers.
+        """
+        resized = kornia.geometry.resize(
+            x,
+            (224, 224),
+            interpolation="bicubic",
+            align_corners=True,
+            antialias=True,
+        )
+        rescaled = (resized + 1.0) / 2.0
+        return kornia.enhance.normalize(rescaled, self.mean, self.std)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Return CLIP image embeddings for ``x``."""
+        return self.module.encode_image(self.preprocess(x))
diff --git a/modeling/modules/layers.py b/modeling/modules/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..57f9410611ee8737910767f96cde7db6eb23acc5
--- /dev/null
+++ b/modeling/modules/layers.py
@@ -0,0 +1,141 @@
+import math
+
+import torch
+import torch.nn.functional as F
+from einops import repeat
+from torch import nn
+
+from .transformer import MultiviewTransformer
+
+from typing import Union
+
+def timestep_embedding(
+    timesteps: torch.Tensor,
+    dim: int,
+    max_period: int = 10000,
+    repeat_only: bool = False,
+) -> torch.Tensor:
+    """Build sinusoidal embeddings for a 1-D tensor of timesteps.
+
+    Args:
+        timesteps: 1-D tensor, one entry per batch element.
+        dim: width of the returned embedding.
+        max_period: controls the lowest frequency of the sinusoids.
+        repeat_only: when true, skip the sinusoids and simply repeat each
+            timestep ``dim`` times.
+
+    Returns:
+        Tensor of shape ``(len(timesteps), dim)``: cosines first, then sines,
+        then one zero column when ``dim`` is odd.
+    """
+    if repeat_only:
+        return repeat(timesteps, "b -> b d", d=dim)
+
+    half = dim // 2
+    exponents = torch.arange(start=0, end=half, dtype=torch.float32)
+    freqs = torch.exp(-math.log(max_period) * exponents / half).to(
+        device=timesteps.device
+    )
+    args = timesteps[:, None].float() * freqs[None]
+    columns = [torch.cos(args), torch.sin(args)]
+    if dim % 2:
+        # Odd widths get a trailing zero column.
+        columns.append(torch.zeros_like(args[:, :1]))
+    return torch.cat(columns, dim=-1)
+
+
+class Upsample(nn.Module):
+    """Nearest-neighbour 2x spatial upsampling followed by a 3x3 convolution."""
+
+    def __init__(self, channels: int, out_channels: Union[int, None] = None):
+        """``out_channels`` defaults to ``channels`` when falsy."""
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.conv = nn.Conv2d(
+            self.channels, self.out_channels, kernel_size=3, stride=1, padding=1
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Double the two trailing spatial dims of ``x`` and convolve."""
+        assert x.shape[1] == self.channels
+        enlarged = F.interpolate(x, scale_factor=2, mode="nearest")
+        return self.conv(enlarged)
+
+
+class Downsample(nn.Module):
+    """Halve the spatial resolution with a stride-2 3x3 convolution."""
+
+    def __init__(self, channels: int, out_channels: Union[int, None] = None):
+        """``out_channels`` defaults to ``channels`` when falsy."""
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.op = nn.Conv2d(
+            self.channels, self.out_channels, kernel_size=3, stride=2, padding=1
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply the strided convolution after checking the channel count."""
+        assert x.shape[1] == self.channels
+        downsampled = self.op(x)
+        return downsampled
+
+
+class GroupNorm32(nn.GroupNorm):
+    """GroupNorm evaluated in float32, with the result cast back to the input dtype."""
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        incoming_dtype = input.dtype
+        normalized = super().forward(input.float())
+        return normalized.type(incoming_dtype)
+
+
+class TimestepEmbedSequential(nn.Sequential):
+    """Sequential container that hands each child the extra inputs it needs.
+
+    ``MultiviewTransformer`` children receive ``(x, context, num_frames)``,
+    ``ResBlock`` children receive ``(x, emb, dense_emb)``, and every other
+    child is called with ``x`` alone.
+    """
+
+    def forward(  # type: ignore[override]
+        self,
+        x: torch.Tensor,
+        emb: torch.Tensor,
+        context: torch.Tensor,
+        dense_emb: torch.Tensor,
+        num_frames: int,
+    ) -> torch.Tensor:
+        for child in self:
+            if isinstance(child, MultiviewTransformer):
+                assert num_frames is not None
+                x = child(x, context, num_frames)
+                continue
+            if isinstance(child, ResBlock):
+                x = child(x, emb, dense_emb)
+                continue
+            x = child(x)
+        return x
+
+
+class ResBlock(nn.Module):
+    """Residual conv block modulated by a timestep embedding and a dense map.
+
+    The dense embedding is resized to the feature resolution, projected to a
+    per-pixel scale and shift, and applied to the normalized activations
+    before the first convolution. The timestep embedding is added after it.
+    """
+
+    def __init__(
+        self,
+        channels: int,
+        emb_channels: int,
+        out_channels: Union[int, None],
+        dense_in_channels: int,
+        dropout: float,
+    ):
+        """``out_channels`` defaults to ``channels`` when falsy."""
+        super().__init__()
+        out_channels = out_channels or channels
+
+        self.in_layers = nn.Sequential(
+            GroupNorm32(32, channels),
+            nn.SiLU(),
+            nn.Conv2d(channels, out_channels, kernel_size=3, stride=1, padding=1),
+        )
+        self.emb_layers = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(emb_channels, out_channels),
+        )
+        # Produces scale and shift, hence twice the input channel count.
+        self.dense_emb_layers = nn.Sequential(
+            nn.Conv2d(
+                dense_in_channels, 2 * channels, kernel_size=1, stride=1, padding=0
+            )
+        )
+        self.out_layers = nn.Sequential(
+            GroupNorm32(32, out_channels),
+            nn.SiLU(),
+            nn.Dropout(dropout),
+            nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1),
+        )
+        # A 1x1 conv on the skip path is needed only when the width changes.
+        if out_channels != channels:
+            self.skip_connection = nn.Conv2d(
+                channels, out_channels, kernel_size=1, stride=1, padding=0
+            )
+        else:
+            self.skip_connection = nn.Identity()
+
+    def forward(
+        self, x: torch.Tensor, emb: torch.Tensor, dense_emb: torch.Tensor
+    ) -> torch.Tensor:
+        """Return ``skip(x) + block(x, emb, dense_emb)``."""
+        norm_act = self.in_layers[:-1]
+        first_conv = self.in_layers[-1]
+
+        h = norm_act(x)
+        # Match the dense conditioning to the current spatial size.
+        dense_resized = F.interpolate(
+            dense_emb, size=h.shape[2:], mode="bilinear", align_corners=True
+        )
+        modulation = self.dense_emb_layers(dense_resized).type(h.dtype)
+        dense_scale, dense_shift = torch.chunk(modulation, 2, dim=1)
+        h = h * (1 + dense_scale) + dense_shift
+        h = first_conv(h)
+
+        emb_out = self.emb_layers(emb).type(h.dtype)
+        # Append trailing singleton dims so the embedding broadcasts over h.
+        missing_dims = len(h.shape) - len(emb_out.shape)
+        if missing_dims > 0:
+            emb_out = emb_out[(...,) + (None,) * missing_dims]
+        h = h + emb_out
+
+        h = self.out_layers(h)
+        return self.skip_connection(x) + h
diff --git a/modeling/modules/preprocessor.py b/modeling/modules/preprocessor.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5794463b3bb6892d5311b94c296bb83ea5245bf
--- /dev/null
+++ b/modeling/modules/preprocessor.py
@@ -0,0 +1,116 @@
+import contextlib
+import os
+import os.path as osp
+import sys
+from typing import cast
+
+import imageio.v3 as iio
+import numpy as np
+import torch
+
+
+class Dust3rPipeline(object):
+    """Wrapper around the DUSt3R submodule for camera and point-cloud recovery.
+
+    The DUSt3R package is imported lazily from ``third_party/dust3r`` so that
+    importing this module does not require the submodule to be checked out.
+    """
+
+    # Annotations that use ``X | None`` / ``X | Y`` are written as strings so
+    # they are never evaluated; the bare ``str | torch.device`` form raises
+    # TypeError at definition time on Python < 3.10.
+    def __init__(self, device: "str | torch.device" = "cuda"):
+        """Import DUSt3R, load the pretrained model and move it to ``device``.
+
+        Raises:
+            ImportError: if the ``dust3r`` submodule cannot be imported.
+        """
+        submodule_path = osp.realpath(
+            osp.join(osp.dirname(__file__), "../../third_party/dust3r/")
+        )
+        if submodule_path not in sys.path:
+            sys.path.insert(0, submodule_path)
+        try:
+            # Silence anything the DUSt3R modules print while being imported.
+            with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
+                from dust3r.cloud_opt import (  # type: ignore[import]
+                    GlobalAlignerMode,
+                    global_aligner,
+                )
+                from dust3r.image_pairs import make_pairs  # type: ignore[import]
+                from dust3r.inference import inference  # type: ignore[import]
+                from dust3r.model import AsymmetricCroCo3DStereo  # type: ignore[import]
+                from dust3r.utils.image import load_images  # type: ignore[import]
+        except ImportError as err:
+            raise ImportError(
+                "Missing required submodule: 'dust3r'. Please ensure that all submodules are properly set up.\n\n"
+                "To initialize them, run the following command in the project root:\n"
+                "  git submodule update --init --recursive"
+            ) from err
+
+        self.device = torch.device(device)
+        self.model = AsymmetricCroCo3DStereo.from_pretrained(
+            "naver/DUSt3R_ViTLarge_BaseDecoder_512_dpt"
+        ).to(self.device)
+
+        # Keep handles to the lazily imported DUSt3R entry points.
+        self._GlobalAlignerMode = GlobalAlignerMode
+        self._global_aligner = global_aligner
+        self._make_pairs = make_pairs
+        self._inference = inference
+        self._load_images = load_images
+
+    def infer_cameras_and_points(
+        self,
+        img_paths: list[str],
+        Ks: "list[list] | None" = None,
+        c2ws: "list[list] | None" = None,
+        batch_size: int = 16,
+        schedule: str = "cosine",
+        lr: float = 0.01,
+        niter: int = 500,
+        min_conf_thr: int = 3,
+    ) -> tuple[
+        list[np.ndarray], np.ndarray, np.ndarray, list[np.ndarray], list[np.ndarray]
+    ]:
+        """Estimate intrinsics, poses and 3D points for the given images.
+
+        Args:
+            img_paths: image files to process. A single path is duplicated to
+                form a pair, so every returned sequence then has two entries.
+            Ks: currently unused; the focal preset that consumed it is
+                commented out below. The name is reused for the result.
+            c2ws: optional poses passed to ``scene.preset_pose`` before the
+                alignment runs.
+            batch_size: batch size forwarded to DUSt3R inference.
+            schedule: schedule name forwarded to the global alignment.
+            lr: learning rate forwarded to the global alignment.
+            niter: iteration count forwarded to the global alignment.
+            min_conf_thr: confidence threshold forwarded to the global aligner.
+
+        Returns:
+            ``(imgs, Ks, c2ws, points, point_colors)`` where ``imgs`` are the
+            images re-read at their original size, ``Ks`` are intrinsics
+            rescaled to that size, ``c2ws`` are the scene's image poses, and
+            ``points`` / ``point_colors`` are per-view arrays.
+        """
+        num_img = len(img_paths)
+        if num_img == 1:
+            print("Only one image found, duplicating it to create a stereo pair.")
+            img_paths = img_paths * 2
+
+        images = self._load_images(img_paths, size=512)
+        pairs = self._make_pairs(
+            images,
+            scene_graph="complete",
+            prefilter=None,
+            symmetrize=True,
+        )
+        output = self._inference(pairs, self.model, self.device, batch_size=batch_size)
+
+        ori_imgs = [iio.imread(p) for p in img_paths]
+        # (width, height) of the originals and of the resized network inputs.
+        ori_img_whs = np.array([img.shape[1::-1] for img in ori_imgs])
+        img_whs = np.concatenate([image["true_shape"][:, ::-1] for image in images], 0)
+
+        scene = self._global_aligner(
+            output,
+            device=self.device,
+            mode=self._GlobalAlignerMode.PointCloudOptimizer,
+            same_focals=True,
+            optimize_pp=False,  # True,
+            min_conf_thr=min_conf_thr,
+        )
+
+        # if Ks is not None:
+        #     scene.preset_focal(
+        #         torch.tensor([[K[0, 0], K[1, 1]] for K in Ks])
+        #     )
+
+        if c2ws is not None:
+            scene.preset_pose(c2ws)
+
+        _ = scene.compute_global_alignment(
+            init="msp", niter=niter, schedule=schedule, lr=lr
+        )
+
+        imgs = cast(list, scene.imgs)
+        Ks = scene.get_intrinsics().detach().cpu().numpy().copy()
+        c2ws = scene.get_im_poses().detach().cpu().numpy()  # type: ignore
+        pts3d = [x.detach().cpu().numpy() for x in scene.get_pts3d()]  # type: ignore
+        if num_img > 1:
+            # Keep only the points selected by the scene's masks.
+            masks = [x.detach().cpu().numpy() for x in scene.get_masks()]
+            points = [p[m] for p, m in zip(pts3d, masks)]
+            point_colors = [img[m] for img, m in zip(imgs, masks)]
+        else:
+            # Single (duplicated) image: keep every point, unmasked.
+            points = [p.reshape(-1, 3) for p in pts3d]
+            point_colors = [img.reshape(-1, 3) for img in imgs]
+
+        # Convert back to the original image size.
+        imgs = ori_imgs
+        size_ratio = ori_img_whs / img_whs
+        Ks[:, :2, -1] *= size_ratio
+        # The upper-left 2x2 block is scaled by the mean of the x/y ratios.
+        Ks[:, :2, :2] *= size_ratio.mean(axis=1, keepdims=True)[..., None]
+
+        return imgs, Ks, c2ws, points, point_colors
diff --git a/modeling/modules/transformer.py b/modeling/modules/transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..09f19857f3d509e9e483fd8e6f6ad911494919d0
--- /dev/null
+++ b/modeling/modules/transformer.py
@@ -0,0 +1,248 @@
+import torch
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from torch import nn
+from torch.nn.attention import SDPBackend, sdpa_kernel
+from typing import Union
+
+
+class GEGLU(nn.Module):
+    """Gated linear unit: one projection split into a value half and a GELU gate."""
+
+    def __init__(self, dim_in: int, dim_out: int):
+        super().__init__()
+        # A single linear layer yields both halves, hence 2 * dim_out.
+        self.proj = nn.Linear(dim_in, dim_out * 2)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        projected = self.proj(x)
+        value, gate = projected.chunk(2, dim=-1)
+        return value * F.gelu(gate)
+
+
+class FeedForward(nn.Module):
+    """GEGLU -> dropout -> linear feed-forward block.
+
+    The hidden width is ``int(dim * mult)``; ``dim_out`` defaults to ``dim``
+    when falsy.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        dim_out: Union[int, None] = None,
+        mult: int = 4,
+        dropout: float = 0.0,
+    ):
+        super().__init__()
+        hidden_dim = int(dim * mult)
+        output_dim = dim_out or dim
+        stages = (
+            GEGLU(dim, hidden_dim),
+            nn.Dropout(dropout),
+            nn.Linear(hidden_dim, output_dim),
+        )
+        self.net = nn.Sequential(*stages)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = self.net(x)
+        return out
+
+
+class Attention(nn.Module):
+    """Multi-head attention built on ``F.scaled_dot_product_attention``.
+
+    Acts as self-attention when no ``context`` is given and as cross-attention
+    otherwise. Queries come from ``x``; keys and values come from ``context``.
+    """
+
+    def __init__(
+        self,
+        query_dim: int,
+        context_dim: Union[int, None] = None,
+        heads: int = 8,
+        dim_head: int = 64,
+        dropout: float = 0.0,
+    ):
+        """Create the projections.
+
+        Args:
+            query_dim: feature width of ``x`` and of the output.
+            context_dim: feature width of ``context``; defaults to
+                ``query_dim`` when falsy.
+            heads: number of attention heads.
+            dim_head: width of each head.
+            dropout: dropout applied after the output projection.
+        """
+        super().__init__()
+        self.heads = heads
+        self.dim_head = dim_head
+        inner_dim = dim_head * heads
+        context_dim = context_dim or query_dim
+
+        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
+        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
+        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
+        self.to_out = nn.Sequential(
+            nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
+        )
+
+    def forward(
+        self, x: torch.Tensor, context: Union[torch.Tensor, None] = None
+    ) -> torch.Tensor:
+        """Attend from ``x`` to ``context`` (or to ``x`` itself when ``None``)."""
+        q = self.to_q(x)
+        context = context if context is not None else x
+        k = self.to_k(context)
+        v = self.to_v(context)
+        # Split the channel dim into heads: (b, l, h*d) -> (b, h, l, d).
+        q, k, v = map(
+            lambda t: rearrange(t, "b l (h d) -> b h l d", h=self.heads),
+            (q, k, v),
+        )
+        # Allowing only FLASH_ATTENTION makes SDPA raise "No available kernel"
+        # whenever flash cannot run (for example float32 CUDA inputs or GPUs
+        # without flash support). Permit the other fused/math kernels as
+        # fallbacks; flash is still used wherever it is eligible.
+        with sdpa_kernel(
+            [
+                SDPBackend.FLASH_ATTENTION,
+                SDPBackend.EFFICIENT_ATTENTION,
+                SDPBackend.MATH,
+            ]
+        ):
+            out = F.scaled_dot_product_attention(q, k, v)
+        # Merge heads back: (b, h, l, d) -> (b, l, h*d).
+        out = rearrange(out, "b h l d -> b l (h d)")
+        out = self.to_out(out)
+        return out
+
+
+class TransformerBlock(nn.Module):
+    """Pre-norm block: self-attention, cross-attention, feed-forward.
+
+    Each of the three sub-layers is wrapped in a residual connection.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        n_heads: int,
+        d_head: int,
+        context_dim: int,
+        dropout: float = 0.0,
+    ):
+        super().__init__()
+        shared = dict(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout)
+        # attn1 attends over x itself; attn2 attends over the external context.
+        self.attn1 = Attention(context_dim=None, **shared)
+        self.ff = FeedForward(dim, dropout=dropout)
+        self.attn2 = Attention(context_dim=context_dim, **shared)
+        self.norm1 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim)
+        self.norm3 = nn.LayerNorm(dim)
+
+    def forward(self, x: torch.Tensor, context: torch.Tensor) -> torch.Tensor:
+        self_attended = self.attn1(self.norm1(x))
+        x = self_attended + x
+        cross_attended = self.attn2(self.norm2(x), context=context)
+        x = cross_attended + x
+        fed_forward = self.ff(self.norm3(x))
+        return fed_forward + x
+
+
+class TransformerBlockTimeMix(nn.Module):
+    """Transformer block that attends across frames instead of across tokens.
+
+    The input is regrouped from ``(b t) s c`` to ``(b s) t c`` so attention
+    runs along the frame axis, then regrouped back. The final feed-forward has
+    no residual connection of its own.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        n_heads: int,
+        d_head: int,
+        context_dim: int,
+        dropout: float = 0.0,
+    ):
+        super().__init__()
+        inner_dim = n_heads * d_head
+        attn_kwargs = dict(
+            query_dim=inner_dim, heads=n_heads, dim_head=d_head, dropout=dropout
+        )
+        self.norm_in = nn.LayerNorm(dim)
+        self.ff_in = FeedForward(dim, dim_out=inner_dim, dropout=dropout)
+        self.attn1 = Attention(context_dim=None, **attn_kwargs)
+        self.ff = FeedForward(inner_dim, dim_out=dim, dropout=dropout)
+        self.attn2 = Attention(context_dim=context_dim, **attn_kwargs)
+        self.norm1 = nn.LayerNorm(inner_dim)
+        self.norm2 = nn.LayerNorm(inner_dim)
+        self.norm3 = nn.LayerNorm(inner_dim)
+
+    def forward(
+        self, x: torch.Tensor, context: torch.Tensor, num_frames: int
+    ) -> torch.Tensor:
+        _, tokens_per_frame, _ = x.shape
+        # Put the frame axis in the sequence position.
+        h = rearrange(x, "(b t) s c -> (b s) t c", t=num_frames)
+        h = self.ff_in(self.norm_in(h)) + h
+        h = self.attn1(self.norm1(h), context=None) + h
+        h = self.attn2(self.norm2(h), context=context) + h
+        h = self.ff(self.norm3(h))
+        # Restore the original (b t) s c grouping.
+        return rearrange(h, "(b s) t c -> (b t) s c", s=tokens_per_frame)
+
+
+class SkipConnect(nn.Module):
+    """Parameter-free mixer that sums the spatial and temporal branches."""
+
+    def forward(
+        self, x_spatial: torch.Tensor, x_temporal: torch.Tensor
+    ) -> torch.Tensor:
+        mixed = x_spatial + x_temporal
+        return mixed
+
+
+class MultiviewTransformer(nn.Module):
+    """Interleaved spatial and frame-axis transformer over image features.
+
+    Each depth level runs a ``TransformerBlock`` over the tokens and a
+    ``TransformerBlockTimeMix`` across frames, then sums the two. Layers whose
+    ``name`` appears in ``unflatten_names`` merge all frames of a sample into
+    one token sequence for the spatial block.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        n_heads: int,
+        d_head: int,
+        name: str,
+        unflatten_names: Union[list[str], None] = None,
+        depth: int = 1,
+        context_dim: int = 1024,
+        dropout: float = 0.0,
+    ):
+        """Build the projections and ``depth`` pairs of blocks.
+
+        Args:
+            in_channels: channel count of the input feature map.
+            n_heads: attention heads; the inner width is ``n_heads * d_head``.
+            d_head: width of each head.
+            name: identifier of this layer, matched against ``unflatten_names``.
+            unflatten_names: layer names that attend jointly over all frames.
+                ``None`` (the default) means no layer does.
+            depth: number of (spatial, time-mix) block pairs.
+            context_dim: feature width of the cross-attention context.
+            dropout: dropout rate forwarded to the blocks.
+        """
+        super().__init__()
+        self.in_channels = in_channels
+        self.name = name
+        # A literal [] default would be one list shared by every instance
+        # built without the argument; create a fresh one instead.
+        self.unflatten_names = unflatten_names if unflatten_names is not None else []
+
+        inner_dim = n_heads * d_head
+        self.norm = nn.GroupNorm(32, in_channels, eps=1e-6)
+        self.proj_in = nn.Linear(in_channels, inner_dim)
+        self.transformer_blocks = nn.ModuleList(
+            [
+                TransformerBlock(
+                    inner_dim,
+                    n_heads,
+                    d_head,
+                    context_dim=context_dim,
+                    dropout=dropout,
+                )
+                for _ in range(depth)
+            ]
+        )
+        self.proj_out = nn.Linear(inner_dim, in_channels)
+        self.time_mixer = SkipConnect()
+        self.time_mix_blocks = nn.ModuleList(
+            [
+                TransformerBlockTimeMix(
+                    inner_dim,
+                    n_heads,
+                    d_head,
+                    context_dim=context_dim,
+                    dropout=dropout,
+                )
+                for _ in range(depth)
+            ]
+        )
+
+    def forward(
+        self, x: torch.Tensor, context: torch.Tensor, num_frames: int
+    ) -> torch.Tensor:
+        """Apply the blocks and add the result to the input.
+
+        Args:
+            x: 4-D feature map; the rearranges below treat dim 0 as ``(b t)``
+                with ``t == num_frames``.
+            context: 3-D cross-attention context.
+            num_frames: number of frames per sample.
+        """
+        assert context.ndim == 3
+        _, _, h, w = x.shape
+        x_in = x
+
+        # The time-mix blocks see the context of every num_frames-th entry,
+        # repeated once per spatial location.
+        time_context = context
+        time_context_first_timestep = time_context[::num_frames]
+        time_context = repeat(
+            time_context_first_timestep, "b ... -> (b n) ...", n=h * w
+        )
+
+        # Loop-invariant: whether this layer attends jointly over all frames.
+        unflatten = self.name in self.unflatten_names
+        if unflatten:
+            context = context[::num_frames]
+
+        x = self.norm(x)
+        x = rearrange(x, "b c h w -> b (h w) c")
+        x = self.proj_in(x)
+
+        for block, mix_block in zip(self.transformer_blocks, self.time_mix_blocks):
+            if unflatten:
+                x = rearrange(x, "(b t) (h w) c -> b (t h w) c", t=num_frames, h=h, w=w)
+            x = block(x, context=context)
+            if unflatten:
+                x = rearrange(x, "b (t h w) c -> (b t) (h w) c", t=num_frames, h=h, w=w)
+            x_mix = mix_block(x, context=time_context, num_frames=num_frames)
+            x = self.time_mixer(x_spatial=x, x_temporal=x_mix)
+
+        x = self.proj_out(x)
+        x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w)
+        out = x + x_in
+        return out
diff --git a/modeling/network.py b/modeling/network.py
new file mode 100644
index 0000000000000000000000000000000000000000..59e025ba0f8d0239cca6deb68d92c6abbd0bcfc1
--- /dev/null
+++ b/modeling/network.py
@@ -0,0 +1,236 @@
+from dataclasses import dataclass, field
+
+import torch
+import torch.nn as nn
+
+from typing import Union
+
+from modeling.modules.layers import (
+    Downsample,
+    GroupNorm32,
+    ResBlock,
+    TimestepEmbedSequential,
+    Upsample,
+    timestep_embedding,
+)
+from modeling.modules.transformer import MultiviewTransformer
+
+
+@dataclass
+class VMemModelParams:
+    """Hyper-parameters from which ``VMemModel`` builds its network."""
+
+    # Channels of the input tensor and of the first convolution's output.
+    in_channels: int = 11
+    model_channels: int = 320
+    # Channels produced by the final convolution.
+    out_channels: int = 4
+    # Frame count used when ``VMemModel.forward`` is not given one.
+    num_frames: int = 8
+    num_res_blocks: int = 2
+    # Downsampling factors at which a MultiviewTransformer is inserted.
+    attention_resolutions: list[int] = field(default_factory=lambda: [4, 2, 1])
+    # Per-level multiplier applied to ``model_channels``.
+    channel_mult: list[int] = field(default_factory=lambda: [1, 2, 4, 4])
+    num_head_channels: int = 64
+    # Per-level transformer depth; must have one entry per channel_mult entry.
+    transformer_depth: list[int] = field(default_factory=lambda: [1, 1, 1, 1])
+    context_dim: int = 1024
+    dense_in_channels: int = 6
+    dropout: float = 0.0
+    # Names of the transformer layers listed for joint attention over frames.
+    unflatten_names: list[str] = field(
+        default_factory=lambda: ["middle_ds8", "output_ds4", "output_ds2"]
+    )
+
+    def __post_init__(self):
+        num_levels = len(self.channel_mult)
+        assert num_levels == len(self.transformer_depth)
+
+
+class VMemModel(nn.Module):
+    """U-Net style network with ResBlocks and multiview transformers.
+
+    The encoder (``input_blocks``) records every intermediate activation; the
+    decoder (``output_blocks``) consumes them in reverse order as skip
+    connections, concatenated along the channel dim.
+    """
+
+    def __init__(self, params: VMemModelParams) -> None:
+        """Build encoder, middle block, decoder and output head from ``params``."""
+        super().__init__()
+        self.params = params
+        self.model_channels = params.model_channels
+        self.out_channels = params.out_channels
+        self.num_head_channels = params.num_head_channels
+
+        time_embed_dim = params.model_channels * 4
+        self.time_embed = nn.Sequential(
+            nn.Linear(params.model_channels, time_embed_dim),
+            nn.SiLU(),
+            nn.Linear(time_embed_dim, time_embed_dim),
+        )
+
+        self.input_blocks = nn.ModuleList(
+            [
+                TimestepEmbedSequential(
+                    nn.Conv2d(params.in_channels, params.model_channels, 3, padding=1)
+                )
+            ]
+        )
+        self._feature_size = params.model_channels
+        # Channel count of every encoder output, popped again by the decoder.
+        input_block_chans = [params.model_channels]
+        ch = params.model_channels
+        # Current downsampling factor relative to the input resolution.
+        ds = 1
+        for level, mult in enumerate(params.channel_mult):
+            for _ in range(params.num_res_blocks):
+                input_layers: list[Union[ResBlock, MultiviewTransformer, Downsample]] = [
+                    ResBlock(
+                        channels=ch,
+                        emb_channels=time_embed_dim,
+                        out_channels=mult * params.model_channels,
+                        dense_in_channels=params.dense_in_channels,
+                        dropout=params.dropout,
+                    )
+                ]
+                ch = mult * params.model_channels
+                if ds in params.attention_resolutions:
+                    num_heads = ch // params.num_head_channels
+                    dim_head = params.num_head_channels
+                    input_layers.append(
+                        MultiviewTransformer(
+                            ch,
+                            num_heads,
+                            dim_head,
+                            name=f"input_ds{ds}",
+                            depth=params.transformer_depth[level],
+                            context_dim=params.context_dim,
+                            unflatten_names=params.unflatten_names,
+                        )
+                    )
+                self.input_blocks.append(TimestepEmbedSequential(*input_layers))
+                self._feature_size += ch
+                input_block_chans.append(ch)
+            # Every level except the last ends with a downsampling block.
+            if level != len(params.channel_mult) - 1:
+                ds *= 2
+                out_ch = ch
+                self.input_blocks.append(
+                    TimestepEmbedSequential(Downsample(ch, out_channels=out_ch))
+                )
+                ch = out_ch
+                input_block_chans.append(ch)
+                self._feature_size += ch
+
+        num_heads = ch // params.num_head_channels
+        dim_head = params.num_head_channels
+
+        self.middle_block = TimestepEmbedSequential(
+            ResBlock(
+                channels=ch,
+                emb_channels=time_embed_dim,
+                out_channels=None,
+                dense_in_channels=params.dense_in_channels,
+                dropout=params.dropout,
+            ),
+            MultiviewTransformer(
+                ch,
+                num_heads,
+                dim_head,
+                name=f"middle_ds{ds}",
+                depth=params.transformer_depth[-1],
+                context_dim=params.context_dim,
+                unflatten_names=params.unflatten_names,
+            ),
+            ResBlock(
+                channels=ch,
+                emb_channels=time_embed_dim,
+                out_channels=None,
+                dense_in_channels=params.dense_in_channels,
+                dropout=params.dropout,
+            ),
+        )
+        self._feature_size += ch
+
+        self.output_blocks = nn.ModuleList([])
+        for level, mult in list(enumerate(params.channel_mult))[::-1]:
+            for i in range(params.num_res_blocks + 1):
+                # Width of the encoder activation concatenated at this step.
+                ich = input_block_chans.pop()
+                output_layers: list[Union[ResBlock, MultiviewTransformer, Upsample]] = [
+                    ResBlock(
+                        channels=ch + ich,
+                        emb_channels=time_embed_dim,
+                        out_channels=params.model_channels * mult,
+                        dense_in_channels=params.dense_in_channels,
+                        dropout=params.dropout,
+                    )
+                ]
+                ch = params.model_channels * mult
+                if ds in params.attention_resolutions:
+                    num_heads = ch // params.num_head_channels
+                    dim_head = params.num_head_channels
+
+                    output_layers.append(
+                        MultiviewTransformer(
+                            ch,
+                            num_heads,
+                            dim_head,
+                            name=f"output_ds{ds}",
+                            depth=params.transformer_depth[level],
+                            context_dim=params.context_dim,
+                            unflatten_names=params.unflatten_names,
+                        )
+                    )
+                # The last block of every level but level 0 upsamples.
+                if level and i == params.num_res_blocks:
+                    out_ch = ch
+                    ds //= 2
+                    output_layers.append(Upsample(ch, out_ch))
+                self.output_blocks.append(TimestepEmbedSequential(*output_layers))
+                self._feature_size += ch
+
+        # The head consumes the decoder's final width ``ch``. That equals
+        # ``model_channels`` only when ``channel_mult[0] == 1``; using ``ch``
+        # for the conv keeps it consistent with the norm for any config.
+        self.out = nn.Sequential(
+            GroupNorm32(32, ch),
+            nn.SiLU(),
+            nn.Conv2d(ch, params.out_channels, 3, padding=1),
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        t: torch.Tensor,
+        y: torch.Tensor,
+        dense_y: torch.Tensor,
+        num_frames: Union[int, None] = None,
+    ) -> torch.Tensor:
+        """Run the network.
+
+        Args:
+            x: input feature map fed to the first convolution.
+            t: timesteps, embedded sinusoidally and passed to every ResBlock.
+            y: cross-attention context for the transformers.
+            dense_y: dense conditioning map passed to every ResBlock.
+            num_frames: frames per sample; a falsy value falls back to
+                ``params.num_frames``.
+
+        Returns:
+            Output of the final head, with ``params.out_channels`` channels.
+        """
+        num_frames = num_frames or self.params.num_frames
+        t_emb = timestep_embedding(t, self.model_channels)
+        t_emb = self.time_embed(t_emb)
+
+        # Encoder activations, consumed last-in-first-out by the decoder.
+        hs = []
+        h = x
+        for module in self.input_blocks:
+            h = module(
+                h,
+                emb=t_emb,
+                context=y,
+                dense_emb=dense_y,
+                num_frames=num_frames,
+            )
+            hs.append(h)
+        h = self.middle_block(
+            h,
+            emb=t_emb,
+            context=y,
+            dense_emb=dense_y,
+            num_frames=num_frames,
+        )
+        for module in self.output_blocks:
+            h = torch.cat([h, hs.pop()], dim=1)
+            h = module(
+                h,
+                emb=t_emb,
+                context=y,
+                dense_emb=dense_y,
+                num_frames=num_frames,
+            )
+        h = h.type(x.dtype)
+        return self.out(h)
+
+
+class VMemWrapper(nn.Module):
+    """Adapter that unpacks a conditioning dict into ``VMemModel`` arguments."""
+
+    def __init__(self, module: VMemModel):
+        super().__init__()
+        self.module = module
+
+    def forward(
+        self, x: torch.Tensor, t: torch.Tensor, c: dict, **kwargs
+    ) -> torch.Tensor:
+        """Concatenate ``c["concat"]`` (if any) onto ``x`` and call the model.
+
+        ``c["crossattn"]`` and ``c["dense_vector"]`` are required; extra
+        keyword arguments are forwarded unchanged.
+        """
+        # An empty tensor stands in when no "concat" conditioning is given.
+        empty = torch.Tensor([]).type_as(x)
+        extra_channels = c.get("concat", empty)
+        model_input = torch.cat((x, extra_channels), dim=1)
+        return self.module(
+            model_input,
+            t=t,
+            y=c["crossattn"],
+            dense_y=c["dense_vector"],
+            **kwargs,
+        )
diff --git a/modeling/pipeline.py b/modeling/pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..43d55f9af4bd6482a829126263e5c604c5804210
--- /dev/null
+++ b/modeling/pipeline.py
@@ -0,0 +1,1454 @@
+from typing import List, Union
+from copy import deepcopy
+
+import math
+
+# import matplotlib.pyplot as plt
+# from mpl_toolkits.mplot3d.art3d import Poly3DCollection
+
+import PIL
+from PIL import Image, ImageOps
+import numpy as np
+from einops import repeat
+# from scipy.spatial import cKDTree
+
+import torch
+import torch.nn.functional as F
+from torch.amp import autocast
+import torchvision.transforms as tvf
+
+
+# from diffusers import AutoencoderKL, DiffusionPipeline
+# from diffusers.schedulers import DDIMScheduler
+from diffusers.utils import export_to_gif
+
+import sys
+# Add CUT3R to Python path for imports
+sys.path.append("./extern/CUT3R")
+from extern.CUT3R.surfel_inference import run_inference_from_pil
+from extern.CUT3R.add_ckpt_path import add_path_to_dust3r
+from extern.CUT3R.src.dust3r.model import ARCroco3DStereo
+
+from modeling import VMemWrapper, VMemModel, VMemModelParams
+from modeling.modules.autoencoder import AutoEncoder
+from modeling.sampling import DDPMDiscretization, DiscreteDenoiser, create_samplers
+from modeling.modules.conditioner import CLIPConditioner
+from utils import (encode_vae_image, 
+                   encode_image, 
+                   visualize_depth, 
+                   visualize_surfels, 
+                   tensor_to_pil,
+                   Octree, 
+                   Surfel,
+                   get_plucker_coordinates,
+                   do_sample,
+                   average_camera_pose)
+from utils.training_utils import load_pretrained_model
+
+
+
+
+
+
+# Image preprocessing: convert to a tensor, then normalize each of the three
+# channels with mean 0.5 and std 0.5, i.e. (x - 0.5) / 0.5, so inputs in
+# [0, 1] end up in [-1, 1].
+ImgNorm = tvf.Compose([tvf.ToTensor(), tvf.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
+
+
+
+class VMemPipeline:
+    def __init__(self, config, device="cpu", dtype=torch.float32):
+        self.config = config
+        
+        model_path = self.config.model.get("model_path", None)
+        if model_path is None:
+            self.model = load_pretrained_model(cache_dir=self.config.model.cache_dir, device=device)
+        else:
+            self.model = VMemModel(VMemModelParams()).to(device, dtype)
+            # load from huggingface
+            from huggingface_hub import hf_hub_download
+            state_dict = torch.load(hf_hub_download(repo_id=model_path, filename="vmem_weights.pth"), map_location='cpu')
+            state_dict = {k.replace("module.", "") if "module." in k else k: v for k, v in state_dict.items()}
+                    
+            
+            self.model.load_state_dict(state_dict, strict=True)
+     
+        
+        self.model_wrapper = VMemWrapper(self.model)
+        self.model_wrapper.eval()
+
+        
+        self.vae = AutoEncoder(chunk_size=1).to(device, dtype)
+        self.vae.eval()
+        self.image_encoder = CLIPConditioner().to(device, dtype)
+        self.image_encoder.eval()
+        
+        self.discretization = DDPMDiscretization()
+        self.denoiser = DiscreteDenoiser(discretization=self.discretization, num_idx=1000, device=device)
+        self.sampler = create_samplers(guider_types=config.model.guider_types,
+                                discretization=self.discretization,
+                                num_frames=config.model.num_frames,
+                                num_steps=config.model.inference_num_steps,
+                                cfg_min=config.model.cfg_min,
+                                device=device)
+
+        
+                
+        self.dtype = dtype
+        self.device = device
+        
+
+        self.use_surfel = self.config.surfel.use_surfel
+        if self.use_surfel:
+            # Initialize CUT3R-based reconstructor
+            # Load and prepare the model
+            surfel_model_path = self.config.surfel.model_path
+            print(f"Loading model from {surfel_model_path}...")
+            add_path_to_dust3r(surfel_model_path)
+            self.surfel_model = ARCroco3DStereo.from_pretrained(surfel_model_path).to(device)
+            self.surfel_model.eval()
+            
+            # Import CUT3R scene alignment module
+            from extern.CUT3R.cloud_opt.dust3r_opt import global_aligner, GlobalAlignerMode
+            self.GlobalAlignerMode = GlobalAlignerMode
+            self.global_aligner = global_aligner
+            
+            
+
+        else:
+            self.surfel_model = None
+
+            
+        
+        self.temporal_only = self.config.model.temporal_only
+        self.use_non_maximum_suppression = self.config.model.use_non_maximum_suppression
+        
+        self.context_num_frames = self.config.model.context_num_frames
+        self.target_num_frames = self.config.model.target_num_frames
+        
+        self.original_height = self.config.model.original_height
+        self.original_width = self.config.model.original_width
+        self.height = self.config.model.height
+        self.width = self.config.model.width
+        
+        self.w_ratio = self.width / self.original_width
+        self.h_ratio = self.height / self.original_height
+        
+        self.camera_scale = self.config.model.camera_scale
+        
+        self.latents = []
+        self.encoder_embeddings = []
+        self.poses = []
+        self.Ks = []
+        self.surfel_Ks = []
+        self.surfels = []
+
+        self.surfel_depths = []
+        self.surfel_to_timestep = {}
+        self.pil_frames = []
+        self.visualize_dir = self.config.model.samples_dir
+        
+        self.global_step = 0
+       
+
+    def reset(self):
+        self.rgb_vae_latents = []
+        self.rgb_encoder_embeddings = []
+        self.poses = []
+        self.focal_lengths = []
+        self.surfels = []
+        self.surfel_Ks = []
+        self.surfel_depths = []
+        self.Ks = []
+        self.surfel_to_timestep = {}
+        self.all_pil_frames = []
+        self.global_step = 0
+
+    
+    def initialize(self, image, c2w, K):
+        """
+        Initialize the pipeline with a single image and camera parameters.
+        This method sets up internal state without generating additional frames.
+        
+        Args:
+            image: Tensor of input image [1, C, H, W]
+            c2w: Camera-to-world matrix (4x4)
+            K: Camera intrinsic matrix
+            
+        Returns:
+            PIL image of the initial frame
+        """
+        # Reset internal state
+        self.reset()
+        
+        # Process the image
+        if isinstance(image, torch.Tensor):
+            image_tensor = image
+        else:
+            # Convert to tensor if it's not already (fallback)
+            image_tensor = torch.from_numpy(np.array(image)).permute(2, 0, 1).float() / 127.5 - 1.0
+            image_tensor = image_tensor.unsqueeze(0).to(self.device, self.dtype)
+        
+        # Encode the image to VAE latents
+        self.latents = [encode_vae_image(image_tensor, self.vae, self.device, self.dtype).detach().cpu().numpy()[0]]
+        
+        # Encode the image embeddings for the image_encoder
+        self.encoder_embeddings = [encode_image(image_tensor, self.image_encoder, self.device, self.dtype).detach().cpu().numpy()[0]]
+        
+        # Store camera pose and intrinsics
+        self.c2ws = [c2w]
+        self.Ks = [K]
+        
+        # Convert to PIL and store
+        pil_frame = tensor_to_pil(image_tensor)
+        self.pil_frames = [pil_frame]
+        
+
+        
+        return pil_frame
+    
+    def geodesic_distance(self,
+                        camera_pose1,
+                        camera_pose2,
+                        weight_translation=1,):
+        """
+        Computes the geodesic distance between two camera poses in SE(3).
+        
+        Parameters:
+            extrinsic1 (torch.Tensor): 4x4 extrinsic matrix of the first pose.
+            extrinsic2 (torch.Tensor): 4x4 extrinsic matrix of the second pose.
+
+        Returns:
+            float: Geodesic distance between the two poses.
+        """
+        # Extract the rotation and translation components
+        R1 = camera_pose1[:3, :3]
+        t1 = camera_pose1[:3, 3]
+        R2 = camera_pose2[:3, :3]
+        t2 = camera_pose2[:3, 3]
+
+        # Compute the translation distance (Euclidean distance)
+        translation_distance = torch.norm(t1 - t2)
+        
+        # Compute the relative rotation matrix
+        R_relative = torch.matmul(R1.T, R2)
+        
+        # Compute the angular distance from the trace of the relative rotation matrix
+        trace_value = torch.trace(R_relative)
+        # Clamp the trace value to avoid numerical issues
+        trace_value = torch.clamp(trace_value, -1.0, 3.0)
+        angular_distance = torch.acos((trace_value - 1) / 2)
+        
+        # Combine the two distances
+        geodesic_dist = translation_distance*weight_translation + angular_distance
+            
+        return geodesic_dist
+    
+    def render_surfels_to_image(
+        self,
+        surfels,
+        poses,
+        focal_lengths,
+        principal_points,
+        image_width,
+        image_height,
+        disk_resolution=16
+    ):
+        """
+        Renders oriented surfels into a 2D RGB image with a simple z-buffer.
+        Each surfel is treated as a 2D disk in 3D, oriented by its normal.
+        The disk is approximated by a polygon of 'disk_resolution' segments.
+
+        Args:
+            surfels (list): List of Surfel objects, each having:
+                - position: (x, y, z) in world coords
+                - normal:   (nx, ny, nz)
+                - radius:   float, radius in world units
+            poses (torch.Tensor): Tensor of poses, shape [4, 4]
+            focal_lengths (torch.Tensor): Tensor of focal lengths, shape [2]
+            principal_points (torch.Tensor): Tensor of principal points, shape [2]
+            image_width, image_height (int): output image size
+            disk_resolution (int): number of segments for approximating each disk
+
+        Returns:
+            Dictionary containing:
+            - depth: depth map
+            - surfel_index_map: map of surfel indices
+            - cos_value_map: map of cosine values between view and normal directions
+        """
+        if isinstance(focal_lengths, torch.Tensor):
+            focal_lengths = focal_lengths.detach().cpu().numpy()
+        if isinstance(principal_points, torch.Tensor):
+            principal_points = principal_points.detach().cpu().numpy()
+        if isinstance(poses, torch.Tensor):
+            poses = poses.detach().cpu().numpy()
+
+        # Initialize buffers
+        surfel_index_map = np.full((image_height, image_width), -1, dtype=np.int32)
+        z_buffer = np.full((image_height, image_width), np.inf, dtype=np.float32)
+        cos_buffer = np.zeros((image_height, image_width), dtype=np.float32)
+
+        # Unpack camera parameters
+        fx, fy, cx, cy = focal_lengths[0], focal_lengths[1], principal_points[0], principal_points[1]
+        R = poses[0:3, 0:3]
+        t = poses[0:3, 3]
+        
+        # Compute view frustum planes in world space
+        # We'll use 6 planes: near, far, left, right, top, bottom
+        near_z = 0.1  # Near plane distance
+        far_z = 1000.0  # Far plane distance
+        
+        # Convert all surfel positions to camera space at once for efficient culling
+        positions = np.array([s.position for s in surfels])
+        positions_h = np.concatenate([positions, np.ones((len(positions), 1))], axis=1)
+        
+        # Compute camera matrix
+        extrinsics = np.zeros((4, 4))
+        extrinsics[0:3, 0:3] = np.linalg.inv(R)
+        extrinsics[0:3, 3] = -np.linalg.inv(R) @ t
+        extrinsics[3, 3] = 1
+        
+        # Transform all points to camera space at once
+        cam_points = (extrinsics @ positions_h.T).T
+        cam_points = cam_points[:, :3] / cam_points[:, 3:]
+        
+        # Compute view frustum culling mask
+        in_front = cam_points[:, 2] > near_z
+        behind_far = cam_points[:, 2] < far_z
+        
+        # Project points to get screen coordinates
+        screen_x = fx * (cam_points[:, 0] / cam_points[:, 2]) + cx
+        screen_y = fy * (cam_points[:, 1] / cam_points[:, 2]) + cy
+        
+        # Check which points are within screen bounds (with some margin for surfel radius)
+        margin = 50  # Margin in pixels to account for surfel radius
+        in_screen_x = (screen_x >= -margin) & (screen_x < image_width + margin)
+        in_screen_y = (screen_y >= -margin) & (screen_y < image_height + margin)
+        
+        # Combine all culling masks
+        visible_mask = in_front & behind_far & in_screen_x & in_screen_y
+        visible_indices = np.where(visible_mask)[0]
+
+        def point_in_polygon_2d(px, py, polygon):
+            """Fast point-in-polygon test using ray casting"""
+            inside = False
+            n = len(polygon)
+            j = n - 1
+            for i in range(n):
+                if (((polygon[i][1] > py) != (polygon[j][1] > py)) and
+                    (px < (polygon[j][0] - polygon[i][0]) * (py - polygon[i][1]) /
+                     (polygon[j][1] - polygon[i][1] + 1e-15) + polygon[i][0])):
+                    inside = not inside
+                j = i
+            return inside
+
+        # Pre-compute angle samples for circle approximation
+        angles = np.linspace(0, 2*math.pi, disk_resolution, endpoint=False)
+        cos_angles = np.cos(angles)
+        sin_angles = np.sin(angles)
+
+        # Process only visible surfels
+        for idx in visible_indices:
+            surfel = surfels[idx]
+            px, py, pz = surfel.position
+            nx, ny, nz = surfel.normal
+            radius = surfel.radius
+
+            # Skip degenerate normals
+            normal = np.array([nx, ny, nz], dtype=float)
+            norm_len = np.linalg.norm(normal)
+            if norm_len < 1e-12:
+                continue
+            normal /= norm_len
+
+            # Compute view direction and cosine value
+            point_direction = (px, py, pz) - t
+            point_direction = point_direction / np.linalg.norm(point_direction)
+            cos_value = np.dot(point_direction, normal)
+
+            # Skip backfaces
+            if cos_value < 0:
+                continue
+
+            # Build local coordinate frame
+            up = np.array([0, 0, 1], dtype=float)
+            if abs(np.dot(normal, up)) > 0.9:
+                up = np.array([0, 1, 0], dtype=float)
+            xAxis = np.cross(normal, up)
+            xAxis /= np.linalg.norm(xAxis)
+            yAxis = np.cross(normal, xAxis)
+            yAxis /= np.linalg.norm(yAxis)
+
+            # Generate circle points efficiently
+            offsets = radius * (cos_angles[:, None] * xAxis + sin_angles[:, None] * yAxis)
+            circle_points = positions[idx] + offsets
+
+            # Project all circle points at once
+            circle_points_h = np.concatenate([circle_points, np.ones((len(circle_points), 1))], axis=1)
+            cam_circle = (extrinsics @ circle_points_h.T).T
+            depths = cam_circle[:, 2]
+            valid_mask = depths > 0
+            if not np.any(valid_mask):
+                continue
+
+            screen_points = np.zeros((len(circle_points), 2))
+            screen_points[:, 0] = fx * (cam_circle[:, 0] / depths) + cx
+            screen_points[:, 1] = fy * (cam_circle[:, 1] / depths) + cy
+            
+            # Get bounding box
+            valid_points = screen_points[valid_mask]
+            if len(valid_points) < 3:
+                continue
+
+            min_x = max(0, int(np.floor(np.min(valid_points[:, 0]))))
+            max_x = min(image_width - 1, int(np.ceil(np.max(valid_points[:, 0]))))
+            min_y = max(0, int(np.floor(np.min(valid_points[:, 1]))))
+            max_y = min(image_height - 1, int(np.ceil(np.max(valid_points[:, 1]))))
+
+            # Average depth for z-buffer
+            avg_depth = float(np.mean(depths[valid_mask]))
+
+            # Rasterize polygon
+            for py_ in range(min_y, max_y + 1):
+                for px_ in range(min_x, max_x + 1):
+                    if point_in_polygon_2d(px_, py_, valid_points):
+                        if avg_depth < z_buffer[py_, px_]:
+                            z_buffer[py_, px_] = avg_depth
+                            surfel_index_map[py_, px_] = idx
+                            cos_buffer[py_, px_] = cos_value
+
+        # Clean up depth buffer
+        depth = z_buffer
+        depth[depth == np.inf] = 0
+
+        return {
+            "depth": depth,
+            "surfel_index_map": surfel_index_map,
+            "cos_value_map": cos_buffer
+        }
+    
+    def get_frame_distribution(self,
+                               n, 
+                               ratios):
+        """
+        Given:
+        - an integer n,
+        - a list of k ratios whose sum is 1 (k <= n),
+        return a list of k integers [x1, x2, ..., xk],
+        such that each xi >= 1, sum(xi) = n, and
+        the xi are as proportional to ratios as possible.
+        """
+        k = len(ratios)
+        if k > n:
+            # set the top n ratios to 1
+            result = [0] * k
+            sort_indices = np.argsort(ratios)[::-1]
+            for sort_index in sort_indices[:n]:
+                result[sort_index] = 1
+            return result
+
+        # 1. Reserve 1 for each ratio
+        result = [1] * k
+
+        # 2. Distribute the leftover among the k ratios proportionally
+        leftover = n - k
+        if leftover == 0:
+            # If n == k, each ratio just gets 1
+            return result
+
+        # Compute products for leftover distribution
+        products = [r * leftover for r in ratios]
+        floored = [int(p // 1) for p in products]  # floor of each product
+
+        sum_floors = sum(floored)
+        leftover2 = leftover - sum_floors  # how many units still to distribute
+
+        # Add the floored part to the result
+        for i in range(k):
+            result[i] += floored[i]
+
+        # Sort by the fractional remainder, descending
+        remainders = [(p - f, i) for i, (p, f) in enumerate(zip(products, floored))]
+        remainders.sort(key=lambda x: x[0], reverse=True)
+
+        # Distribute the leftover2 among the largest fractional remainders
+        for j in range(leftover2):
+            _, idx = remainders[j]
+            result[idx] = 1
+
+        return result
+    
+    def process_retrieved_spatial_information(self, retrieved_spatial_information):
+        
+        timestep_count = {} 
+  
+        surfel_index_map = retrieved_spatial_information["surfel_index_map"]
+        cos_value_map = retrieved_spatial_information["cos_value_map"]
+        depth_map = retrieved_spatial_information["depth"]
+        filtered_cos_value = cos_value_map[surfel_index_map >= 0]
+        filtered_surfel_index = surfel_index_map[surfel_index_map >= 0]
+        filtered_depth = depth_map[surfel_index_map >= 0]
+        assert len(filtered_cos_value) == len(filtered_surfel_index), "filtered_cos_value and filtered_surfel_index should have the same length"
+        for j in range(len(filtered_surfel_index)):
+            cos_value = filtered_cos_value[j]
+            depth_value = filtered_depth[j]
+            if cos_value < 0:
+                continue
+            surfel_index = filtered_surfel_index[j]
+            timesteps = self.surfel_to_timestep[surfel_index]
+   
+            for timestep in timesteps:
+                
+                if timestep not in timestep_count:
+                    timestep_count[timestep] = cos_value/(1+depth_value)
+                timestep_count[timestep] += cos_value/(1+depth_value)
+            
+
+
+        timestep_count_values = np.array(list(timestep_count.values()))
+        timestep_count_ratios = timestep_count_values / np.sum(timestep_count_values)
+        timestep_weights = {k: timestep_count_ratios[i] for i, k in enumerate(timestep_count)}
+        num_retrieved_frames = min(self.config.model.context_num_frames+10, len(timestep_weights))
+        frame_count = self.get_frame_distribution(num_retrieved_frames, list(timestep_weights.values())) # hard code
+        frame_count = {k: int(v) for k, v in zip(timestep_count.keys(), frame_count)}
+        
+        # sort timestep_weights and frame_distribution by timestep without 
+        timestep_weights = sorted(timestep_weights.items(), key=lambda x: x[0])
+        frame_count = sorted(frame_count.items(), key=lambda x: x[0])
+        
+        
+
+        return timestep_weights, frame_count
+    
+    
+    def get_context_info(self, target_c2ws, use_non_maximum_suppression=None):
+        """Get context information for novel view synthesis.
+        
+        Args:
+            target_c2ws: Target camera-to-world matrices
+            Ks: Camera intrinsic matrices
+            current_timestep: Current timestep (used in temporal mode)
+            
+        Returns:
+            Dictionary containing context information for the target view
+        """
+        # Function to prepare context tensors from indices
+        def prepare_context_data(indices):
+            c2ws = [self.c2ws[i] for i in indices]
+            latents = [torch.from_numpy(self.latents[i]).to(self.device, self.dtype) for i in indices]
+            embeddings = [torch.from_numpy(self.encoder_embeddings[i]).to(self.device, self.dtype) for i in indices]
+            intrinsics = [self.Ks[i] for i in indices]
+            return c2ws, latents, embeddings, intrinsics, indices
+        
+        if self.temporal_only:
+            # Select frames based on timesteps (temporal mode)
+            context_time_indices = [len(self.c2ws) - 1 - i for i in range(self.config.model.context_num_frames) if len(self.c2ws) - 1 - i >= 0]
+            context_data = prepare_context_data(context_time_indices)
+        
+        elif not self.use_surfel:
+            # Select frames based on camera pose distance with NMS
+            average_c2w = average_camera_pose(target_c2ws)
+            distances = torch.stack([self.geodesic_distance(torch.from_numpy(average_c2w).to(self.device, self.dtype), torch.from_numpy(np.array(c2w)).to(self.device, self.dtype), weight_translation=self.config.model.translation_distance_weight) 
+                         for c2w in self.c2ws])
+            
+            # Sort frames by distance (closest to target first)
+            sorted_indices = torch.argsort(distances)
+            max_frames = min(self.config.model.context_num_frames, len(distances), len(self.latents))
+            
+            # Apply non-maximum suppression to select diverse frames
+            is_first_step = len(self.pil_frames) <= 1
+            is_second_step = len(self.pil_frames) == 5
+            min_required_frames = 1 if is_first_step else max_frames
+            
+            # Adaptively determine initial threshold based on camera pose distribution
+            if use_non_maximum_suppression is None:
+                use_non_maximum_suppression = self.use_non_maximum_suppression
+                
+            if use_non_maximum_suppression:
+  
+                if is_second_step:
+                    # Calculate pairwise distances between existing frames
+                    pairwise_distances = []
+                    for i in range(len(self.c2ws)):
+                        for j in range(i+1, len(self.c2ws)):
+                            sim = self.geodesic_distance(
+                                torch.from_numpy(np.array(self.c2ws[i])).to(self.device, self.dtype),
+                                torch.from_numpy(np.array(self.c2ws[j])).to(self.device, self.dtype),
+                                weight_translation=self.config.model.translation_distance_weight
+                            )
+                            pairwise_distances.append(sim.item())
+                    
+                    if pairwise_distances:
+                        # Sort distances and take percentile as threshold
+                        pairwise_distances.sort()
+                        percentile_idx = int(len(pairwise_distances) * 0.5)  # 25th percentile
+                        self.initial_threshold = pairwise_distances[percentile_idx]
+                        
+                        # Ensure threshold is within reasonable bounds
+                        # initial_threshold = max(0.00, min(0.001, initial_threshold))
+                    else:
+                        self.initial_threshold = 0.001
+                elif is_first_step:
+                    # Default threshold for first frame
+                    self.initial_threshold = 1e8
+            else:
+                self.initial_threshold = 1e8
+                
+        
+            
+            selected_indices = []
+            
+            # Try with increasingly relaxed thresholds until we get enough frames
+            current_threshold = self.initial_threshold
+            while len(selected_indices) < min_required_frames and current_threshold <= 1.0:
+                # Reset selection with new threshold
+                selected_indices = []
+                
+                # Always start with the closest pose
+                selected_indices.append(sorted_indices[0])
+                
+                # Try to add each subsequent pose in order of distance
+                for idx in sorted_indices[1:]:
+                    if len(selected_indices) >= max_frames:
+                        break
+                        
+                    # Check if this candidate is sufficiently different from all selected frames
+                    is_too_similar = False
+                    for selected_idx in selected_indices:
+                        similarity = self.geodesic_distance(
+                            torch.from_numpy(np.array(self.c2ws[idx])).to(self.device, self.dtype),
+                            torch.from_numpy(np.array(self.c2ws[selected_idx])).to(self.device, self.dtype),
+                            weight_translation=self.config.model.translation_distance_weight
+                        )
+                        if similarity < current_threshold:
+                            is_too_similar = True
+                            break
+                            
+                    # Add to selected frames if not too similar to any existing selection
+                    if not is_too_similar:
+                        selected_indices.append(idx)
+                
+                # If we still don't have enough frames, relax the threshold and try again
+                if len(selected_indices) < min_required_frames:
+                    current_threshold *= 1.2
+                else:
+                    break
+            
+            # If we still don't have enough frames, just take the top frames by distance
+            if len(selected_indices) < min_required_frames:
+                available_indices = []
+                for idx in sorted_indices:
+                    if idx not in selected_indices:
+                        available_indices.append(idx)
+                selected_indices.extend(available_indices[:min_required_frames-len(selected_indices)])
+            
+            # Convert to tensor and maintain original order (don't reverse)
+            context_time_indices = torch.tensor(selected_indices, device=distances.device)
+            context_data = prepare_context_data(context_time_indices)
+        
+        else:
+            if len(self.pil_frames) == 1:
+                context_time_indices = [0]
+            else:
+                # get the average camera pose
+                average_c2w = average_camera_pose(target_c2ws[-self.config.model.context_num_frames//4:])
+                transformed_average_c2w = self.get_transformed_c2ws(average_c2w)
+                target_K = np.mean(self.surfel_Ks, axis=0)
+                # Select frames using surfel-based relevance
+                retrieved_info = self.render_surfels_to_image(
+                    self.surfels,
+                    transformed_average_c2w,
+                    [target_K*0.65] * 2,
+                    principal_points=(int(self.config.surfel.width/2), int(self.config.surfel.height/2)),
+                    image_width=int(self.config.surfel.width),
+                    image_height=int(self.config.surfel.height)
+                )
+                _, frame_count = self.process_retrieved_spatial_information(retrieved_info)
+                if self.config.inference.visualize:
+                    visualize_depth(retrieved_info["depth"],
+                                    visualization_dir=self.visualize_dir, 
+                                    file_name=f"retrieved_depth_surfels.png",
+                                    size=(self.width, self.height))
+                
+                
+                # Build candidate frames based on relevance count
+                candidates = []
+                for frame, count in frame_count:
+                    candidates.extend([frame] * count)
+                    indices_to_frame = {
+                        i: frame for i, frame in enumerate(candidates)
+                    }
+                    
+                # Sort candidates by distance to target view
+                distances = [self.geodesic_distance(torch.from_numpy(average_c2w).to(self.device, self.dtype), 
+                                                    torch.from_numpy(self.c2ws[frame]).to(self.device, self.dtype), 
+                                                    weight_translation=self.config.model.translation_distance_weight).item() 
+                            for frame in candidates]
+                
+                sorted_indices = torch.argsort(torch.tensor(distances))
+                sorted_frames = [indices_to_frame[int(i.item())] for i in sorted_indices]
+                max_frames = min(self.config.model.context_num_frames, len(candidates), len(self.latents))
+                
+  
+                is_second_step = len(self.pil_frames) == 5
+       
+                
+                # Adaptively determine initial threshold based on camera pose distribution
+                if use_non_maximum_suppression is None:
+                    use_non_maximum_suppression = self.use_non_maximum_suppression
+                    
+                if use_non_maximum_suppression:
+                    if is_second_step:
+                        # Calculate pairwise distances between existing frames
+                        pairwise_distances = []
+                        for i in range(len(self.c2ws)):
+                            for j in range(i+1, len(self.c2ws)):
+                                sim = self.geodesic_distance(
+                                    torch.from_numpy(np.array(self.c2ws[i])).to(self.device, self.dtype),
+                                    torch.from_numpy(np.array(self.c2ws[j])).to(self.device, self.dtype),
+                                    weight_translation=self.config.model.translation_distance_weight
+                                )
+                                pairwise_distances.append(sim.item())
+                        
+                        if pairwise_distances:
+                            # Sort distances and take percentile as threshold
+                            pairwise_distances.sort()
+                            percentile_idx = int(len(pairwise_distances) * 0.5)  # 25th percentile
+                            self.initial_threshold = pairwise_distances[percentile_idx]
+                        else:
+                            self.initial_threshold = 1
+    
+                
+                 
+                else:
+                    self.initial_threshold = 1e8
+                
+                selected_indices = []
+                current_threshold = self.initial_threshold
+                
+                # Always start with the closest pose
+                selected_indices.append(sorted_frames[0])
+                if not use_non_maximum_suppression:
+                    selected_indices.append(len(self.c2ws) - 1)
+                
+                # Try with increasingly relaxed thresholds until we get enough frames
+                while len(selected_indices) < max_frames and current_threshold >= 1e-5 and use_non_maximum_suppression:
+                    # Try to add each subsequent pose in order of distance
+                    for idx in sorted_frames[1:]:
+                        if len(selected_indices) >= max_frames:
+                            break
+                            
+                        # Check if this candidate is sufficiently different from all selected frames
+                        is_too_similar = False
+                        for selected_idx in selected_indices:
+                            similarity = self.geodesic_distance(
+                                torch.from_numpy(np.array(self.c2ws[idx])).to(self.device, self.dtype),
+                                torch.from_numpy(np.array(self.c2ws[selected_idx])).to(self.device, self.dtype),
+                                weight_translation=self.config.model.translation_distance_weight
+                            )
+                            if similarity < current_threshold:
+                                is_too_similar = True
+                                break
+                                
+                        # Add to selected frames if not too similar to any existing selection
+                        if not is_too_similar:
+                            selected_indices.append(idx)
+                    
+                    # If we still don't have enough frames, relax the threshold and try again
+                    if len(selected_indices) < max_frames:
+                        current_threshold /= 1.2
+                    else:
+                        break
+                
+                # If we still don't have enough frames, just take the top frames by distance
+                if len(selected_indices) < max_frames:
+                    available_indices = []
+                    for idx in sorted_frames:
+                        if idx not in selected_indices:
+                            available_indices.append(idx)
+                    selected_indices.extend(available_indices[:max_frames-len(selected_indices)])
+                
+                # Convert to tensor and maintain original order (don't reverse)
+                context_time_indices = torch.from_numpy(np.array(selected_indices))
+            context_data = prepare_context_data(context_time_indices)
+            
+        (context_c2ws, context_latents, context_encoder_embeddings, context_Ks, context_time_indices) = context_data
+        print(f"context_time_indices: {context_time_indices}")
+            
+        return {
+            "context_c2ws": torch.from_numpy(np.array(context_c2ws)).to(self.device, self.dtype),
+            "context_latents": torch.stack(context_latents).to(self.device, self.dtype),
+            "context_encoder_embeddings": torch.stack(context_encoder_embeddings).to(self.device, self.dtype),
+            "context_Ks": torch.from_numpy(np.array(context_Ks)).to(self.device, self.dtype),
+            "context_time_indices": context_time_indices,
+        }
+
+
+        
+
+    def merge_surfels(
+        self,
+        new_surfels: list,
+        current_timestep: str,
+        existing_surfels: list,
+        existing_surfel_to_timestep: dict,
+        position_threshold: Union[float, None] = None,  # Now optional
+        normal_threshold: float = 0.7,
+        max_points_per_node: int = 10 
+    ):
+        """Drop new surfels that duplicate stored ones, tagging the stored match instead.
+
+        A new surfel is a duplicate when an existing surfel returned by the octree
+        ball query around its position has a normal whose dot product with the new
+        normal exceeds ``normal_threshold``; only the first such match is used.
+
+        Args:
+            new_surfels: Candidates; each needs ``position``, ``normal`` and ``radius``.
+            current_timestep: Tag appended (if absent) to the matched surfel's timesteps.
+            existing_surfels: Surfels already stored.
+            existing_surfel_to_timestep: Surfel index -> timestep list; mutated in place.
+            position_threshold: Ball-query radius; None means mean + 0.5 * std of all radii (0.025 if none).
+            normal_threshold: Lower bound on the normal dot product for a match.
+            max_points_per_node: Passed to ``Octree`` as ``max_points``.
+
+        Returns:
+            Tuple of (new surfels without a match, ``existing_surfel_to_timestep``).
+        """
+        assert len(existing_surfels) == len(existing_surfel_to_timestep), (
+            "existing_surfels and existing_surfel_to_timestep should have the same length"
+        )
+
+        if position_threshold is None:
+            radii = np.array([surfel.radius for surfel in existing_surfels + new_surfels])
+            if len(radii) == 0:
+                position_threshold = 0.025
+            else:
+                position_threshold = np.mean(radii) + 0.5 * np.std(radii)
+
+        stored_positions = np.array([surfel.position for surfel in existing_surfels])
+        stored_normals = np.array([surfel.normal for surfel in existing_surfels])
+        octree = Octree(stored_positions, max_points=max_points_per_node) if len(stored_positions) > 0 else None
+
+        unmatched = []
+        merge_count = 0
+        for candidate in new_surfels:
+            nearby = [] if octree is None else octree.query_ball_point(candidate.position, position_threshold)
+            match = next(
+                (idx for idx in nearby if np.dot(stored_normals[idx], candidate.normal) > normal_threshold),
+                None,
+            )
+            if match is None:
+                unmatched.append(candidate)
+                continue
+            timesteps = existing_surfel_to_timestep[match]
+            if current_timestep not in timesteps:
+                timesteps.append(current_timestep)
+            merge_count += 1
+
+        print(f"merge_count: {merge_count}")
+        return unmatched, existing_surfel_to_timestep
+    
+    def pointmap_to_surfels(self,
+                            pointmap: torch.Tensor,
+                            focal_lengths: torch.Tensor,
+                            depths: torch.Tensor,
+                            confs: torch.Tensor,
+                            poses: torch.Tensor, # shape: (4, 4)
+                            radius_scale: float = 0.5,
+                            estimate_normals: bool = True):
+        """
+        Turn a per-pixel point map into a list of ``Surfel`` objects.
+
+        Pixels are kept when their depth does not exceed the 99.9th percentile of
+        ``depths`` and their confidence is at least ``self.config.surfel.conf_thresh``.
+        Tensor math runs on ``self.device``; results are converted to NumPy at the end.
+
+        Args:
+            pointmap: Per-pixel 3D points; must already be a tensor.
+            focal_lengths: Focal length(s), tensor or NumPy array; an input of
+                length 2 is replaced by its mean.
+            depths: Per-pixel depths, tensor or NumPy array.
+            confs: Per-pixel confidences, tensor or NumPy array.
+            poses: Camera pose, tensor or NumPy array; rows 0-2 of column 3 are
+                used as the camera position.
+            radius_scale: Multiplier applied to every surfel radius.
+            estimate_normals: If False, all normals are zero vectors.
+
+        Returns:
+            List of ``Surfel`` objects built from (position, normal, radius), one per kept pixel.
+        """
+        def on_device(value):
+            # NumPy inputs are wrapped as tensors before being moved to self.device.
+            if isinstance(value, np.ndarray):
+                value = torch.from_numpy(value)
+            return value.to(self.device)
+
+        pointmap = pointmap.to(self.device)
+        focal_lengths, depths, confs, poses = map(on_device, (focal_lengths, depths, confs, poses))
+
+        # A two-element focal input is collapsed to its mean
+        if len(focal_lengths) == 2:
+            focal_lengths = torch.mean(focal_lengths, dim=0)
+
+        # 1) Per-pixel normals (zeros when estimation is switched off)
+        if estimate_normals:
+            normal_map = self.estimate_normal_from_pointmap(pointmap)
+        else:
+            normal_map = torch.zeros_like(pointmap)
+
+        # 2) Select valid pixels: depth within the 99.9th percentile and confident enough
+        depth_cutoff = torch.quantile(depths, 0.999)
+        keep = (depths <= depth_cutoff) & (confs >= self.config.surfel.conf_thresh)
+        positions = pointmap[keep]    # [N, 3]
+        normals = normal_map[keep]    # [N, 3]
+        kept_depths = depths[keep]    # [N]
+
+        # 3) Unit view rays from the camera position to every kept point
+        camera_position = poses[0:3, 3]
+        rays = F.normalize(positions - camera_position.unsqueeze(0), dim=1)
+
+        # 4) Negate every normal whose dot product with its view ray is negative
+        facing = torch.sum(rays * normals, dim=1)
+        normals = torch.where((facing < 0).unsqueeze(1), -normals, normals)
+        alignment = torch.abs(torch.sum(rays * normals, dim=1))
+
+        # 5) Radius scales with depth / focal length and is divided by
+        #    0.2 + 0.8 * |ray . normal|, so aligned surfels come out smaller
+        radii = radius_scale * kept_depths / focal_lengths / (0.2 + 0.8 * alignment)
+
+        # 6) Convert to NumPy only at the end and build the surfels
+        return [
+            Surfel(position, normal, radius)
+            for position, normal, radius in zip(
+                positions.detach().cpu().numpy(),
+                normals.detach().cpu().numpy(),
+                radii.detach().cpu().numpy(),
+            )
+        ]
+
+    def estimate_normal_from_pointmap(self, pointmap: torch.Tensor) -> torch.Tensor:
+        """Estimate a unit normal per pixel from forward differences of a point map.
+
+        For each pixel (y, x) that has both a right and a lower neighbour, the
+        normal is the normalized cross product of the unit vectors pointing to
+        ``pointmap[y, x+1]`` and ``pointmap[y+1, x]``. The last row and column
+        have no such neighbours and stay zero.
+
+        The computation is vectorized over the whole map rather than looping
+        over pixels in Python. Degenerate pixels (a neighbour coincides with
+        the centre, the two directions are parallel, or a coordinate is
+        non-finite) get a zero normal instead of NaN.
+
+        Args:
+            pointmap: Tensor of shape (H, W, 3) holding one 3D point per pixel.
+
+        Returns:
+            Tensor of shape (H, W, 3), same device and dtype as ``pointmap``.
+        """
+        h, w = pointmap.shape[:2]
+        normal_map = torch.zeros((h, w, 3), device=pointmap.device, dtype=pointmap.dtype)
+        if h < 2 or w < 2:
+            return normal_map  # no pixel has both neighbours
+
+        center = pointmap[:-1, :-1]
+        v1 = pointmap[:-1, 1:] - center  # towards the right neighbour
+        v2 = pointmap[1:, :-1] - center  # towards the lower neighbour
+        len1 = torch.linalg.norm(v1, dim=-1, keepdim=True)
+        len2 = torch.linalg.norm(v2, dim=-1, keepdim=True)
+
+        # dim must be explicit: torch.cross without it is deprecated.
+        n_c = torch.cross(v1 / len1, v2 / len2, dim=-1)
+        norm_len = torch.linalg.norm(n_c, dim=-1, keepdim=True)
+
+        # Divisions by zero above yield inf/NaN; those entries fail this mask
+        # (comparisons with NaN are False) and are replaced by zeros.
+        valid = (len1 > 0) & (len2 > 0) & (norm_len >= 1e-8)
+        normal_map[:-1, :-1] = torch.where(valid, n_c / norm_len, torch.zeros_like(n_c))
+        return normal_map
+
+    def get_transformed_c2ws(self, c2ws=None):
+        """Return an array copy of the poses (default: self.c2ws) with columns 1 and 2 negated."""
+        source = self.c2ws if c2ws is None else c2ws
+        flipped = deepcopy(np.array(source))
+        flipped[..., :, [1, 2]] *= -1
+        return flipped
+
+    def construct_and_store_scene(self, 
+            input_images: List[PIL.Image.Image],
+            time_indices,
+            niter = 1000,
+            lr = 0.01,
+            device = 'cuda',
+            ):
+        """
+        Reconstruct the scene from the given images and append new surfels to self.surfels.
+
+        Args:
+            input_images: List of PIL images passed to run_inference_from_pil
+            time_indices: Not read by this method
+            niter: Number of iterations for optimization
+            lr: Learning rate for optimization
+            device: Device to run inference on
+
+        Side effects:
+            Extends self.surfel_Ks and self.surfels, replaces self.surfel_depths,
+            updates self.surfel_to_timestep and, depending on self.config.inference,
+            writes .npz files and calls visualize_surfels.
+        """
+        # Negate columns 1 and 2 of every stored pose (get_transformed_c2ws) before inference
+        c2ws_transformed = self.get_transformed_c2ws()
+        
+        # Run inference to construct the scene; its visualization is on only at global step 10
+        if self.global_step == 10:
+            visualize = True
+        else:
+            visualize = False
+        scene = run_inference_from_pil(
+            input_images,
+            self.surfel_model,
+            poses=c2ws_transformed,
+            depths=torch.from_numpy(np.array(self.surfel_depths)) if len(self.surfel_depths) > 0 else None,
+            lr = lr,
+            niter = niter,
+            # visualize=self.config.inference.visualize_pointcloud,
+            visualize=visualize,
+            device=device,
+        )
+
+        # Extract outputs
+        pointcloud = torch.cat(scene['point_clouds'], dim=0)
+        confs = torch.cat(scene['confidences'], dim=0)
+        depths = torch.cat(scene['depths'], dim=0)
+        focal_lengths = scene['camera_info']['focal']
+        self.surfel_Ks.extend([focal_lengths[i] for i in range(len(focal_lengths))])
+        self.surfel_depths = [depths[i].detach().cpu().numpy() for i in range(len(depths))]
+        # Resize pointcloud
+        pointcloud = pointcloud.permute(0, 3, 1, 2)
+        pointcloud = F.interpolate(
+            pointcloud, 
+            scale_factor=self.config.surfel.shrink_factor, 
+            mode='bilinear'
+        )
+        pointcloud = pointcloud.permute(0, 2, 3, 1)
+
+        # Resize depths and confidences by the same factor
+        depths = depths.unsqueeze(1)
+        depths = F.interpolate(
+            depths, 
+            scale_factor=self.config.surfel.shrink_factor, 
+            mode='bilinear'
+        )
+        depths = depths.squeeze(1)
+
+        confs = confs.unsqueeze(1)
+        confs = F.interpolate(
+            confs, 
+            scale_factor=self.config.surfel.shrink_factor, 
+            mode='bilinear'
+        )
+        confs = confs.squeeze(1)
+        
+        # While self.surfels is empty every frame is processed; afterwards only the last target_num_frames
+        start_idx = 0 if len(self.surfels) == 0 else len(pointcloud) - self.config.model.target_num_frames
+        end_idx = len(pointcloud)
+        # Create surfels for the current frame
+        for frame_idx in range(start_idx, end_idx):
+            surfels = self.pointmap_to_surfels(
+                pointmap=pointcloud[frame_idx],
+                focal_lengths=focal_lengths[frame_idx] * self.config.surfel.shrink_factor,
+                depths=depths[frame_idx],
+                confs=confs[frame_idx],
+                poses=c2ws_transformed[frame_idx],
+                estimate_normals=True,
+                radius_scale=self.config.surfel.radius_scale,
+            )
+
+            if len(self.surfels) > 0:
+                surfels, self.surfel_to_timestep = self.merge_surfels(
+                    new_surfels=surfels,
+                    current_timestep=frame_idx,
+                    existing_surfels=self.surfels,
+                    existing_surfel_to_timestep=self.surfel_to_timestep,
+                    # position_threshold=self.config.surfel.merge_position_threshold,
+                    normal_threshold=self.config.surfel.merge_normal_threshold
+                )
+
+            # Update timestep mapping
+            num_surfels = len(surfels)
+            surfel_start_index = len(self.surfels)
+            for surfel_index in range(num_surfels):
+                self.surfel_to_timestep[surfel_start_index + surfel_index] = [frame_idx]
+
+            # Save surfels if configured: this frame's new surfels, then the previously stored ones
+            if self.config.inference.save_surfels and len(self.surfels) > 0:
+                positions = np.array([s.position for s in surfels], dtype=np.float32)
+                normals   = np.array([s.normal   for s in surfels], dtype=np.float32)
+                radii     = np.array([s.radius   for s in surfels], dtype=np.float32)
+                colors    = np.array([s.color    for s in surfels], dtype=np.float32)
+
+                np.savez(f"{self.config.visualization_dir}/surfels_added.npz",
+                        positions=positions,
+                        normals=normals,
+                        radii=radii,
+                        colors=colors)
+                
+                positions = np.array([s.position for s in self.surfels], dtype=np.float32)
+                normals   = np.array([s.normal   for s in self.surfels], dtype=np.float32)
+                radii     = np.array([s.radius   for s in self.surfels], dtype=np.float32)
+                colors    = np.array([s.color    for s in self.surfels], dtype=np.float32)
+
+                np.savez(f"{self.config.visualization_dir}/surfels_original.npz",
+                        positions=positions,
+                        normals=normals,
+                        radii=radii,
+                        colors=colors)
+            
+            self.surfels.extend(surfels)
+        
+        if self.config.inference.visualize_surfel:
+            visualize_surfels(self.surfels, draw_normals=True, normal_scale=0.0003)
+
+ 
+    
+    def get_translation_scaling_factor(self, c2ws):
+        """
+        Center the camera positions and derive the translation scaling factor.
+
+        Args:
+            c2ws: camera-to-world matrices, shape: (N, 4, 4); the translations are
+                re-centered in place
+
+        Returns:
+            (translation_scaling_factor, c2ws): the factor and the same, now centered, tensor.
+            The factor is self.camera_scale when the first camera ends up within 1e-5 of the
+            origin, otherwise self.camera_scale / (first camera's distance to origin) + 0.01.
+        """
+        # camera centering: subtract the mean position of all cameras whose distance to the
+        # per-axis median position is at most 10x the 97th-percentile distance (capped at 1e6)
+        positions = c2ws[:, :3, 3]
+        median_position = positions.median(0, keepdim=True).values
+        dist_to_median = torch.norm(positions - median_position, dim=-1)
+        outlier_cutoff = torch.clamp(torch.quantile(dist_to_median, 0.97) * 10, max=1e6)
+        inliers = dist_to_median <= outlier_cutoff
+        c2ws[:, :3, 3] -= c2ws[inliers, :3, 3].mean(0, keepdim=True)
+
+        # camera normalization: based on the first camera's distance to the new origin
+        first_camera_dist = torch.norm(c2ws[0, :3, 3])
+        zero = torch.zeros(1).to(self.device, self.dtype)
+        if torch.isclose(first_camera_dist, zero, atol=1e-5).any():
+            translation_scaling_factor = self.camera_scale
+        else:
+            # the small constant is added after the division, not to the distance
+            translation_scaling_factor = self.camera_scale / first_camera_dist + 0.01
+
+        return translation_scaling_factor, c2ws
+    
+    
+    def get_cond(self, context_latents, all_c2ws, all_Ks, translation_scaling_factor, encoder_embeddings, input_masks):
+        """
+        Build the conditional ("c") and unconditional ("uc") inputs for sampling.
+
+        all_c2ws is modified in place: columns 1 and 2 are negated and translations scaled.
+        Returns a dict with keys "c", "uc", "all_c2ws", "all_Ks", "input_masks", "num_cameras".
+        """
+        context_encoder_embeddings = torch.mean(encoder_embeddings, dim=0)
+        input_masks = input_masks.bool()
+        # Negate columns 1 and 2, invert to get all_w2cs, then scale both translation columns
+        all_c2ws[:, :, [1, 2]] *= -1
+        all_w2cs = torch.linalg.inv(all_c2ws)
+        all_c2ws[:, :3, 3] *= translation_scaling_factor
+        all_w2cs[:, :3, 3] *= translation_scaling_factor
+        num_cameras = all_w2cs.shape[0]
+        pluckers = get_plucker_coordinates(
+            extrinsics_src=all_w2cs[:1],
+            extrinsics=all_w2cs,
+            intrinsics=all_Ks.float().clone(),
+            target_size=(context_latents.shape[-2], context_latents.shape[-1]),
+        ) # [B, 3, 6, H, W]
+        # Both latent sets get one extra slice on the third-from-last axis: 0 for targets, 1.0 for context
+        target_latents = torch.nn.functional.pad(
+            torch.zeros(self.config.model.num_frames - context_latents.shape[0], *context_latents.shape[1:]), (0, 0, 0, 0, 0, 1), value=0
+        ).to(self.device, self.dtype)
+        context_latents = torch.nn.functional.pad(
+            context_latents, (0, 0, 0, 0, 0, 1), value=1.0
+        )
+
+        c_crossattn = repeat(context_encoder_embeddings, "d -> n 1 d", n=num_cameras)
+        uc_crossattn = torch.zeros_like(c_crossattn)
+        c_replace = torch.zeros((num_cameras, *context_latents.shape[1:])).to(self.device)
+        c_replace[input_masks] = context_latents
+        c_replace[~input_masks] = target_latents
+        uc_replace = torch.zeros_like(c_replace)
+        c_concat = torch.cat(
+            [
+                repeat(
+                    input_masks,
+                    "n ->n 1 h w",
+                    h=pluckers.shape[-2],
+                    w=pluckers.shape[-1],
+                ),
+                pluckers,
+            ],
+            1,
+        )
+        uc_concat = torch.cat(
+            [torch.zeros((num_cameras, 1, *pluckers.shape[-2:])).to(self.device), pluckers], 1
+        )
+        c_dense_vector = pluckers
+        uc_dense_vector = c_dense_vector
+        c = {
+            "crossattn": c_crossattn,
+            "replace": c_replace,
+            "concat": c_concat,
+            "dense_vector": c_dense_vector,
+        }
+        uc = {
+            "crossattn": uc_crossattn,
+            "replace": uc_replace,
+            "concat": uc_concat,
+            "dense_vector": uc_dense_vector,
+        }
+        return {"c": c, 
+                "uc": uc, 
+                "all_c2ws": all_c2ws, 
+                "all_Ks": all_Ks, 
+                "input_masks": input_masks,
+                "num_cameras": num_cameras}
+     
+    
+    def _generate_frames_for_trajectory(self, c2ws_tensor, Ks_tensor, use_non_maximum_suppression=None):
+        """
+        Internal helper method to generate frames for a trajectory.
+        
+        Args:
+            c2ws_tensor: Tensor of camera-to-world matrices for the poses to generate
+            Ks_tensor: Tensor of camera intrinsic matrices, one per pose
+            use_non_maximum_suppression: Forwarded to get_context_info
+        
+        Returns:
+            The last target_num_frames PIL frames once more than target_num_frames + 1 are stored, otherwise all stored frames
+        """
+
+        padding_size = 0
+        # Determine generation steps based on trajectory length
+        generation_steps = (len(c2ws_tensor) + 1 - self.config.model.num_frames) // self.config.model.target_num_frames + 2
+        
+        # Generate frames in steps
+        cur_start_idx = 0
+        for i in range(generation_steps):
+            # Calculate frame indices for this step
+            if i > 0:
+                cur_start_idx = cur_end_idx
+            if len(self.pil_frames) == 1: # first frame
+                cur_end_idx = min(cur_start_idx + self.config.model.num_frames - 1, len(c2ws_tensor))
+            else:
+                cur_end_idx = min(cur_start_idx + self.config.model.target_num_frames, len(c2ws_tensor))
+            
+            target_length = cur_end_idx - cur_start_idx
+            if target_length <= 0:
+                break
+                
+            # Handle padding for target frames if needed
+            if target_length < self.config.model.target_num_frames or (len(self.pil_frames) == 1 and target_length < self.config.model.num_frames - 1):
+                # Pad target_c2ws and target_Ks with the last frame
+                if len(self.pil_frames) == 1: # first frame
+                    padding_size = self.config.model.num_frames - 1 - target_length
+                else:
+                    padding_size = self.config.model.target_num_frames - target_length
+                padding = torch.tile(c2ws_tensor[cur_end_idx-1:cur_end_idx], (padding_size, 1, 1))
+                c2ws_tensor = torch.cat([c2ws_tensor, padding], dim=0)
+                
+                padding_K = torch.tile(Ks_tensor[cur_end_idx-1:cur_end_idx], (padding_size, 1, 1))
+                Ks_tensor = torch.cat([Ks_tensor, padding_K], dim=0)
+                
+                if len(self.pil_frames) == 1:
+                    cur_end_idx = cur_start_idx + self.config.model.num_frames - 1
+                else:
+                    cur_end_idx = cur_start_idx + self.config.model.target_num_frames
+            
+            target_c2ws = c2ws_tensor[cur_start_idx:cur_end_idx]
+            target_Ks = Ks_tensor[cur_start_idx:cur_end_idx]
+            
+            # Pick the context frames for this batch of target poses
+            context_info = self.get_context_info(target_c2ws, use_non_maximum_suppression)
+            
+            (context_c2ws, 
+             context_latents, 
+             context_encoder_embeddings, 
+             context_Ks,
+             context_time_indices) \
+                 = (context_info["context_c2ws"], 
+                    context_info["context_latents"], 
+                    context_info["context_encoder_embeddings"], 
+                    context_info["context_Ks"], 
+                    context_info["context_time_indices"])
+            
+            # Prepare conditioning
+            all_c2ws = torch.cat([context_c2ws, target_c2ws], dim=0)
+            all_Ks = torch.cat([context_Ks, target_Ks], dim=0)
+            translation_scaling_factor, all_c2ws = self.get_translation_scaling_factor(all_c2ws)
+            input_masks = torch.cat([torch.ones(len(context_c2ws)), torch.zeros(len(target_c2ws))], dim=0).bool().to(self.device)
+            cond = self.get_cond(context_latents, all_c2ws, all_Ks, translation_scaling_factor, context_encoder_embeddings, input_masks)
+
+            # Generate samples
+            samples, samples_z = do_sample(self.model_wrapper, 
+                                         self.vae, 
+                                         self.denoiser, 
+                                         self.sampler[0],
+                                         cond["c"],
+                                         cond["uc"],
+                                         cond["all_c2ws"],
+                                         cond["all_Ks"],
+                                         input_masks,
+                                         H=576, W=576, C=4, F=8, T=8, 
+                                         cfg=self.config.model.cfg,  
+                                         verbose=True, 
+                                         global_pbar=None, 
+                                         return_latents=True,
+                                         device=self.device)
+
+            # Process and store generated frames
+            target_num = torch.sum(~input_masks)
+            target_samples = samples[~input_masks]
+            target_pil_frames = [tensor_to_pil(target_samples[j]) for j in range(target_num)]
+            target_encoder_embeddings = encode_image(target_samples, self.image_encoder, self.device, self.dtype)
+            target_latents = samples_z[~input_masks]
+            # Frames generated for the padded (repeated last) poses are not stored
+            for j in range(target_num - padding_size if padding_size > 0 else target_num):
+                self.latents.append(target_latents[j].detach().cpu().numpy())
+                self.encoder_embeddings.append(target_encoder_embeddings[j].detach().cpu().numpy())
+                self.Ks.append(target_Ks[j].detach().cpu().numpy())
+                self.c2ws.append(target_c2ws[j].detach().cpu().numpy())
+                self.pil_frames.append(target_pil_frames[j])
+                
+                if self.config.inference.visualize:
+                    self.pil_frames[-1].save(f"{self.config.visualization_dir}/final_{len(self.pil_frames):07d}.png")
+            
+            # Update scene reconstruction if needed
+            if self.use_surfel and not self.temporal_only:
+                self.construct_and_store_scene(self.pil_frames, 
+                                            time_indices=context_time_indices,
+                                            niter=self.config.surfel.niter, 
+                                            lr=self.config.surfel.lr, 
+                                            device=self.device)
+            self.global_step += 1
+                        
+            if self.config.inference.visualize:
+                export_to_gif(self.pil_frames, f"{self.config.visualization_dir}/inference_all.gif")
+            
+        # Return all frames or just the new ones
+        return self.pil_frames[-self.config.model.target_num_frames:] if len(self.pil_frames) > self.config.model.target_num_frames + 1 else self.pil_frames
+    
+    def generate_trajectory_frames(self, c2ws: List[np.ndarray], Ks: List[np.ndarray], use_non_maximum_suppression=None):
+        """
+        Continue generation along a new trajectory segment, reusing the pipeline's stored state.
+
+        Args:
+            c2ws: Camera-to-world matrices of the new segment
+            Ks: Camera intrinsic matrices of the new segment
+            use_non_maximum_suppression: Passed through to _generate_frames_for_trajectory
+
+        Returns:
+            Whatever _generate_frames_for_trajectory returns for the segment (PIL frames)
+        """
+        def as_tensor(matrices):
+            return torch.from_numpy(np.array(matrices)).to(self.device, self.dtype)
+
+        poses, intrinsics = as_tensor(c2ws), as_tensor(Ks)
+        return self._generate_frames_for_trajectory(poses, intrinsics, use_non_maximum_suppression)
+    
+    def undo_latest_move(self):
+        """
+        Undo the latest move by dropping the most recent batch of frames and their state.
+
+        Up to target_num_frames entries (never the initial frame) are removed from the
+        latents, encoder embeddings, camera poses, intrinsics and PIL frames. When
+        surfels are in use, the matching depth maps are dropped, the removed frame
+        indices are erased from surfel_to_timestep, and surfels that were only seen in
+        removed frames are deleted; surviving surfels are re-indexed from 0 in order.
+
+        Returns:
+            bool: True if successfully removed the latest frames, False if there's nothing to remove
+                 (e.g., only one frame in the pipeline)
+        """
+        # Ensure we have more than one frame to avoid removing the initial frame
+        if len(self.pil_frames) <= 1:
+            print("Cannot undo: only one frame in the pipeline")
+            return False
+
+        # Determine how many frames to remove - up to target_num_frames
+        frames_to_remove = min(self.config.model.target_num_frames, len(self.pil_frames) - 1)
+
+        # Remove the latest entries from all state lists
+        for _ in range(frames_to_remove):
+            self.latents.pop()
+            self.encoder_embeddings.pop()
+            self.c2ws.pop()
+            self.Ks.pop()
+            self.pil_frames.pop()
+
+        # NOTE(review): _generate_frames_for_trajectory adds 1 to global_step per generated
+        # batch, while one is subtracted per removed frame here. The original arithmetic is
+        # kept unchanged - confirm which of the two is intended.
+        self.global_step -= frames_to_remove
+
+        if self.use_surfel:
+            # surfel_depths is only refreshed by construct_and_store_scene, which
+            # _generate_frames_for_trajectory skips when temporal_only is set, so it can
+            # hold fewer entries than were removed; never pop more than it holds.
+            for _ in range(min(frames_to_remove, len(self.surfel_depths))):
+                self.surfel_depths.pop()
+
+            # The removed frames occupy the index range just past the remaining frames.
+            # A set makes the per-timestep membership tests below O(1).
+            first_removed = len(self.pil_frames)
+            removed_timesteps = set(range(first_removed, first_removed + frames_to_remove))
+
+            # Single pass over the surfels: strip removed timesteps, drop surfels left with
+            # none, and give the survivors consecutive indices. This replaces a list-based
+            # "index in surfels_to_remove" test that was quadratic in the number of surfels.
+            kept_surfels = []
+            kept_surfel_to_timestep = {}
+            for old_idx, surfel in enumerate(self.surfels):
+                remaining = [
+                    ts
+                    for ts in self.surfel_to_timestep[old_idx]
+                    if ts not in removed_timesteps
+                ]
+                if not remaining:
+                    continue  # seen only in removed frames
+                kept_surfel_to_timestep[len(kept_surfels)] = remaining
+                kept_surfels.append(surfel)
+
+            # Update surfel data
+            self.surfels = kept_surfels
+            self.surfel_to_timestep = kept_surfel_to_timestep
+
+        print(f"Successfully removed the latest {frames_to_remove} frames. {len(self.pil_frames)} frames remaining.")
+        return True
+        
+
+    
+    def __call__(self, image:torch.Tensor, c2ws: List[np.ndarray], Ks: List[np.ndarray]):
+        """
+        Start a run from one image and generate frames for the rest of the trajectory.
+
+        Args:
+            image: Initial image tensor
+            c2ws: Camera-to-world matrices for the trajectory; the first one is passed to initialize
+            Ks: Camera intrinsic matrices for the trajectory; the first one is passed to initialize
+
+        Returns:
+            The PIL frames returned by _generate_frames_for_trajectory for the remaining poses
+        """
+        poses = torch.from_numpy(np.array(c2ws)).to(self.device, self.dtype)
+        intrinsics = torch.from_numpy(np.array(Ks)).to(self.device, self.dtype)
+
+        # The first pose/intrinsics pair is handed to initialize as NumPy arrays.
+        first_pose = poses[0].detach().cpu().numpy()
+        first_K = intrinsics[0].detach().cpu().numpy()
+        self.initialize(image, first_pose, first_K)
+
+        # Everything after the first entry is generated.
+        return self._generate_frames_for_trajectory(poses[1:], intrinsics[1:])
+    
+
+ 
\ No newline at end of file
diff --git a/modeling/sampling.py b/modeling/sampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..153731378e6921484918ef9a7565b0d59add0124
--- /dev/null
+++ b/modeling/sampling.py
@@ -0,0 +1,484 @@
+import numpy as np
+import torch
+import torch.nn as nn
+from einops import rearrange
+from tqdm import tqdm
+
+# from seva.geometry import get_camera_dist
+from typing import Union
+
+
+def get_camera_dist(
+    source_c2ws: torch.Tensor,  # N x 3 x 4
+    target_c2ws: torch.Tensor,  # M x 3 x 4
+    mode: str = "translation",
+):
+    if mode == "rotation":
+        dists = torch.acos(
+            (
+                (
+                    torch.matmul(
+                        source_c2ws[:, None, :3, :3],
+                        target_c2ws[None, :, :3, :3].transpose(-1, -2),
+                    )
+                    .diagonal(offset=0, dim1=-2, dim2=-1)
+                    .sum(-1)
+                    - 1
+                )
+                / 2
+            ).clamp(-1, 1)
+        ) * (180 / torch.pi)
+    elif mode == "translation":
+        dists = torch.norm(
+            source_c2ws[:, None, :3, 3] - target_c2ws[None, :, :3, 3], dim=-1
+        )
+    else:
+        raise NotImplementedError(
+            f"Mode {mode} is not implemented for finding nearest source indices."
+        )
+    return dists
+
+
+
+def append_dims(x: torch.Tensor, target_dims: int) -> torch.Tensor:
+    """Return x with trailing size-1 axes added so that x.ndim == target_dims."""
+    if x.ndim > target_dims:
+        message = f"input has {x.ndim} dims but target_dims is {target_dims}, which is less"
+        raise ValueError(message)
+    # One None per missing axis after the Ellipsis appends the singleton dims
+    trailing_axes = [None] * (target_dims - x.ndim)
+    return x[(..., *trailing_axes)]
+
+
+def append_zero(x: torch.Tensor) -> torch.Tensor:
+    return torch.cat([x, x.new_zeros([1])])
+
+
+def to_d(x: torch.Tensor, sigma: torch.Tensor, denoised: torch.Tensor) -> torch.Tensor:
+    return (x - denoised) / append_dims(sigma, x.ndim)
+
+
+def make_betas(
+    num_timesteps: int, linear_start: float = 1e-4, linear_end: float = 2e-2
+) -> np.ndarray:
+    """Return num_timesteps float64 betas whose square roots are evenly spaced."""
+    # Interpolate between the square roots of the endpoints, then square the ramp
+    sqrt_first = linear_start**0.5
+    sqrt_last = linear_end**0.5
+    sqrt_betas = torch.linspace(sqrt_first, sqrt_last, num_timesteps, dtype=torch.float64)
+    betas = sqrt_betas**2
+    return betas.numpy()
+
+
+def generate_roughly_equally_spaced_steps(num_substeps: int, max_step: int) -> np.ndarray:
+    # Descending ramp from max_step - 1 towards 0 (0 excluded), truncated to ints
+    descending = np.linspace(max_step - 1, 0, num_substeps, endpoint=False).astype(int)
+    return descending[::-1]
+
+
+class EpsScaling(object):
+    def __call__(
+        self, sigma: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        c_skip = torch.ones_like(sigma, device=sigma.device)
+        c_out = -sigma
+        c_in = 1 / (sigma**2 + 1.0) ** 0.5
+        c_noise = sigma.clone()
+        return c_skip, c_out, c_in, c_noise
+
+
+class DDPMDiscretization(object):
+    def __init__(
+        self,
+        linear_start: float = 5e-06,
+        linear_end: float = 0.012,
+        num_timesteps: int = 1000,
+        log_snr_shift: Union[float, None] = 2.4,
+    ):
+        self.num_timesteps = num_timesteps
+
+        betas = make_betas(
+            num_timesteps,
+            linear_start=linear_start,
+            linear_end=linear_end,
+        )
+        self.log_snr_shift = log_snr_shift
+
+        alphas = 1.0 - betas  # first alpha here is on data side
+        self.alphas_cumprod = np.cumprod(alphas, axis=0)
+
+    def get_sigmas(self, n: int, device: Union[str, torch.device] = "cpu") -> torch.Tensor:
+        if n < self.num_timesteps:
+            timesteps = generate_roughly_equally_spaced_steps(n, self.num_timesteps)
+            alphas_cumprod = self.alphas_cumprod[timesteps]
+        elif n == self.num_timesteps:
+            alphas_cumprod = self.alphas_cumprod
+        else:
+            raise ValueError(f"Expected n <= {self.num_timesteps}, but got n = {n}.")
+
+        sigmas = ((1 - alphas_cumprod) / alphas_cumprod) ** 0.5
+        if self.log_snr_shift is not None:
+            sigmas = sigmas * np.exp(self.log_snr_shift)
+        return torch.flip(
+            torch.tensor(sigmas, dtype=torch.float32, device=device), (0,)
+        )
+
+    def __call__(
+        self,
+        n: int,
+        do_append_zero: bool = True,
+        flip: bool = False,
+        device: Union[str, torch.device] = "cpu",
+    ) -> torch.Tensor:
+        sigmas = self.get_sigmas(n, device=device)
+        sigmas = append_zero(sigmas) if do_append_zero else sigmas
+        return sigmas if not flip else torch.flip(sigmas, (0,))
+
+
+class DiscreteDenoiser(object):
+    sigmas: torch.Tensor
+
+    def __init__(
+        self,
+        discretization: DDPMDiscretization,
+        num_idx: int = 1000,
+        device: Union[str, torch.device] = "cpu",
+    ):
+        self.scaling = EpsScaling()
+        self.discretization = discretization
+        self.num_idx = num_idx
+        self.device = device
+
+        self.register_sigmas()
+
+    def register_sigmas(self):
+        self.sigmas = self.discretization(
+            self.num_idx, do_append_zero=False, flip=True, device=self.device
+        )
+
+    def sigma_to_idx(self, sigma: torch.Tensor) -> torch.Tensor:
+        dists = sigma - self.sigmas[:, None]
+        return dists.abs().argmin(dim=0).view(sigma.shape)
+
+    def idx_to_sigma(self, idx: Union[torch.Tensor, int]) -> torch.Tensor:
+        return self.sigmas[idx]
+
+    def __call__(
+        self,
+        network: nn.Module,
+        input: torch.Tensor,
+        sigma: torch.Tensor,
+        cond: dict,
+        **additional_model_inputs,
+    ) -> torch.Tensor:
+        sigma = self.idx_to_sigma(self.sigma_to_idx(sigma))
+        sigma_shape = sigma.shape
+        sigma = append_dims(sigma, input.ndim)
+        c_skip, c_out, c_in, c_noise = self.scaling(sigma)
+        c_noise = self.sigma_to_idx(c_noise.reshape(sigma_shape))
+        if "replace" in cond:
+            x, mask = cond.get("replace").split((input.shape[1], 1), dim=1)
+        
+            input = input * (1 - mask) + x * mask
+        return (
+            network(input * c_in, c_noise, cond, **additional_model_inputs) * c_out
+            + input * c_skip
+        )
+
+
+class ConstantScaleRule(object):
+    def __call__(self, scale: Union[float, torch.Tensor]) -> Union[float, torch.Tensor]:
+        return scale
+
+
+class MultiviewScaleRule(object):
+    def __init__(self, min_scale: float = 1.0):
+        self.min_scale = min_scale
+
+    def __call__(
+        self,
+        scale: Union[float, torch.Tensor],
+        c2w: torch.Tensor,
+        K: torch.Tensor,
+        input_frame_mask: torch.Tensor,
+    ) -> torch.Tensor:
+        c2w_input = c2w[input_frame_mask]
+        rotation_diff = get_camera_dist(c2w, c2w_input, mode="rotation").min(-1).values
+        translation_diff = (
+            get_camera_dist(c2w, c2w_input, mode="translation").min(-1).values
+        )
+        K_diff = (
+            ((K[:, None] - K[input_frame_mask][None]).flatten(-2) == 0).all(-1).any(-1)
+        )
+        close_frame = (rotation_diff < 10.0) & (translation_diff < 1e-5) & K_diff
+        if isinstance(scale, torch.Tensor):
+            scale = scale.clone()
+            scale[close_frame] = self.min_scale
+        elif isinstance(scale, float):
+            scale = torch.where(close_frame, self.min_scale, scale)
+        else:
+            raise ValueError(f"Invalid scale type {type(scale)}.")
+        return scale
+
+
+class ConstantScaleSchedule(object):
+    def __call__(
+        self, sigma: Union[float, torch.Tensor], scale: Union[float, torch.Tensor]
+    ) -> Union[float, torch.Tensor]:
+        if isinstance(sigma, float):
+            return scale
+        elif isinstance(sigma, torch.Tensor):
+            if len(sigma.shape) == 1 and isinstance(scale, torch.Tensor):
+                sigma = append_dims(sigma, scale.ndim)
+            return scale * torch.ones_like(sigma)
+        else:
+            raise ValueError(f"Invalid sigma type {type(sigma)}.")
+
+
+class ConstantGuidance(object):
+    def __call__(
+        self,
+        uncond: torch.Tensor,
+        cond: torch.Tensor,
+        scale: Union[float, torch.Tensor],
+    ) -> torch.Tensor:
+        if isinstance(scale, torch.Tensor) and len(scale.shape) == 1:
+            scale = append_dims(scale, cond.ndim)
+        return uncond + scale * (cond - uncond)
+
+
+class VanillaCFG(object):
+    def __init__(self):
+        self.scale_rule = ConstantScaleRule()
+        self.scale_schedule = ConstantScaleSchedule()
+        self.guidance = ConstantGuidance()
+
+    def __call__(
+        self, x: torch.Tensor, sigma: Union[float, torch.Tensor], scale: Union[float, torch.Tensor]
+    ) -> torch.Tensor:
+        x_u, x_c = x.chunk(2)
+        scale = self.scale_rule(scale)
+        scale_value = self.scale_schedule(sigma, scale)
+        x_pred = self.guidance(x_u, x_c, scale_value)
+        return x_pred
+
+    def prepare_inputs(
+        self, x: torch.Tensor, s: torch.Tensor, c: dict, uc: dict
+    ) -> tuple[torch.Tensor, torch.Tensor, dict]:
+        c_out = dict()
+
+        for k in c:
+            if k in ["vector", "crossattn", "concat", "replace", "dense_vector"]:
+                c_out[k] = torch.cat((uc[k], c[k]), 0)
+            else:
+                assert c[k] == uc[k]
+                c_out[k] = c[k]
+        return torch.cat([x] * 2), torch.cat([s] * 2), c_out
+
+
+class MultiviewCFG(VanillaCFG):
+    def __init__(self, cfg_min: float = 1.0):
+        self.scale_min = cfg_min
+        self.scale_rule = MultiviewScaleRule(min_scale=cfg_min)
+        self.scale_schedule = ConstantScaleSchedule()
+        self.guidance = ConstantGuidance()
+
+    def __call__(  # type: ignore
+        self,
+        x: torch.Tensor,
+        sigma: Union[float, torch.Tensor],
+        scale: Union[float, torch.Tensor],
+        c2w: torch.Tensor,
+        K: torch.Tensor,
+        input_frame_mask: torch.Tensor,
+    ) -> torch.Tensor:
+        x_u, x_c = x.chunk(2)
+        scale = self.scale_rule(scale, c2w, K, input_frame_mask)
+        scale_value = self.scale_schedule(sigma, scale)
+        x_pred = self.guidance(x_u, x_c, scale_value)
+        return x_pred
+
+
+class MultiviewTemporalCFG(MultiviewCFG):
+    def __init__(self, num_frames: int, cfg_min: float = 1.0):
+        super().__init__(cfg_min=cfg_min)
+
+        self.num_frames = num_frames
+        distance_matrix = (
+            torch.arange(num_frames)[None] - torch.arange(num_frames)[:, None]
+        ).abs()
+        self.distance_matrix = distance_matrix
+
+    def __call__(
+        self,
+        x: torch.Tensor,
+        sigma: Union[float, torch.Tensor],
+        scale: Union[float, torch.Tensor],
+        c2w: torch.Tensor,
+        K: torch.Tensor,
+        input_frame_mask: torch.Tensor,
+    ) -> torch.Tensor:
+        input_frame_mask = rearrange(
+            input_frame_mask, "(b t) ... -> b t ...", t=self.num_frames
+        )
+        min_distance = (
+            self.distance_matrix[None].to(x.device)
+            + (~input_frame_mask[:, None]) * self.num_frames
+        ).min(-1)[0]
+        min_distance = min_distance / min_distance.max(-1, keepdim=True)[0].clamp(min=1)
+        scale = min_distance * (scale - self.scale_min) + self.scale_min
+        scale = rearrange(scale, "b t ... -> (b t) ...")
+        scale = append_dims(scale, x.ndim)
+        return super().__call__(x, sigma, scale, c2w, K, input_frame_mask.flatten(0, 1))
+
+
+class EulerEDMSampler(object):
+    def __init__(
+        self,
+        discretization: DDPMDiscretization,
+        guider: Union[VanillaCFG, MultiviewCFG, MultiviewTemporalCFG],
+        num_steps: Union[int, None] = None,
+        verbose: bool = False,
+        device: Union[str, torch.device] = "cuda",
+        s_churn=0.0,
+        s_tmin=0.0,
+        s_tmax=float("inf"),
+        s_noise=1.0,
+    ):
+        self.num_steps = num_steps
+        self.discretization = discretization
+        self.guider = guider
+        self.verbose = verbose
+        self.device = device
+
+        self.s_churn = s_churn
+        self.s_tmin = s_tmin
+        self.s_tmax = s_tmax
+        self.s_noise = s_noise
+
+    def prepare_sampling_loop(
+        self, x: torch.Tensor, cond: dict, uc: dict, num_steps: Union[int, None] = None
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, dict, dict]:
+        num_steps = num_steps or self.num_steps
+        assert num_steps is not None, "num_steps must be specified"
+        sigmas = self.discretization(num_steps, device=self.device)
+        x *= torch.sqrt(1.0 + sigmas[0] ** 2.0)
+        num_sigmas = len(sigmas)
+        s_in = x.new_ones([x.shape[0]])
+        return x, s_in, sigmas, num_sigmas, cond, uc
+
+    def get_sigma_gen(self, num_sigmas: int, verbose: bool = True) -> Union[range, tqdm]:
+        sigma_generator = range(num_sigmas - 1)
+        if self.verbose and verbose:
+            sigma_generator = tqdm(
+                sigma_generator,
+                total=num_sigmas - 1,
+                desc="Sampling",
+                leave=False,
+            )
+        return sigma_generator
+
+    def sampler_step(
+        self,
+        sigma: torch.Tensor,
+        next_sigma: torch.Tensor,
+        denoiser,
+        x: torch.Tensor,
+        scale: Union[float, torch.Tensor],
+        cond: dict,
+        uc: dict,
+        gamma: float = 0.0,
+        **guider_kwargs,
+    ) -> torch.Tensor:
+        sigma_hat = sigma * (gamma + 1.0) + 1e-6
+
+        eps = torch.randn_like(x) * self.s_noise
+        x = x + eps * append_dims(sigma_hat**2 - sigma**2, x.ndim) ** 0.5
+
+        denoised = denoiser(*self.guider.prepare_inputs(x, sigma_hat, cond, uc))
+        denoised = self.guider(denoised, sigma_hat, scale, **guider_kwargs)
+        d = to_d(x, sigma_hat, denoised)
+        dt = append_dims(next_sigma - sigma_hat, x.ndim)
+        return x + dt * d
+
+    def __call__(
+        self,
+        denoiser,
+        x: torch.Tensor,
+        scale: Union[float, torch.Tensor],
+        cond: dict,
+        uc: Union[dict, None] = None,
+        num_steps: Union[int, None] = None,
+        verbose: bool = True,
+        **guider_kwargs,
+    ) -> torch.Tensor:
+        uc = cond if uc is None else uc
+        x, s_in, sigmas, num_sigmas, cond, uc = self.prepare_sampling_loop(
+            x,
+            cond,
+            uc,
+            num_steps,
+        )
+        for i in self.get_sigma_gen(num_sigmas, verbose=verbose):
+            gamma = (
+                min(self.s_churn / (num_sigmas - 1), 2**0.5 - 1)
+                if self.s_tmin <= sigmas[i] <= self.s_tmax
+                else 0.0
+            )
+            x = self.sampler_step(
+                s_in * sigmas[i],
+                s_in * sigmas[i + 1],
+                denoiser,
+                x,
+                scale,
+                cond,
+                uc,
+                gamma,
+                **guider_kwargs,
+            )
+        return x
+
+
+def create_samplers(
+    guider_types: Union[int, list[int]],
+    discretization,
+    num_frames: Union[list[int], None],
+    num_steps: int=50,
+    cfg_min: float = 1.2,
+    device: Union[str, torch.device] = "cuda"
+):
+    """Build one EulerEDMSampler per requested guider type.
+
+    Guider types: 0 = VanillaCFG (no arguments), 1 = MultiviewCFG(cfg_min),
+    2 = MultiviewTemporalCFG(num_frames[i], cfg_min), where i is the position
+    of the entry in guider_types. A single int is treated as a one-item list.
+
+    Returns:
+        The samplers as a list, in the order of guider_types.
+    Raises:
+        ValueError: on an unknown guider type, or if type 2 is requested
+            while num_frames is None.
+    """
+    guider_mapping = {0: VanillaCFG, 1: MultiviewCFG, 2: MultiviewTemporalCFG}
+    if not isinstance(guider_types, (list, tuple)):
+        guider_types = [guider_types]
+    samplers = []
+    for i, guider_type in enumerate(guider_types):
+        if guider_type not in guider_mapping:
+            raise ValueError(
+                f"Invalid guider type {guider_type}. Must be one of {list(guider_mapping.keys())}"
+            )
+        if guider_type == 2:
+            # Explicit check instead of `assert`, which is stripped under -O
+            if num_frames is None:
+                raise ValueError("num_frames is required for guider type 2")
+            guider_args = (num_frames[i], cfg_min)
+        else:
+            guider_args = (cfg_min,) if guider_type > 0 else ()
+        guider = guider_mapping[guider_type](*guider_args)
+        samplers.append(EulerEDMSampler(
+            discretization=discretization, guider=guider, num_steps=num_steps,
+            s_churn=0.0, s_tmin=0.0, s_tmax=999.0, s_noise=1.0,
+            verbose=True, device=device,
+        ))
+    return samplers
diff --git a/navigation.py b/navigation.py
new file mode 100644
index 0000000000000000000000000000000000000000..22a8ba029ee6f151f2dc7c1dcd4505c02c99d575
--- /dev/null
+++ b/navigation.py
@@ -0,0 +1,478 @@
+import numpy as np
+import torch
+from PIL import Image
+import argparse
+import os
+import json
+from typing import List, Optional, Tuple
+import scipy.spatial.transform as spt
+from omegaconf import OmegaConf
+
+
+from modeling.pipeline import VMemPipeline
+from utils import load_img_and_K, transform_img_and_K, get_default_intrinsics
+
+
+
+
+
+class Navigator:
+    """
+    Navigator class for moving through a 3D scene with a virtual camera.
+    Provides methods to move forward and turn left/right, generating new camera poses
+    and rendering frames using VMemPipeline.
+    """
+    def __init__(self, pipeline: VMemPipeline, step_size: float = 0.1, num_interpolation_frames: int = 4):
+        """
+        Initialize the Navigator.
+        
+        Args:
+            pipeline: The VMemPipeline used for rendering frames
+            step_size: The distance to move forward with each step
+            num_interpolation_frames: Number of frames to generate for each movement
+        """
+        self.pipeline = pipeline
+        self.step_size = step_size
+        self.current_pose = None
+        self.current_K = None
+        self.frames = []
+        self.num_interpolation_frames = num_interpolation_frames
+        self.pose_history = []  # Store history of camera poses
+        
+    def initialize(self, image, initial_pose, initial_K):
+        """
+        Initialize the navigator with an image and camera parameters.
+        Calls the pipeline's initialize method and restarts frames and pose history.
+        
+        Args:
+            image: Initial image tensor
+            initial_pose: Initial camera pose (4x4 camera-to-world matrix)
+            initial_K: Initial camera intrinsics matrix
+            
+        Returns:
+            The initial frame returned by the pipeline's initialize method
+        """
+        self.current_pose = initial_pose
+        self.current_K = initial_K
+        
+        # Use the pipeline's initialize method
+        initial_frame = self.pipeline.initialize(image, initial_pose, initial_K)
+        self.frames = [initial_frame]
+        
+        # Restart the history, like self.frames, so a repeated initialize() drops stale poses
+        self.pose_history = [{
+            "file_path": "images/frame_001.png",
+            "transform_matrix": initial_pose.tolist() if isinstance(initial_pose, np.ndarray) else initial_pose
+        }]
+        
+        return initial_frame
+        
+    def initialize_pipeline(self, image_tensor, initial_pose, initial_K):
+        """
+        Initialize the pipeline with the first image and camera pose.
+        Deprecated: Use initialize() instead.
+        
+        Args:
+            image_tensor: Initial image tensor [1, C, H, W]
+            initial_pose: Initial camera pose (4x4 camera-to-world matrix)
+            initial_K: Initial camera intrinsics matrix
+            
+        Returns:
+            The generated frame as PIL Image
+        """
+        return self.initialize(image_tensor, initial_pose, initial_K)
+        
+    def initialize_with_image(self, image, initial_pose, initial_K):
+        """
+        Initialize the navigator with an image and camera parameters.
+        Deprecated: Use initialize() instead.
+        
+        Args:
+            image: Initial image tensor
+            initial_pose: Initial camera pose (4x4 camera-to-world matrix)
+            initial_K: Initial camera intrinsics matrix
+            
+        Returns:
+            The generated frame as PIL Image
+        """
+        return self.initialize(image, initial_pose, initial_K)
+    
+    def _interpolate_poses(self, start_pose, end_pose, num_frames):
+        """
+        Interpolate between two camera poses.
+        
+        Translation is blended linearly and rotation with SLERP. Frame i uses
+        t = (i + 1) / num_frames, so the start pose itself is not returned and
+        t reaches 1 on the last frame.
+        
+        Args:
+            start_pose: Starting camera pose (4x4 matrix)
+            end_pose: Ending camera pose (4x4 matrix)
+            num_frames: Number of interpolation frames to generate (including end pose)
+            
+        Returns:
+            List of interpolated camera poses
+        """
+        # Extract translation vectors
+        start_t = start_pose[:3, 3]
+        end_t = end_pose[:3, 3]
+        
+        # Convert rotation matrices to quaternions for smooth interpolation
+        start_quat = spt.Rotation.from_matrix(start_pose[:3, :3]).as_quat()
+        end_quat = spt.Rotation.from_matrix(end_pose[:3, :3]).as_quat()
+        
+        # Build the SLERP interpolator once: it does not depend on the loop
+        # index, so re-creating it for every frame was redundant work.
+        slerp = spt.Slerp(
+            np.array([0, 1]),
+            spt.Rotation.from_quat([start_quat, end_quat])
+        )
+        
+        # Generate interpolated poses
+        interpolated_poses = []
+        for i in range(num_frames):
+            # Interpolation factor, in (0, 1]
+            t = (i + 1) / num_frames
+            
+            # Create interpolated pose matrix
+            interp_pose = np.eye(4)
+            # Rotation block: SLERP result converted back to a 3x3 matrix
+            interp_pose[:3, :3] = slerp(t).as_matrix()
+            # Translation column: linear blend of the two positions
+            interp_pose[:3, 3] = (1 - t) * start_t + t * end_t
+            
+            interpolated_poses.append(interp_pose)
+            
+        return interpolated_poses
+        
+    def move_backward(self, num_steps: int = 1) -> List[Image.Image]:
+        """
+        Move the camera backward along its viewing direction with smooth interpolation.
+        
+        Args:
+            num_steps: Number of steps (each step_size long) to move backward
+            
+        Returns:
+            List of generated frames as PIL Images, or None if not initialized
+        """
+        if self.current_pose is None:
+            print("Navigator not initialized. Call initialize first.")
+            return None
+        
+        # Third column of the pose's rotation block, used as the motion axis
+        forward_dir = self.current_pose[:3, 2]
+        
+        # Create the target pose: moving backward adds the axis to the position
+        target_pose = self.current_pose.copy()
+        target_pose[:3, 3] += forward_dir * self.step_size * num_steps
+        
+        # Interpolate between current pose and target pose
+        interpolated_poses = self._interpolate_poses(
+            self.current_pose, 
+            target_pose, 
+            self.num_interpolation_frames
+        )
+        
+        # Create list of intrinsics (same for all frames)
+        interpolated_Ks = [self.current_K] * len(interpolated_poses)
+        
+        # Generate frames for interpolated poses
+        new_frames = self.pipeline.generate_trajectory_frames(interpolated_poses, 
+                                                              interpolated_Ks,
+                                                              use_non_maximum_suppression=False)
+        
+        # Update the current pose to the final pose
+        self.current_pose = interpolated_poses[-1]
+        self.frames.extend(new_frames)
+        
+        # Record only the final pose of this move (one history entry per move)
+        self.pose_history.append({
+            "file_path": f"images/frame_{len(self.pose_history) + 1:03d}.png",
+            "transform_matrix": self.current_pose.tolist() if isinstance(self.current_pose, np.ndarray) else self.current_pose
+        })
+        
+        return new_frames
+    
+        
+    def move_forward(self, num_steps: int = 1) -> List[Image.Image]:
+        """
+        Move the camera forward along its viewing direction with smooth interpolation.
+        
+        Args:
+            num_steps: Number of steps (each step_size long) to move forward
+            
+        Returns:
+            List of generated frames as PIL Images, or None if not initialized
+        """
+        if self.current_pose is None:
+            print("Navigator not initialized. Call initialize first.")
+            return None
+        
+        # Third column of the pose's rotation block, used as the motion axis
+        forward_dir = self.current_pose[:3, 2]
+        
+        # Create the target pose: moving forward subtracts the axis from the position
+        target_pose = self.current_pose.copy()
+        target_pose[:3, 3] -= forward_dir * self.step_size * num_steps
+        
+        # Interpolate between current pose and target pose
+        interpolated_poses = self._interpolate_poses(
+            self.current_pose, 
+            target_pose, 
+            self.num_interpolation_frames
+        )
+        
+        # Create list of intrinsics (same for all frames)
+        interpolated_Ks = [self.current_K] * len(interpolated_poses)
+        
+        # Generate frames for interpolated poses
+        new_frames = self.pipeline.generate_trajectory_frames(interpolated_poses, 
+                                                              interpolated_Ks,
+                                                              use_non_maximum_suppression=False)
+        
+        # Update the current pose to the final pose
+        self.current_pose = interpolated_poses[-1]
+        self.frames.extend(new_frames)
+        
+        # Record only the final pose of this move (one history entry per move)
+        self.pose_history.append({
+            "file_path": f"images/frame_{len(self.pose_history) + 1:03d}.png",
+            "transform_matrix": self.current_pose.tolist() if isinstance(self.current_pose, np.ndarray) else self.current_pose
+        })
+        
+        return new_frames
+    
+    def turn_left(self, degrees: float = 3) -> List[Image.Image]:
+        """
+        Rotate the camera left around the up vector with smooth interpolation.
+        
+        Args:
+            degrees: Rotation angle in degrees
+            
+        Returns:
+            List of generated frames as PIL Images
+        """
+        return self._turn(degrees)
+    
+    def turn_right(self, degrees: float = 3) -> List[Image.Image]:
+        """
+        Rotate the camera right around the up vector with smooth interpolation.
+        
+        Args:
+            degrees: Rotation angle in degrees
+            
+        Returns:
+            List of generated frames as PIL Images
+        """
+        return self._turn(-degrees)
+    
+    def _turn(self, degrees: float) -> List[Image.Image]:
+        """
+        Helper method to turn the camera by the specified angle with smooth interpolation.
+        Positive angles turn left, negative angles turn right.
+        
+        Args:
+            degrees: Rotation angle in degrees
+            
+        Returns:
+            List of generated frames as PIL Images
+        """
+        if self.current_pose is None:
+            print("Navigator not initialized. Call initialize first.")
+            return None
+        
+        # Convert degrees to radians
+        angle_rad = np.radians(degrees)
+        
+        # Create rotation matrix around the up axis (assuming Y is up)
+        rotation = np.array([
+            [np.cos(angle_rad), 0, np.sin(angle_rad), 0],
+            [0, 1, 0, 0],
+            [-np.sin(angle_rad), 0, np.cos(angle_rad), 0],
+            [0, 0, 0, 1]
+        ])
+        
+        # Apply rotation to the current pose
+        position = self.current_pose[:3, 3].copy()
+        rotation_matrix = self.current_pose[:3, :3].copy()
+        
+        # Create the target pose
+        target_pose = np.eye(4)
+        target_pose[:3, :3] = rotation[:3, :3] @ rotation_matrix
+        target_pose[:3, 3] = position
+        
+        # Interpolate between current pose and target pose
+        interpolated_poses = self._interpolate_poses(
+            self.current_pose, 
+            target_pose, 
+            self.num_interpolation_frames
+        )
+        
+        # Create list of intrinsics (same for all frames)
+    
+        interpolated_Ks = [self.current_K] * len(interpolated_poses)
+        
+        # Generate frames for interpolated poses
+        new_frames = self.pipeline.generate_trajectory_frames(interpolated_poses, interpolated_Ks)
+        
+        # Update the current pose to the final pose
+        self.current_pose = interpolated_poses[-1]
+        self.frames.extend(new_frames)
+        
+        # Save the final pose
+        self.pose_history.append({
+            "file_path": f"images/frame_{len(self.pose_history) + 1:03d}.png",
+            "transform_matrix": self.current_pose.tolist() if isinstance(self.current_pose, np.ndarray) else self.current_pose
+        })
+        
+        return new_frames
+    
+ 
+    
+    def navigate(self, commands: List[str]) -> List[Image.Image]:
+        """
+        Execute a series of navigation commands and return the generated frames.
+        
+        Supported commands (anything else is silently skipped):
+            'w': move_forward()     's': move_backward()
+            'a': turn_left(4)       'd': turn_right(4)
+        
+        Args:
+            commands: List of single-character commands, executed in order
+            
+        Returns:
+            One flat list holding the frames of all commands in execution order
+            (empty if the navigator has not been initialized)
+        """
+        if self.current_pose is None:
+            print("Navigator not initialized. Call initialize first.")
+            return []
+        
+        # Dispatch table instead of four copies of the same if-branch; the
+        # lambdas pin the fixed 4-degree turn used by the 'a' and 'd' commands.
+        actions = {
+            'w': self.move_forward,
+            's': self.move_backward,
+            'a': lambda: self.turn_left(4),
+            'd': lambda: self.turn_right(4),
+        }
+        
+        all_generated_frames = []
+        for cmd in commands:
+            action = actions.get(cmd)
+            if action is None:
+                continue
+            frames = action()
+            # Movements may return None or an empty list; keep real frames only
+            if frames:
+                all_generated_frames.extend(frames)
+        
+        return all_generated_frames
+
+    def undo(self) -> bool:
+        """
+        Roll back the most recent navigation step.
+        
+        Asks the pipeline to undo its latest move; if that succeeds, drops the
+        newest pose-history entry, restores the previous pose as current, and
+        trims up to config.model.target_num_frames frames (never the first one).
+        
+        Returns:
+            bool: True if the step was undone, False if only the initial pose
+            is recorded or the pipeline reported failure
+        """
+        # Nothing to roll back while only the initial pose is recorded
+        if len(self.pose_history) <= 1:
+            print("Cannot undo: at initial position")
+            return False
+        
+        # Let the pipeline discard its own state for the last batch of frames
+        if not self.pipeline.undo_latest_move():
+            return False
+        
+        # Forget the newest pose and fall back to the one before it
+        self.pose_history.pop()
+        self.current_pose = np.array(self.pose_history[-1]["transform_matrix"])
+        
+        # Trim the newest frames while always keeping the initial frame; the
+        # > 0 guard matters because a [-0:] slice would select the whole list.
+        removable = min(self.pipeline.config.model.target_num_frames, len(self.frames) - 1)
+        if removable > 0:
+            del self.frames[-removable:]
+        
+        print(f"Successfully undid last movement. Now at position {len(self.pose_history)}")
+        return True
+        
+
+    def save_camera_poses(self, output_path):
+        """
+        Save the camera pose history to a JSON file as {"frames": [...]}.
+        
+        Args:
+            output_path: Path of the JSON file to write. A bare filename
+                (no directory component) is written to the current directory.
+        """
+        # os.path.dirname() returns "" for a bare filename and os.makedirs("")
+        # raises FileNotFoundError, so only create a directory when one is named.
+        output_dir = os.path.dirname(output_path)
+        if output_dir:
+            os.makedirs(output_dir, exist_ok=True)
+        
+        # Wrap the recorded poses under the top-level "frames" key
+        transforms_data = {"frames": self.pose_history}
+        
+        with open(output_path, 'w') as f:
+            json.dump(transforms_data, f, indent=4)
+        
+        print(f"Camera poses saved to {output_path}")
+    
+   
+
+
+def main():
+    """Run a scripted navigation session on the test_samples/oxford.jpeg image."""
+    parser = argparse.ArgumentParser(description="Interactive navigation in VMem")
+    parser.add_argument("--config", type=str, default="configs/inference/inference.yaml", help="Path to config file")
+    parser.add_argument("--step_size", type=float, default=0.1, help="Forward step size")
+    parser.add_argument("--interpolation_frames", type=int, default=4, help="Number of frames for each movement")
+    parser.add_argument("--commands", type=str, default="a,a,a,a,a,d,d,d,d,d,d,w,w,w,w,a,a,a,a,d,d,d,d,s,s,s,s", help="Comma-separated commands to execute: w (forward), s (backward), a (turn left), d (turn right)")
+    parser.add_argument("--output_dir", type=str, default="./visualization/navigation_frames", help="Directory to save output frames")
+    parser.add_argument("--save_poses", type=str, default="./visualization/transforms.json", help="Path to save camera poses in NeRF format")
+    args = parser.parse_args()
+    
+    # Load configuration
+    config = OmegaConf.load(args.config)
+    
+    # Initialize the pipeline
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    pipeline = VMemPipeline(config, device=device)
+    
+    # Create the navigator
+    navigator = Navigator(pipeline, step_size=args.step_size, num_interpolation_frames=args.interpolation_frames)
+    
+    # Load the sample image and crop it to the model resolution from the config
+    frame_path = "test_samples/oxford.jpeg"
+    image, _ = load_img_and_K(frame_path, None, K=None, device=device)
+    image, _ = transform_img_and_K(image, (config.model.height, config.model.width), mode="crop", K=None)
+    ori_K = np.array(get_default_intrinsics()[0])
+    initial_pose = np.eye(4)
+    
+    # Initialize the navigator with the first frame using pipeline's initialize method
+    initial_frame = navigator.initialize(image, initial_pose, ori_K)
+    
+    # Create output directory if needed
+    if args.output_dir:
+        os.makedirs(args.output_dir, exist_ok=True)
+        initial_frame.save(os.path.join(args.output_dir, "initial.png"))
+    
+    # Execute the comma-separated commands in sequence (blanks are tolerated)
+    commands = [cmd.strip() for cmd in args.commands.split(',')]
+    # The returned frames are not needed here; navigator.frames keeps them too
+    navigator.navigate(commands)
+    
+    # Save camera poses
+    if args.save_poses:
+        navigator.save_camera_poses(args.save_poses)
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..30c0816297b11dc0b8c07c3790e85491b30b2c26
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,43 @@
+--extra-index-url https://download.pytorch.org/whl/nightly/cu124
+torch==2.7.0
+torchvision==0.22.0
+
+pydantic
+gradio
+matplotlib
+tqdm
+opencv-python
+scipy
+einops
+trimesh
+tensorboard
+transformers
+pyglet<2
+huggingface-hub[torch]
+pillow-heif
+pyrender
+kapture
+kapture-localization
+numpy==1.24.4
+numpy-quaternion
+pycolmap  # for pnp
+poselib  # for pnp
+viser
+tyro
+ninja
+colorama
+pytorch-lightning
+splines
+diffusers
+kornia
+open-clip-torch
+accelerate
+peft  # imported by train.py (LoraConfig / get_peft_model)
+imageio[ffmpeg]
+roma
+spaces
+omegaconf
+wandb
+evo
+open3d
+
+-e ./extern/CUT3R/src/croco/models/curope
\ No newline at end of file
diff --git a/test_samples/changi.jpg b/test_samples/changi.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..e0c005af9eedc6a46a6459ca31185477a669ddb0
--- /dev/null
+++ b/test_samples/changi.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:12fc2c4ddfccee952d5390b147b813a8f062209832ec675013f82283e796e54a
+size 4524349
diff --git a/test_samples/friends.jpg b/test_samples/friends.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..71f356e02f68f5532f611f8a38062b640dec373d
--- /dev/null
+++ b/test_samples/friends.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95d5e75d7d00411d002a342a158621f82fd188d0a5dd961622f633042abfc193
+size 1174763
diff --git a/test_samples/jesus.jpg b/test_samples/jesus.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6214d7a529248da54041e8603b848849b4a94981
--- /dev/null
+++ b/test_samples/jesus.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9837f15bc2bd94f1470b88bd93bb228a9b03103b314f7c00a6de74e12a4a78b4
+size 577227
diff --git a/test_samples/open_door.jpg b/test_samples/open_door.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3324850abd057b6114ed53a85319ead2420ac7db
--- /dev/null
+++ b/test_samples/open_door.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:86586104166d9dca895e5539eb5b0eb1f6d3b605a3f5294cb94b0714c8366734
+size 75052
diff --git a/test_samples/oxford.jpeg b/test_samples/oxford.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..27f44dae7e328998d2420011bb048d27bcd8aea0
--- /dev/null
+++ b/test_samples/oxford.jpeg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:21ca09596ca6c736f9111296646d8947289165bd76642cf8a6c6ebe2fef61ed1
+size 138854
diff --git a/train.py b/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..0587ef41e04c86bebbaea4c231ed0334dcfaba65
--- /dev/null
+++ b/train.py
@@ -0,0 +1,263 @@
+import argparse
+from datetime import datetime
+import random
+import os
+import time
+import multiprocessing
+
+# Set multiprocessing start method to 'spawn' to avoid CUDA initialization issues in forked processes
+multiprocessing.set_start_method('spawn', force=True)
+
+
+from tqdm.auto import tqdm  # Progress bar
+import numpy as np
+from omegaconf import OmegaConf
+
+import torch
+import torch.nn as nn
+from torch.utils.data import DataLoader
+from torch.optim.lr_scheduler import SequentialLR, LambdaLR, CosineAnnealingLR, ExponentialLR # Importing CosineAnnealingLR scheduler
+import torch.nn.functional as F
+
+
+
+from accelerate import Accelerator, DistributedDataParallelKwargs
+from accelerate.utils import set_seed  # Removed get_scheduler import
+
+from peft import get_peft_model, LoraConfig
+
+from modeling import VMemModel
+from modeling.modules.autoencoder import AutoEncoder
+from modeling.sampling import DDPMDiscretization, DiscreteDenoiser, create_samplers
+from modeling.modules.conditioner import CLIPConditioner
+
+from utils.training_utils import  DiffusionTrainer, load_pretrained_model
+from data.dataset import RealEstatePoseImageSevaDataset
+
+
+
+
+# Set fixed default seeds at import time for reproducibility (torch, Python's
+# `random`, and NumPy). main() may seed again via accelerate's set_seed() when
+# config.training.random_seed is not None.
+torch.manual_seed(42)
+random.seed(42)
+np.random.seed(42)
+
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Train a model')
+    parser.add_argument('--config', type=str, default="", required=True, help='Path to the config file')
+    args = parser.parse_args()
+    return args
+
+
+def generate_current_datetime():
+    return datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+
+def prepare_model(unet, config):
+    """Make ``unet`` trainable, either through LoRA adapters or by full fine-tuning.
+
+    Args:
+        unet: The network to train; must be a ``VMemModel`` instance.
+        config: Configuration object. Reads ``config.training.lora_flag`` and,
+            when it is truthy, ``lora_r``, ``lora_alpha`` and ``lora_dropout``
+            from ``config.training``.
+
+    Returns:
+        The PEFT-wrapped model when LoRA is enabled, otherwise ``unet`` itself
+        with ``requires_grad`` set to True on every parameter.
+    """
+    assert isinstance(unet, VMemModel), "unet should be an instance of VMemModel"
+    if config.training.lora_flag:
+        # Collect the names of the modules that should receive LoRA adapters.
+        target_modules = []
+        for name, param in unet.named_parameters():
+            # # if ("temporal" in name or "transformer" in name) and "norm" not in name:
+            # NOTE(review): this prints every parameter name of the network,
+            # which looks like leftover debugging output.
+            print(name)
+            # Keep parameters whose name mentions transformer / emb / layers,
+            # excluding anything with "norm" and the first in/out layers.
+            if ("transformer" in name or "emb" in name or "layers" in name) \
+                and "norm" not in name and "in_layers.0" not in name and "out_layers.0" not in name:
+                # print(name)
+                # Turn the parameter name into its owning module's name.
+                # NOTE(review): replace() removes ".weight"/".bias" anywhere
+                # in the name, not only as a suffix.
+                name = name.replace(".weight", "")
+                name = name.replace(".bias", "")
+                # weight and bias map to the same module name; keep it once.
+                if name not in target_modules:
+                    target_modules.append(str(name))
+        
+        lora_config = LoraConfig(   
+            r=config.training.lora_r,
+            lora_alpha=config.training.lora_alpha,
+            target_modules=target_modules,
+            lora_dropout=config.training.lora_dropout,
+            # bias="none",
+        )
+        # Re-assign the list after construction.
+        # NOTE(review): presumably this undoes a conversion LoraConfig applies
+        # to target_modules on init — confirm against the peft version in use.
+        lora_config.target_modules = target_modules
+
+        unet = get_peft_model(unet, lora_config)
+        # for name, param in unet.named_parameters():
+        #     if "camera" in name or "control" in name or "context" in name or "epipolar" in name or "appearance" in name:
+        #         print(name)
+        #         param.requires_grad = True
+   
+        unet.print_trainable_parameters()
+    else:
+        # Full fine-tuning: unfreeze every parameter.
+        for name, param in unet.named_parameters():
+            param.requires_grad = True
+         
+        # Despite the label, the printed value is a fraction in [0, 1], not a percentage.
+        print("trainable parameters percentage: ", np.sum([p.numel() for p in unet.parameters() if p.requires_grad])/np.sum([p.numel() for p in unet.parameters()]))
+    return unet
+
+
+    
+ 
+def main():
+    """Training entry point.
+
+    Loads the OmegaConf config given by ``--config``, creates per-run output
+    directories, builds the Accelerator, model (optionally LoRA-wrapped),
+    RealEstate10K train/val dataloaders, AdamW optimizer and LR schedule,
+    the auto-encoder / conditioner / denoiser / sampler, and finally hands
+    everything to ``DiffusionTrainer.train``.
+    """
+    args = parse_args()
+    config_path = args.config
+    config = OmegaConf.load(config_path)
+
+    # Load the configuration
+    num_epochs = config.training.num_epochs
+    batch_size = config.training.batch_size
+    learning_rate = config.training.learning_rate
+    gradient_accumulation_steps = config.training.gradient_accumulation_steps
+    num_workers = config.training.num_workers
+    warmup_epochs = config.training.warmup_epochs
+    max_grad_norm = config.training.max_grad_norm
+    validation_interval = config.training.validation_interval
+    visualization_flag = config.training.visualization_flag
+    visualize_every = config.training.visualize_every
+    random_seed = config.training.random_seed
+    save_flag = config.training.save_flag
+    use_wandb = config.training.use_wandb
+    samples_dir = config.training.samples_dir
+
+
+    
+    weights_save_dir = config.training.weights_save_dir
+    
+
+    # Path of a state dict to resume from; a falsy value disables resuming.
+    resume = config.training.resume
+
+
+
+    # Each run writes into sub-directories named after its start time.
+    exp_id = generate_current_datetime()
+    if visualization_flag:
+        run_visualization_dir = f"{samples_dir}/{exp_id}"
+        os.makedirs(run_visualization_dir, exist_ok=True)
+    else:
+        run_visualization_dir = None
+    if save_flag:
+        run_weights_save_dir = f"{weights_save_dir}/{exp_id}"
+        os.makedirs(run_weights_save_dir, exist_ok=True)
+    else:
+        run_weights_save_dir = None
+
+
+    # NOTE(review): mixed precision is hard-coded to fp16 here rather than
+    # being read from the config.
+    accelerator = Accelerator(
+        mixed_precision="fp16",
+        gradient_accumulation_steps=gradient_accumulation_steps,
+        kwargs_handlers=[DistributedDataParallelKwargs(find_unused_parameters=False)],
+    )
+    # NOTE(review): num_gpus is not used anywhere else in this function.
+    num_gpus = accelerator.num_processes 
+
+    if random_seed is not None:
+        set_seed(random_seed, device_specific=True)
+    device = accelerator.device
+
+    
+
+    model = load_pretrained_model(cache_dir=config.model.cache_dir, device=device)
+
+
+    model = prepare_model(model, config)
+    if resume:
+        # strict=False: keys missing from / unexpected in the checkpoint are
+        # tolerated instead of raising.
+        model.load_state_dict(torch.load(resume, map_location='cpu'), strict=False)
+        torch.cuda.empty_cache()
+    
+    # model = model.to(device)
+
+
+    # time.sleep(100*3600)
+
+      
+
+    train_dataset = RealEstatePoseImageSevaDataset(rgb_data_dir=config.dataset.realestate10k.rgb_data_dir, 
+                                                    meta_info_dir=config.dataset.realestate10k.meta_info_dir,
+                                                    num_sample_per_episode=config.dataset.realestate10k.num_sample_per_episode,
+                                                    mode='train')
+    val_dataset = RealEstatePoseImageSevaDataset(rgb_data_dir=config.dataset.realestate10k.rgb_data_dir, 
+                                                    meta_info_dir=config.dataset.realestate10k.meta_info_dir, 
+                                                    num_sample_per_episode=config.dataset.realestate10k.val_num_sample_per_episode,
+                                                    mode='test')
+
+        
+    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, multiprocessing_context='spawn')
+    # NOTE(review): the validation loader is shuffled as well, so validation
+    # batches differ between runs unless the seed fixes the order — confirm
+    # this is intended.
+    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, multiprocessing_context='spawn')
+    
+    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=config.training.weight_decay)
+    # Step counts are measured in dataloader batches, taken before
+    # accelerator.prepare() wraps the loader.
+    # NOTE(review): assumes the scheduler is stepped once per batch and that
+    # the per-process loader length matches this value — TODO confirm against
+    # DiffusionTrainer and multi-GPU runs.
+    train_steps_per_epoch = len(train_dataloader)
+    total_train_steps = num_epochs * train_steps_per_epoch 
+    warmup_steps = warmup_epochs * train_steps_per_epoch
+    
+    # Cosine decay to zero over the steps that remain after warm-up.
+    lr_scheduler = CosineAnnealingLR(
+        optimizer, T_max=total_train_steps - warmup_steps, eta_min=0
+    )
+    
+    # lr_scheduler = ExponentialLR(optimizer, gamma=gamma)
+    if warmup_epochs > 0:
+        # Linear warm-up: LR multiplier is current_step / warmup_steps.
+        def warmup_lambda(current_step):
+            return float(current_step) / float(max(1, warmup_steps))
+        warmup_scheduler = LambdaLR(optimizer, lr_lambda=warmup_lambda)
+
+
+        # Combine the schedulers using SequentialLR
+        lr_scheduler = SequentialLR(
+            optimizer, schedulers=[warmup_scheduler, lr_scheduler], milestones=[warmup_steps]
+        )
+    # Auxiliary networks and diffusion components; the auto-encoder is put in
+    # eval mode.
+    vae = AutoEncoder(chunk_size=1).to(device)
+    vae.eval()
+    conditioner = CLIPConditioner().to(device)
+    discretization = DDPMDiscretization()
+    denoiser = DiscreteDenoiser(discretization=discretization, num_idx=1000, device=device)
+    sampler = create_samplers(guider_types=config.training.guider_types,
+                              discretization=discretization,
+                              num_frames=config.model.num_frames,
+                              num_steps=config.training.inference_num_steps,
+                              cfg_min=config.training.cfg_min,
+                              device=device)
+
+
+    # Let accelerate wrap the model, auto-encoder, loaders, optimizer and
+    # scheduler. The conditioner is not passed through prepare().
+    (model,
+    vae,
+    train_dataloader,
+    val_dataloader,
+    optimizer,
+    lr_scheduler) = accelerator.prepare(
+        model,
+        vae,
+        train_dataloader,
+        val_dataloader,
+        optimizer,
+        lr_scheduler,
+    )
+
+    
+    trainer = DiffusionTrainer(network=model,
+                               ae=vae,
+                               conditioner=conditioner,
+                               denoiser=denoiser,
+                               sampler=sampler,
+                               discretization=discretization,
+                               cfg=config.training.cfg,
+                               optimizer=optimizer,
+                               lr_scheduler=lr_scheduler,
+                               ema_decay=config.training.ema_decay,
+                               device=device,
+                               accelerator=accelerator,
+                               max_grad_norm=max_grad_norm,
+                               save_flag=save_flag,
+                               visualize_flag=visualization_flag)
+
+
+
+    # save_dir / visualize_dir are None when the corresponding flag is off.
+    trainer.train(train_dataloader, 
+                  num_epochs,
+                  unconditional_prob=config.training.uncond_prob,
+                  log_every=10, 
+                  validation_dataloader=val_dataloader, 
+                  validation_interval=validation_interval, 
+                  save_dir=run_weights_save_dir, 
+                  save_interval=config.training.save_every, 
+                  visualize_every=visualize_every, 
+                  visualize_dir=run_visualization_dir,
+                  use_wandb=use_wandb)
+
+
+if __name__ == "__main__":
+    main()
+    
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d221742edc00a79a221c7923cf04b026dd2f173b
--- /dev/null
+++ b/utils/__init__.py
@@ -0,0 +1 @@
+from .util import *
\ No newline at end of file
diff --git a/utils/__pycache__/__init__.cpython-310.pyc b/utils/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e470d6a68faf3107754f58b85abd3b7eb5b8012a
Binary files /dev/null and b/utils/__pycache__/__init__.cpython-310.pyc differ
diff --git a/utils/__pycache__/__init__.cpython-39.pyc b/utils/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9a618d679b74a7fa86824a55bb56ebbbef0a16f6
Binary files /dev/null and b/utils/__pycache__/__init__.cpython-39.pyc differ
diff --git a/utils/__pycache__/training_utils.cpython-310.pyc b/utils/__pycache__/training_utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a5678d5c2566f85ccb5da43c4ab67267a2a1bb43
Binary files /dev/null and b/utils/__pycache__/training_utils.cpython-310.pyc differ
diff --git a/utils/__pycache__/training_utils.cpython-39.pyc b/utils/__pycache__/training_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f6f9e2208034848144b1f6718717d0ced2cd8921
Binary files /dev/null and b/utils/__pycache__/training_utils.cpython-39.pyc differ
diff --git a/utils/__pycache__/util.cpython-310.pyc b/utils/__pycache__/util.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..af0c0063c92edad2a113152b0409704f07443fad
Binary files /dev/null and b/utils/__pycache__/util.cpython-310.pyc differ
diff --git a/utils/__pycache__/util.cpython-39.pyc b/utils/__pycache__/util.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..04f9842b9ce916322869d5f37f672db83eae245f
Binary files /dev/null and b/utils/__pycache__/util.cpython-39.pyc differ
diff --git a/utils/util.py b/utils/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..744eff2c147c40f606367f1b82a6c44be92b489e
--- /dev/null
+++ b/utils/util.py
@@ -0,0 +1,1363 @@
+from typing import Callable, Dict, List, Optional, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+import torchvision.transforms.functional as TF
+
+
+import kornia
+from matplotlib import cm
+from torchvision.io import write_video
+from PIL import Image, ImageOps
+import os
+from typing import Union, Tuple, List
+import math
+
+
+from matplotlib import pyplot as plt
+from mpl_toolkits.mplot3d.art3d import Poly3DCollection
+
+DEFAULT_FOV_RAD = 0.9424777960769379  # 54 degrees by default
+
+
+
+def get_default_intrinsics(
+    fov_rad=DEFAULT_FOV_RAD,
+    aspect_ratio=1.0,
+):
+    if not isinstance(fov_rad, torch.Tensor):
+        fov_rad = torch.tensor(
+            [fov_rad] if isinstance(fov_rad, (int, float)) else fov_rad
+        )
+    if aspect_ratio >= 1.0:  # W >= H
+        focal_x = 0.5 / torch.tan(0.5 * fov_rad)
+        focal_y = focal_x * aspect_ratio
+    else:  # W < H
+        focal_y = 0.5 / torch.tan(0.5 * fov_rad)
+        focal_x = focal_y / aspect_ratio
+    intrinsics = focal_x.new_zeros((focal_x.shape[0], 3, 3))
+    intrinsics[:, torch.eye(3, device=focal_x.device, dtype=bool)] = torch.stack(
+        [focal_x, focal_y, torch.ones_like(focal_x)], dim=-1
+    )
+    intrinsics[:, :, -1] = torch.tensor(
+        [0.5, 0.5, 1.0], device=focal_x.device, dtype=focal_x.dtype
+    )
+    return intrinsics
+
+def to_hom(X):
+    # get homogeneous coordinates of the input
+    X_hom = torch.cat([X, torch.ones_like(X[..., :1])], dim=-1)
+    return X_hom
+
+
+def to_hom_pose(pose):
+    # get homogeneous coordinates of the input pose
+    if pose.shape[-2:] == (3, 4):
+        pose_hom = torch.eye(4, device=pose.device)[None].repeat(pose.shape[0], 1, 1)
+        pose_hom[:, :3, :] = pose
+        return pose_hom
+    return pose
+
+
+
+def get_image_grid(img_h, img_w):
+    # add 0.5 is VERY important especially when your img_h and img_w
+    # is not very large (e.g., 72)!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+    y_range = torch.arange(img_h, dtype=torch.float32).add_(0.5)
+    x_range = torch.arange(img_w, dtype=torch.float32).add_(0.5)
+    Y, X = torch.meshgrid(y_range, x_range, indexing="ij")  # [H,W]
+    xy_grid = torch.stack([X, Y], dim=-1).view(-1, 2)  # [HW,2]
+    return to_hom(xy_grid)  # [HW,3]
+
+
+def img2cam(X, cam_intr):
+    return X @ cam_intr.inverse().transpose(-1, -2)
+
+
+def cam2world(X, pose):
+    X_hom = to_hom(X)
+    pose_inv = torch.linalg.inv(to_hom_pose(pose))[..., :3, :4]
+    return X_hom @ pose_inv.transpose(-1, -2)
+
+
+def get_center_and_ray(img_h, img_w, pose, intr):  # [HW,2]
+    # given the intrinsic/extrinsic matrices, get the camera center and ray directions]
+    # assert(opt.camera.model=="perspective")
+
+    # compute center and ray
+    grid_img = get_image_grid(img_h, img_w)  # [HW,3]
+    grid_3D_cam = img2cam(grid_img.to(intr.device), intr.float())  # [B,HW,3]
+    center_3D_cam = torch.zeros_like(grid_3D_cam)  # [B,HW,3]
+
+    # transform from camera to world coordinates
+    grid_3D = cam2world(grid_3D_cam, pose)  # [B,HW,3]
+    center_3D = cam2world(center_3D_cam, pose)  # [B,HW,3]
+    ray = grid_3D - center_3D  # [B,HW,3]
+
+    return center_3D, ray, grid_3D_cam
+
+def get_plucker_coordinates(
+    extrinsics_src,
+    extrinsics,
+    intrinsics=None,
+    fov_rad=DEFAULT_FOV_RAD,
+    target_size=[72, 72],
+):
+    # Support for batch dimension
+    has_batch_dim = len(extrinsics.shape) == 4
+    
+    if has_batch_dim:
+        # [B, N, 4, 4] -> reshape to handle batch
+        batch_size, num_cameras = extrinsics.shape[0:2]
+        extrinsics_flat = extrinsics.reshape(-1, *extrinsics.shape[2:])
+        
+        # Handle extrinsics_src appropriately
+        if len(extrinsics_src.shape) == 3:  # [B, 4, 4]
+            extrinsics_src_expanded = extrinsics_src.unsqueeze(1).expand(-1, num_cameras, -1, -1)
+            extrinsics_src_flat = extrinsics_src_expanded.reshape(-1, *extrinsics_src.shape[1:])
+        else:  # [4, 4] - single extrinsics_src for all batches
+            extrinsics_src_flat = extrinsics_src.expand(batch_size * num_cameras, -1, -1)
+        
+        # Handle intrinsics for batch
+        if intrinsics is None:
+            intrinsics = get_default_intrinsics(fov_rad).to(extrinsics.device)
+            intrinsics = intrinsics.expand(batch_size * num_cameras, -1, -1)
+        elif len(intrinsics.shape) == 3:  # [N, 3, 3]
+            if intrinsics.shape[0] == num_cameras:
+                intrinsics = intrinsics.expand(batch_size, -1, -1, -1).reshape(-1, *intrinsics.shape[1:])
+            else:
+                intrinsics = intrinsics.expand(batch_size * num_cameras, -1, -1)
+        elif len(intrinsics.shape) == 4:  # [B, N, 3, 3]
+            intrinsics = intrinsics.reshape(-1, *intrinsics.shape[2:])
+    else:
+        # Original behavior for non-batch input
+        extrinsics_flat = extrinsics
+        extrinsics_src_flat = extrinsics_src
+        if intrinsics is None:
+            intrinsics = get_default_intrinsics(fov_rad).to(extrinsics.device)
+    
+    # Process intrinsics normalization
+    if not (
+        torch.all(intrinsics[:, :2, -1] >= 0)
+        and torch.all(intrinsics[:, :2, -1] <= 1)
+    ):
+        intrinsics[:, :2] /= intrinsics.new_tensor(target_size).view(1, -1, 1) * 8
+    
+    # Ensure normalized intrinsics
+    assert (
+        torch.all(intrinsics[:, :2, -1] >= 0)
+        and torch.all(intrinsics[:, :2, -1] <= 1)
+    ), "Intrinsics should be expressed in resolution-independent normalized image coordinates."
+
+    c2w_src = torch.linalg.inv(extrinsics_src_flat)
+    # transform coordinates from the source camera's coordinate system to the coordinate system of the respective camera
+    extrinsics_rel = torch.einsum(
+        "vnm,vmp->vnp", extrinsics_flat, c2w_src
+    )
+
+    intrinsics[:, :2] *= extrinsics_flat.new_tensor(
+        [
+            target_size[1],  # w
+            target_size[0],  # h
+        ]
+    ).view(1, -1, 1)
+    
+    centers, rays, grid_cam = get_center_and_ray(
+        img_h=target_size[0],
+        img_w=target_size[1],
+        pose=extrinsics_rel[:, :3, :],
+        intr=intrinsics,
+    )
+
+    rays = torch.nn.functional.normalize(rays, dim=-1)
+    plucker = torch.cat((rays, torch.cross(centers, rays, dim=-1)), dim=-1)
+    plucker = plucker.permute(0, 2, 1).reshape(plucker.shape[0], -1, *target_size)
+    
+    # Reshape back to batch dimension if needed
+    if has_batch_dim:
+        plucker = plucker.reshape(batch_size, num_cameras, *plucker.shape[1:])
+    
+    return plucker
+
+
+def get_value_dict(
+    curr_imgs,
+    curr_imgs_clip,
+    curr_input_frame_indices,
+    curr_c2ws,
+    curr_Ks,
+    curr_input_camera_indices,
+    all_c2ws,
+    camera_scale,
+):
+    """Assemble the conditioning dictionary for one window of frames.
+
+    The camera poses are re-centred on the mean position of ``all_c2ws``
+    (ignoring far outliers), their translations are rescaled using
+    ``camera_scale``, and Plucker coordinates relative to the first camera are
+    computed at 1/8 of the image resolution.
+
+    Returns:
+        dict with the keys ``cond_frames_without_noise``, ``cond_frames``,
+        ``cond_frames_mask``, ``cond_aug``, ``plucker_coordinate``, ``c2w``,
+        ``K`` and ``camera_mask``.
+    """
+    # The camera indices must be exactly 0..len-1 (in any order).
+    assert sorted(curr_input_camera_indices) == sorted(
+        range(len(curr_input_camera_indices))
+    )
+    # F is the spatial downsampling factor used for the Plucker grid below
+    # (it shadows any module-level name `F` inside this function).
+    H, W, T, F = curr_imgs.shape[-2], curr_imgs.shape[-1], len(curr_imgs), 8
+
+    value_dict = {}
+    value_dict["cond_frames_without_noise"] = curr_imgs_clip[curr_input_frame_indices]
+    # The noise term is multiplied by 0.0, so this is a copy of curr_imgs.
+    value_dict["cond_frames"] = curr_imgs + 0.0 * torch.randn_like(curr_imgs)
+    value_dict["cond_frames_mask"] = torch.zeros(T, dtype=torch.bool)
+    value_dict["cond_frames_mask"][curr_input_frame_indices] = True
+    value_dict["cond_aug"] = 0.0
+
+    # NOTE(review): a [T, 3, 4] pose has shape[-1] == 4, so this branch looks
+    # unreachable for such input — confirm the intended check.
+    if curr_c2ws.shape[-1] == 3:
+        c2w = to_hom_pose(curr_c2ws.float())
+    else:
+        # c2w aliases curr_c2ws here, so the in-place edits below also modify
+        # the caller's tensor.
+        c2w = curr_c2ws
+    # This value is overwritten after the centering step below.
+    w2c = torch.linalg.inv(c2w)
+
+    # camera centering
+    ref_c2ws = all_c2ws
+    # Distance of every reference camera position to the per-axis median position.
+    camera_dist_2med = torch.norm(
+        ref_c2ws[:, :3, 3] - ref_c2ws[:, :3, 3].median(0, keepdim=True).values,
+        dim=-1,
+    )
+    # Keep cameras within 10x the 97th-percentile distance (capped at 1e6).
+    valid_mask = camera_dist_2med <= torch.clamp(
+        torch.quantile(camera_dist_2med, 0.97) * 10,
+        max=1e6,
+    )
+    # Shift the current cameras so the mean valid reference position is the origin.
+    c2w[:, :3, 3] -= ref_c2ws[valid_mask, :3, 3].mean(0, keepdim=True)
+    w2c = torch.linalg.inv(c2w)
+
+    # camera normalization
+    camera_dists = c2w[:, :3, 3].clone()
+    # If the first camera sits (almost) at the origin, use camera_scale
+    # directly; otherwise scale so that the first camera's distance to the
+    # origin becomes camera_scale.
+    # NOTE(review): torch.zeros(1) is created on the default device; presumably
+    # the poses are on the same device — verify.
+    translation_scaling_factor = (
+        camera_scale
+        if torch.isclose(
+            torch.norm(camera_dists[0]),
+            torch.zeros(1),
+            atol=1e-5,
+        ).any()
+        else (camera_scale / torch.norm(camera_dists[0]))
+    )
+    w2c[:, :3, 3] *= translation_scaling_factor
+    c2w[:, :3, 3] *= translation_scaling_factor
+    # Intrinsics are cloned before being handed over, so curr_Ks is not
+    # affected by anything get_plucker_coordinates does to them.
+    value_dict["plucker_coordinate"] = get_plucker_coordinates(
+        extrinsics_src=w2c[0],
+        extrinsics=w2c,
+        intrinsics=curr_Ks.float().clone(),
+        target_size=(H // F, W // F),
+    )
+
+    value_dict["c2w"] = c2w
+    value_dict["K"] = curr_Ks
+    value_dict["camera_mask"] = torch.zeros(T, dtype=torch.bool)
+    value_dict["camera_mask"][curr_input_camera_indices] = True
+
+    return value_dict
+
+def parse_meta_data(file_path, image_height=288, image_width=512):
+    with open(file_path, 'r') as file:
+        lines = file.readlines()
+    
+    # First line is the video URL
+    video_url = lines[0].strip()
+    
+    line = lines[1]
+    data = line.strip().split()
+    # Construct the camera intrinsics matrix K
+    focal_length_x = float(data[1])
+    focal_length_y = float(data[2])
+    principal_point_x = float(data[3])
+    principal_point_y = float(data[4])
+    
+    
+
+    original_K = [
+        [focal_length_x, 0, principal_point_x],
+        [0, focal_length_y, principal_point_y],
+        [0, 0, 1]
+    ]
+    
+    K = [
+        [focal_length_x * image_width, 0, principal_point_x * image_width],
+        [0, focal_length_y * image_height, principal_point_y * image_height],
+        [0, 0, 1]
+    ]
+    
+    # Initialize a list to store frame data
+    timestamp_to_c2ws = {}
+    timestamps = []
+    # Process each frame line
+    for line in lines[1:]:
+        data = line.strip().split()
+        timestamp = int(data[0])
+        R_t = [float(x) for x in data[7:]]
+        P = [
+            R_t[0:4],
+            R_t[4:8],
+            R_t[8:12],
+            [0, 0, 0, 1]
+        ]
+        timestamp_to_c2ws[timestamp] = np.array(P)
+        timestamps.append(timestamp)
+    return timestamps, np.array(K), timestamp_to_c2ws, original_K
+
+
+def get_wh_with_fixed_shortest_side(w, h, size):
+    # size is smaller or equal to zero, we return original w h
+    if size is None or size <= 0:
+        return w, h
+    if w < h:
+        new_w = size
+        new_h = int(size * h / w)
+    else:
+        new_h = size
+        new_w = int(size * w / h)
+    return new_w, new_h
+
+def get_resizing_factor(
+    target_shape: Tuple[int, int],  # H, W
+    current_shape: Tuple[int, int],  # H, W
+    cover_target: bool = True,
+    # If True, the output shape will fully cover the target shape.
+    # If No, the target shape will fully cover the output shape.
+) -> float:
+    r_bound = target_shape[1] / target_shape[0]
+    aspect_r = current_shape[1] / current_shape[0]
+    if r_bound >= 1.0:
+        if cover_target:
+            if aspect_r >= r_bound:
+                factor = min(target_shape) / min(current_shape)
+            elif aspect_r < 1.0:
+                factor = max(target_shape) / min(current_shape)
+            else:
+                factor = max(target_shape) / max(current_shape)
+        else:
+            if aspect_r >= r_bound:
+                factor = max(target_shape) / max(current_shape)
+            elif aspect_r < 1.0:
+                factor = min(target_shape) / max(current_shape)
+            else:
+                factor = min(target_shape) / min(current_shape)
+    else:
+        if cover_target:
+            if aspect_r <= r_bound:
+                factor = min(target_shape) / min(current_shape)
+            elif aspect_r > 1.0:
+                factor = max(target_shape) / min(current_shape)
+            else:
+                factor = max(target_shape) / max(current_shape)
+        else:
+            if aspect_r <= r_bound:
+                factor = max(target_shape) / max(current_shape)
+            elif aspect_r > 1.0:
+                factor = min(target_shape) / max(current_shape)
+            else:
+                factor = min(target_shape) / min(current_shape)
+    return factor
+
+def transform_img_and_K(
+    image: torch.Tensor,
+    size: Union[int, Tuple[int, int]],
+    scale: float = 1.0,
+    center: Tuple[float, float] = (0.5, 0.5),
+    K: Union[torch.Tensor, np.ndarray, None] = None,
+    size_stride: int = 1,
+    mode: str = "crop",
+):
+    """Resize an image tensor to a target size by crop, pad or stretch, updating K to match.
+
+    Args:
+        image: Image tensor whose last two dimensions are (height, width); it
+            is passed to ``interpolate`` with a 2-element size.
+        size: Target size. A tuple/list is unpacked as ``(W, H)``; an int is
+            the length of the shortest side.
+        scale: The resized height/width are divided by this value before
+            cropping or padding.
+        center: Relative ``(x, y)`` position of the crop/pad centre.
+        K: Optional batch of intrinsics indexed as ``K[:, :2, ...]``; cloned
+            before modification. ``.clone()`` and ``new_tensor`` are called on
+            it, so a torch tensor is expected despite the annotation.
+        size_stride: Target width/height are rounded to a multiple of this.
+        mode: One of ``"crop"``, ``"pad"`` or ``"stretch"``.
+
+    Returns:
+        Tuple ``(image, K)`` with the transformed image and the adjusted
+        intrinsics (``None`` when no K was given).
+    """
+    assert mode in [
+        "crop",
+        "pad",
+        "stretch",
+    ], f"mode should be one of ['crop', 'pad', 'stretch'], got {mode}"
+
+    h, w = image.shape[-2:]
+    if isinstance(size, (tuple, list)):
+        # => if size is a tuple or list, we first rescale to fully cover the `size`
+        # area and then crop the `size` area from the rescale image
+        # NOTE(review): the tuple is read as (W, H); a caller passing
+        # (height, width) gets the two swapped — confirm call sites.
+        W, H = size
+    else:
+        # => if size is int, we rescale the image to fit the shortest side to size
+        # => if size is None, no rescaling is applied
+        W, H = get_wh_with_fixed_shortest_side(w, h, size)
+    # Round both target sides to the nearest multiple of size_stride.
+    W, H = (
+        math.floor(W / size_stride + 0.5) * size_stride,
+        math.floor(H / size_stride + 0.5) * size_stride,
+    )
+
+    if mode == "stretch":
+        # Stretch ignores the aspect ratio and resizes straight to the target.
+        rh, rw = H, W
+    else:
+        # "crop" resizes so the image covers the target; "pad" so it fits inside.
+        rfs = get_resizing_factor(
+            (H, W),
+            (h, w),
+            cover_target=mode != "pad",
+        )
+        (rh, rw) = [int(np.ceil(rfs * s)) for s in (h, w)]
+
+    rh, rw = int(rh / scale), int(rw / scale)
+    image = torch.nn.functional.interpolate(
+        image, (rh, rw), mode="area", antialias=False
+    )
+
+    # Pixel position of the requested centre in the resized image.
+    cy_center = int(center[1] * image.shape[-2])
+    cx_center = int(center[0] * image.shape[-1])
+    if mode != "pad":
+        # Crop an H x W window around the centre, clamped to the image bounds.
+        ct = max(0, cy_center - H // 2)
+        cl = max(0, cx_center - W // 2)
+        ct = min(ct, image.shape[-2] - H)
+        cl = min(cl, image.shape[-1] - W)
+        image = TF.crop(image, top=ct, left=cl, height=H, width=W)
+        pl, pt = 0, 0
+    else:
+        # Pad (left, top, right, bottom) up to H x W around the centre.
+        pt = max(0, H // 2 - cy_center)
+        pl = max(0, W // 2 - cx_center)
+        pb = max(0, H - pt - image.shape[-2])
+        pr = max(0, W - pl - image.shape[-1])
+        image = TF.pad(
+            image,
+            [pl, pt, pr, pb],
+        )
+        cl, ct = 0, 0
+
+    if K is not None:
+        K = K.clone()
+        # K[:, :2, 2] += K.new_tensor([pl, pt])
+        # A principal point inside [0, 1] is taken to mean normalized
+        # intrinsics, which are scaled to the resized resolution; otherwise
+        # K is rescaled by the resize ratio.
+        if torch.all(K[:, :2, -1] >= 0) and torch.all(K[:, :2, -1] <= 1):
+            K[:, :2] *= K.new_tensor([rw, rh])[None, :, None]  # normalized K
+        else:
+            K[:, :2] *= K.new_tensor([rw / w, rh / h])[None, :, None]  # unnormalized K
+        # Shift the principal point by the padding added / the crop offset removed.
+        K[:, :2, 2] += K.new_tensor([pl - cl, pt - ct])
+
+    return image, K
+
+
+def load_img_and_K(
+    image_path_or_size: Union[str, torch.Size],
+    size: Optional[Union[int, Tuple[int, int]]],
+    scale: float = 1.0,
+    center: Tuple[float, float] = (0.5, 0.5),
+    K: Union[torch.Tensor, np.ndarray, None] = None,
+    size_stride: int = 1,
+    center_crop: bool = False,
+    image_as_tensor: bool = True,
+    context_rgb: Union[np.ndarray, None] = None,
+    device: str = "cuda",
+):
+    """Load an image (or create a blank one), resize/crop it and adjust K accordingly.
+
+    Args:
+        image_path_or_size: Path of the image to open, or a ``torch.Size``
+            (reversed before being handed to PIL) for a blank RGBA image.
+        size: Target size. A tuple/list is unpacked as ``(W, H)``; an int is
+            the length of the shortest side; ``None`` keeps the original size.
+        scale: Scale applied to the target size before the resize factor is
+            computed; values below 1 additionally pad the result with 1.0.
+        center: Relative ``(x, y)`` position of the crop centre.
+        K: Optional single 3x3 intrinsics matrix indexed as ``K[:2, ...]``;
+            cloned before modification. ``.clone()`` and ``new_tensor`` are
+            called on it, so a torch tensor is expected despite the annotation.
+        size_stride: Target width/height are rounded to a multiple of this.
+        center_crop: Crop a square of side ``min(H, W)`` instead of ``H x W``.
+        image_as_tensor: Return a tensor in ``[-1, 1]`` on ``device`` when
+            True, otherwise a PIL image.
+        context_rgb: Optional background composited behind transparent pixels;
+            white (1.0) is used when omitted.
+        device: Device the tensor output is moved to.
+
+    Returns:
+        Tuple ``(image, K)``.
+    """
+    if isinstance(image_path_or_size, torch.Size):
+        image = Image.new("RGBA", image_path_or_size[::-1])
+    else:
+        image = Image.open(image_path_or_size).convert("RGBA")
+
+    w, h = image.size
+    if size is None:
+        size = (w, h)
+
+    image = np.array(image).astype(np.float32) / 255
+    if image.shape[-1] == 4:
+        # Alpha-composite the RGB channels over the background.
+        rgb, alpha = image[:, :, :3], image[:, :, 3:]
+        if context_rgb is not None:
+            image = rgb * alpha + context_rgb * (1 - alpha)
+        else:
+            image = rgb * alpha + (1 - alpha)
+    # HWC -> 1CHW float tensor.
+    image = image.transpose(2, 0, 1)
+    image = torch.from_numpy(image).to(dtype=torch.float32)
+    image = image.unsqueeze(0)
+
+    if isinstance(size, (tuple, list)):
+        # => if size is a tuple or list, we first rescale to fully cover the `size`
+        # area and then crop the `size` area from the rescale image
+        W, H = size
+    else:
+        # => if size is int, we rescale the image to fit the shortest side to size
+        # => if size is None, no rescaling is applied
+        W, H = get_wh_with_fixed_shortest_side(w, h, size)
+    # Round both target sides to the nearest multiple of size_stride.
+    W, H = (
+        math.floor(W / size_stride + 0.5) * size_stride,
+        math.floor(H / size_stride + 0.5) * size_stride,
+    )
+
+    # Resize so the image covers the (scaled) target area.
+    rfs = get_resizing_factor((math.floor(H * scale), math.floor(W * scale)), (h, w))
+    resize_size = rh, rw = [int(np.ceil(rfs * s)) for s in (h, w)]
+    image = torch.nn.functional.interpolate(
+        image, resize_size, mode="area", antialias=False
+    )
+    if scale < 1.0:
+        # Pad symmetrically with 1.0 by half the gap to the target size.
+        # NOTE(review): K is not shifted by this padding below — confirm
+        # whether that is intended when scale < 1.
+        pw = math.ceil((W - resize_size[1]) * 0.5)
+        ph = math.ceil((H - resize_size[0]) * 0.5)
+        image = F.pad(image, (pw, pw, ph, ph), "constant", 1.0)
+
+    # Pixel position of the requested centre in the resized image.
+    cy_center = int(center[1] * image.shape[-2])
+    cx_center = int(center[0] * image.shape[-1])
+    if center_crop:
+        # Square crop of side min(H, W), clamped to the image bounds.
+        side = min(H, W)
+        ct = max(0, cy_center - side // 2)
+        cl = max(0, cx_center - side // 2)
+        ct = min(ct, image.shape[-2] - side)
+        cl = min(cl, image.shape[-1] - side)
+        image = TF.crop(image, top=ct, left=cl, height=side, width=side)
+    else:
+        # H x W crop around the centre, clamped to the image bounds.
+        ct = max(0, cy_center - H // 2)
+        cl = max(0, cx_center - W // 2)
+        ct = min(ct, image.shape[-2] - H)
+        cl = min(cl, image.shape[-1] - W)
+        image = TF.crop(image, top=ct, left=cl, height=H, width=W)
+
+    if K is not None:
+        K = K.clone()
+        # A principal point inside [0, 1] is taken to mean normalized
+        # intrinsics, which are scaled to the resized resolution; otherwise
+        # K is rescaled by the resize ratio.
+        if torch.all(K[:2, -1] >= 0) and torch.all(K[:2, -1] <= 1):
+            K[:2] *= K.new_tensor([rw, rh])[:, None]  # normalized K
+        else:
+            K[:2] *= K.new_tensor([rw / w, rh / h])[:, None]  # unnormalized K
+        # Account for the crop offset.
+        K[:2, 2] -= K.new_tensor([cl, ct])
+
+    if image_as_tensor:
+        # tensor of shape (1, 3, H, W) with values ranging from (-1, 1)
+        image = image.to(device) * 2.0 - 1.0
+    else:
+        # PIL Image with values ranging from (0, 255)
+        image = image.permute(0, 2, 3, 1).numpy()[0]
+        image = Image.fromarray((image * 255).astype(np.uint8))
+    return image, K
+
+
+
+
+def geodesic_distance(extrinsic1: Union[np.ndarray, torch.Tensor],
+                      extrinsic2: Union[np.ndarray, torch.Tensor],
+                      weight_translation: float = 0.01,):
+    """
+    Weighted distance between two camera poses given as 4x4 extrinsics.
+
+    The value is ``weight_translation * ||t1 - t2|| + theta``, where ``theta``
+    is the rotation angle (in radians) of the relative rotation ``R1^T @ R2``.
+
+    Parameters:
+        extrinsic1 (Union[np.ndarray, torch.Tensor]): 4x4 extrinsic matrix of the first pose.
+        extrinsic2 (Union[np.ndarray, torch.Tensor]): 4x4 extrinsic matrix of the second pose.
+            It is processed with the same backend (torch or NumPy) as ``extrinsic1``.
+        weight_translation (float): Factor applied to the translation distance
+            before it is added to the rotation angle.
+
+    Returns:
+        Union[float, torch.Tensor]: Combined distance between the two poses
+        (a tensor when ``extrinsic1`` is a tensor).
+
+    Note:
+        The 3x3 rotation blocks are used as given; the trace is only clamped,
+        the matrices themselves are not re-orthonormalised.
+    """
+    # Slicing works the same way for tensors and arrays, so split each pose
+    # into rotation block and translation column once, ahead of the branch.
+    rot_a = extrinsic1[:3, :3]
+    rot_b = extrinsic2[:3, :3]
+    offset = extrinsic1[:3, 3] - extrinsic2[:3, 3]
+
+    if torch.is_tensor(extrinsic1):
+        # Euclidean distance between the two translation vectors
+        translation_distance = torch.norm(offset)
+
+        # Rotation taking one camera orientation to the other
+        rel_rot = torch.matmul(rot_a.T, rot_b)
+
+        # trace(R) = 1 + 2*cos(theta); clamping to [-1, 3] keeps the acos
+        # argument inside [-1, 1] despite round-off
+        rel_trace = torch.clamp(torch.trace(rel_rot), -1.0, 3.0)
+        angular_distance = torch.acos((rel_trace - 1) / 2)
+    else:
+        # Euclidean distance between the two translation vectors
+        translation_distance = np.linalg.norm(offset)
+
+        # Rotation taking one camera orientation to the other
+        rel_rot = np.dot(rot_a.T, rot_b)
+
+        # Same trace identity and clamp as in the tensor branch
+        rel_trace = np.clip(np.trace(rel_rot), -1.0, 3.0)
+        angular_distance = np.arccos((rel_trace - 1) / 2)
+
+    # Translation is scaled down, the rotation angle enters unscaled
+    weighted_translation = translation_distance * weight_translation
+    return weighted_translation + angular_distance
+
+
+def inverse_geodesic_distance(extrinsic1,
+                              extrinsic2,
+                              weight_translation=0.01):
+    """
+    Reciprocal of ``geodesic_distance``, so that closer poses score higher.
+
+    Parameters:
+        extrinsic1: 4x4 extrinsic matrix of the first pose.
+        extrinsic2: 4x4 extrinsic matrix of the second pose.
+        weight_translation: Forwarded unchanged to ``geodesic_distance``.
+
+    Returns:
+        ``1 / (distance + 1e-6)``; the added constant keeps the result finite
+        when the distance is zero.
+    """
+    # Small offset guarding the division against a zero distance
+    eps = 1e-6
+
+    distance = geodesic_distance(extrinsic1, extrinsic2, weight_translation)
+    return 1.0 / (distance + eps)
+
+
+
+def average_camera_pose(camera_poses):
+    """
+    Average a set of camera poses in SE(3).
+
+    Translations are averaged arithmetically. Rotations are converted to
+    quaternions, flipped into the hemisphere of the first one (q and -q encode
+    the same rotation), averaged component-wise and re-normalised.
+
+    Args:
+        camera_poses: ``(N, 4, 4)`` torch.Tensor or np.ndarray, or a list of
+            4x4 poses (tensors or arrays).
+
+    Returns:
+        np.ndarray: Average camera pose as a 4x4 matrix.
+
+    Raises:
+        ValueError: If no pose is supplied.
+    """
+    import scipy.spatial.transform as transform
+
+    # Bring every accepted container to one (N, 4, 4) NumPy array
+    if torch.is_tensor(camera_poses):
+        camera_poses = camera_poses.detach().cpu().numpy()
+    elif isinstance(camera_poses, (list, tuple)):
+        camera_poses = [p.detach().cpu().numpy() if torch.is_tensor(p) else p for p in camera_poses]
+    camera_poses = np.asarray(camera_poses)
+    if camera_poses.shape[0] == 0:
+        raise ValueError("average_camera_pose requires at least one pose")
+
+    quats = np.array(transform.Rotation.from_matrix(camera_poses[:, :3, :3]).as_quat())
+    # Negate quaternions that point away from the first one before averaging
+    quats[quats @ quats[0] < 0] *= -1
+    avg_quat = quats.mean(axis=0)
+    avg_pose = np.eye(4)
+    avg_pose[:3, :3] = transform.Rotation.from_quat(avg_quat / np.linalg.norm(avg_quat)).as_matrix()
+    avg_pose[:3, 3] = camera_poses[:, :3, 3].mean(axis=0)
+    return avg_pose
+        
+
+
+
+def encode_image(
+    image,
+    image_encoder,
+    device,
+    dtype,
+) -> torch.Tensor:
+    """Move ``image`` to ``device``/``dtype`` and run it through ``image_encoder``.
+
+    ``image`` must provide ``.to(device=..., dtype=...)``; whatever the
+    encoder returns for the moved input is handed back unchanged.
+    """
+    moved = image.to(device=device, dtype=dtype)
+    return image_encoder(moved)
+
+
+def encode_vae_image(
+    image,
+    vae,
+    device,
+    dtype,
+):
+    """Move ``image`` to ``device``/``dtype`` and encode it with ``vae``.
+
+    Returns ``vae.encode(image, 1)``; the second argument is always the
+    literal ``1``.
+    """
+    return vae.encode(image.to(device=device, dtype=dtype), 1)
+
+
+
+
+def do_sample(
+    model,
+    ae,
+    denoiser,
+    sampler,
+    c,
+    uc,
+    c2w,
+    K,
+    cond_frames_mask,
+    H=576,
+    W=768,
+    C=4,
+    F=8,
+    T=8,
+    cfg=2.0,
+    decoding_t=1,
+    verbose=True,
+    global_pbar=None,
+    return_latents=False,
+    device: str = "cuda",
+    **_,
+):
+    """Sample ``T`` latent frames with ``sampler`` and decode them with ``ae``.
+
+    Noise of shape ``(T, C, H // F, W // F)`` and the camera inputs (``c2w``,
+    ``K``, ``cond_frames_mask``) are placed on ``device`` before sampling.
+    Unrecognised keyword arguments are accepted and ignored.
+
+    Returns ``None`` if the sampler returns ``None``; otherwise the decoded
+    samples, or ``(samples, latents)`` when ``return_latents`` is true.
+    """
+    # Mixed precision is CUDA-only here; resolving the device type first lets
+    # indexed names such as "cuda:1" work and leaves CPU runs in full precision.
+    use_amp = torch.device(device).type == "cuda"
+    with torch.inference_mode(), torch.autocast("cuda", enabled=use_amp):
+        additional_model_inputs = {"num_frames": T}
+        additional_sampler_inputs = {
+            "c2w": c2w.to(device),
+            "K": K.to(device),
+            "input_frame_mask": cond_frames_mask.to(device),
+        }
+        if global_pbar is not None:
+            additional_sampler_inputs["global_pbar"] = global_pbar
+
+        # Noise is created on torch's default device and then moved to ``device``
+        randn = torch.randn((T, C, H // F, W // F)).to(device)
+        samples_z = sampler(
+            lambda input, sigma, c: denoiser(
+                model, input, sigma, c, **additional_model_inputs
+            ),
+            randn,
+            scale=cfg,
+            cond=c,
+            uc=uc,
+            verbose=verbose,
+            **additional_sampler_inputs,
+        )
+        if samples_z is None:
+            return None
+        samples = ae.decode(samples_z, decoding_t)
+    return (samples, samples_z) if return_latents else samples
+
+
+def decode_output(
+    samples,
+    T,
+    indices=None,
+):
+    """Return model output as a dict of tensors, optionally frame-selected.
+
+    A dict is updated in place and returned; a bare tensor is wrapped under
+    ``"samples-rgb/image"``. With ``indices`` set, values whose first
+    dimension equals ``T`` are indexed with it.
+    """
+    def _pick(value):
+        # only entries holding one item per frame are sub-selected
+        return value[indices] if indices is not None and value.shape[0] == T else value
+
+    if not isinstance(samples, dict):
+        # model without postprocessor: plain (rgb) tensor
+        return {"samples-rgb/image": _pick(samples.detach().cpu())}
+    # model with postprocessor: values may be tensors, arrays or python data
+    for key, value in samples.items():
+        if isinstance(value, torch.Tensor):
+            value = value.detach().cpu()
+        elif isinstance(value, np.ndarray):
+            value = torch.from_numpy(value)
+        else:
+            value = torch.tensor(value)
+        samples[key] = _pick(value)
+    return samples
+
+def select_frames(timestamps, min_num_frames=2, skip_frame=10, random_start=False):
+    """
+    Select every ``skip_frame``-th frame of a sequence.
+
+    Args:
+        timestamps: List of timestamps for the frames
+        min_num_frames: Minimum number of frames required
+        skip_frame: Stride between selected frames; for sequences shorter than
+            this, the stride becomes ``num_frames - 1`` (first and last frame)
+        random_start: If True, start from a random frame below the stride
+
+    Returns:
+        tuple: (selected_frame_indices, selected_frame_timestamps) or (None, None) if criteria not met
+    """
+    num_frames = len(timestamps)
+    # At least two frames are always needed, whatever ``min_num_frames`` says
+    if num_frames < min_num_frames:
+        print(f"[Worker PID={os.getpid()}] Episode has less than {min_num_frames} frames")
+        return None, None
+    if num_frames < 2:
+        print(f"[Worker PID={os.getpid()}] Episode has less than 2 frames")
+        return None, None
+
+    # Shrink the stride for short sequences so two frames are still returned
+    cur_skip_frame = skip_frame if num_frames >= skip_frame else num_frames - 1
+
+    # The random offset is bounded by the *effective* stride (never larger
+    # than the sequence), so it always points at an existing frame.
+    if random_start:
+        start_frame = np.random.randint(0, cur_skip_frame)
+    else:
+        start_frame = 0
+
+    # Gather frame indices
+    selected_frame_indices = list(range(start_frame, num_frames, cur_skip_frame))
+    selected_frame_timestamps = [timestamps[i] for i in selected_frame_indices]
+
+    return selected_frame_indices, selected_frame_timestamps
+
+
+def tensor2im(input_image, imtype=np.uint8):
+    """Turn the first item of a batched CHW tensor (clamped to [0, 1]) into an HWC array.
+
+    NumPy input is only cast to ``imtype``; other non-tensor input is returned as-is.
+    """
+    if isinstance(input_image, np.ndarray):
+        return input_image.astype(imtype)
+    if not isinstance(input_image, torch.Tensor):
+        return input_image
+    chw = input_image.data[0].clamp(0.0, 1.0).cpu().float().numpy()
+    return (np.transpose(chw, (1, 2, 0)) * 255.0).astype(imtype)
+
+
+class LatentStorer:  # callable that remembers the latest ``latent`` it was called with
+    def __init__(self):
+        self.latent = None  # nothing stored until the first call
+    # ``i`` and ``t`` are accepted but unused; each call overwrites ``latent``.
+    def __call__(self, i, t, latent):
+        self.latent = latent
+
+
+def sobel_filter(disp, mode="sobel", beta=10.0):
+    """Return ``exp(-beta * |grad disp|)`` (detached), using kornia's spatial gradient."""
+    grads = kornia.filters.spatial_gradient(disp, mode=mode, normalized=False)
+    grad_x, grad_y = grads[:, :, 0], grads[:, :, 1]
+    magnitude = torch.sqrt(grad_x ** 2 + grad_y ** 2)
+    return torch.exp(-1.0 * beta * magnitude).detach()
+
+
+def apply_colormap(image, cmap="viridis"):
+    """Map ``image[..., 0]`` (values in [0, 1]) to the colours of a listed colormap."""
+    # NOTE(review): `cm` is presumably matplotlib.cm, whose get_cmap was removed in 3.9 - confirm
+    palette = torch.tensor(cm.get_cmap(cmap).colors).to(image.device)
+    bins = (image * 255).long()  # row index into the palette
+    lowest, highest = torch.min(bins), torch.max(bins)
+    assert lowest >= 0, f"the min value is {lowest}"
+    assert highest <= 255, f"the max value is {highest}"
+    return palette[bins[..., 0]]
+
+
+def apply_depth_colormap(
+    depth,
+    near_plane=None,
+    far_plane=None,
+    cmap="viridis",
+):
+    """Normalise ``depth`` to [0, 1] between the two planes and colour it with ``cmap``.
+
+    A plane left as ``None`` defaults to the min / max of ``depth``. An explicit
+    ``0.0`` is a valid plane and is used as given.
+    """
+    near_plane = float(torch.min(depth)) if near_plane is None else near_plane
+    far_plane = float(torch.max(depth)) if far_plane is None else far_plane
+    depth = torch.clip((depth - near_plane) / (far_plane - near_plane + 1e-10), 0, 1)
+    return apply_colormap(depth, cmap=cmap)
+
+
+def save_video(video, path, fps=10):
+    """Write ``video`` to ``path`` with libx264 after permuting its axes to (0, 2, 3, 1)."""
+    # crf = Constant Rate Factor (lower value = higher quality)
+    encoder_options = {"crf": "23", "preset": "slow"}
+    frames_last = video.permute(0, 2, 3, 1)
+    write_video(
+        str(path), frames_last, fps=fps, video_codec="libx264", options=encoder_options
+    )
+
+
+
+    
+    
+def visualize_camera_poses(camera_poses, axis_length=0.1):
+    """
+    Plot camera poses as small coordinate frames in a Matplotlib 3D figure.
+
+    Each pose is drawn as a black dot at its translation plus three line
+    segments along the columns of its rotation block: X red, Y green, Z blue.
+
+    Parameters
+    ----------
+    camera_poses : np.ndarray or torch.Tensor
+        An array of shape (N, 4, 4) containing N camera poses.
+        Each pose is a 4x4 transformation matrix.
+    axis_length : float
+        Length of the camera axes to draw.
+
+    Returns
+    -------
+    None
+        The figure is displayed with ``plt.show()``.
+    """
+    if isinstance(camera_poses, torch.Tensor):
+        camera_poses = camera_poses.detach().cpu().numpy()
+
+    # Create a 3D figure
+    figure = plt.figure()
+    axes3d = figure.add_subplot(111, projection='3d')
+
+    for pose in camera_poses:
+        rotation = pose[:3, :3]
+        origin = pose[:3, 3]
+
+        # Camera centre
+        axes3d.scatter(origin[0], origin[1], origin[2], c='k', marker='o', s=20)
+
+        # One segment per local axis: centre -> centre + axis direction * axis_length
+        for column, colour in enumerate(('r', 'g', 'b')):
+            tip = origin + rotation[:, column] * axis_length
+            axes3d.plot([origin[0], tip[0]], [origin[1], tip[1]],
+                        [origin[2], tip[2]], color=colour)
+
+    # Make axes have equal scale
+    set_axes_equal(axes3d)
+
+    axes3d.set_title("Camera Poses Visualization")
+    axes3d.set_xlabel("X")
+    axes3d.set_ylabel("Y")
+    axes3d.set_zlabel("Z")
+    plt.show()
+
+def set_axes_equal(ax):
+    """
+    Give the x, y and z axes of a 3D plot the same span.
+
+    Each axis keeps its current midpoint but is resized to the largest of the
+    three current ranges, so that spheres appear as spheres, cubes as cubes.
+    This is a workaround to Matplotlib's set_aspect('equal') which is not
+    supported in 3D.
+    """
+    # Read all three limit pairs before changing any of them
+    limits = (ax.get_xlim3d(), ax.get_ylim3d(), ax.get_zlim3d())
+    setters = (ax.set_xlim3d, ax.set_ylim3d, ax.set_zlim3d)
+
+    # Largest extent among the three axes
+    max_range = max(high - low for low, high in limits)
+    half_span = 0.5 * max_range
+
+    # Re-centre every axis on its own midpoint with the common span
+    for apply_limits, axis_limits in zip(setters, limits):
+        middle = np.mean(axis_limits)
+        apply_limits([middle - half_span, middle + half_span])
+
+
+def tensor_to_pil(image):
+    """Convert a CHW (or 1xCHW) tensor in [-1, 1] or [0, 1] to a PIL image.
+
+    Non-tensor input is handed to ``Image.fromarray`` untouched.
+    """
+    if not isinstance(image, torch.Tensor):
+        return Image.fromarray(image)
+
+    chw = image.squeeze(0) if image.dim() == 4 else image
+    pixels = chw.permute(1, 2, 0).detach().cpu().numpy()
+    # Values clearly below zero are taken as a sign of the [-1, 1] range
+    if pixels.min() < -0.1:
+        pixels = (pixels + 1) / 2.0
+    pixels = np.clip(pixels * 255, 0, 255).astype(np.uint8)
+    return Image.fromarray(pixels)
+
+
+
+def center_crop_pil_image(input_image, target_width=1024, target_height=576):
+    """Resize ``input_image`` to cover the target box, then centre-fit it to that box.
+
+    The side with the smaller scale ratio is resized to its target length
+    (LANCZOS); ``ImageOps.fit`` then brings the result to the exact target size.
+    """
+    src_w, src_h = input_image.size
+    ratio_h = src_h / target_height
+    ratio_w = src_w / target_width
+
+    if ratio_h > ratio_w:
+        # width is the limiting side: match it, never let the height fall short
+        cover_size = (target_width, max(int(src_h / ratio_w), target_height))
+    else:
+        cover_size = (max(int(src_w / ratio_h), target_width), target_height)
+    covered = input_image.resize(cover_size, Image.Resampling.LANCZOS)
+    return ImageOps.fit(covered, (target_width, target_height), Image.BICUBIC)
+
+def resize_pil_image(img, long_edge_size):
+    """Resize ``img`` so its longer edge is ``long_edge_size`` (each edge rounded)."""
+    longest = max(img.size)
+    # LANCZOS when shrinking, BICUBIC when enlarging or keeping the size
+    interp = PIL.Image.LANCZOS if longest > long_edge_size else PIL.Image.BICUBIC
+    scaled = (edge * long_edge_size / longest for edge in img.size)
+    new_size = tuple(int(round(value)) for value in scaled)
+    return img.resize(new_size, interp)
+
+def visualize_surfels(
+        surfels,
+        draw_normals=False,
+        normal_scale=20,
+        disk_resolution=16,
+        disk_alpha=0.5
+    ):
+        """
+        Visualize surfels as 2D disks oriented by their normals in 3D using matplotlib.
+
+        Args:
+            surfels (list of Surfel): Each Surfel has at least:
+                - position: (x, y, z), plain sequence or torch.Tensor
+                - normal: (nx, ny, nz), plain sequence or torch.Tensor
+                - radius: scalar (number or torch.Tensor)
+                - color: (R, G, B) in [0..255] (optional)
+            draw_normals (bool): If True, draws the surfel normals as quiver arrows.
+            normal_scale (float): Scale factor for the normal arrows.
+            disk_resolution (int): Number of segments to approximate each disk.
+            disk_alpha (float): Alpha (transparency) for the filled disks.
+        """
+
+        fig = plt.figure()
+        ax = fig.add_subplot(111, projection='3d')
+
+        # Prepare arrays for optional quiver (if draw_normals=True)
+        positions = []
+        normals = []
+
+        # We'll accumulate 3D polygons in a list for Poly3DCollection
+        polygons = []
+        polygon_colors = []
+
+        for s in surfels:
+            # --- Extract surfel data ---
+
+            position = s.position
+            normal = s.normal
+            radius = s.radius
+
+            # Each field is converted on its own: a surfel may mix tensors with
+            # plain Python/NumPy values (e.g. a float radius next to a tensor position).
+            if isinstance(position, torch.Tensor):
+                position = position.detach().cpu().numpy()
+            if isinstance(normal, torch.Tensor):
+                normal = normal.detach().cpu().numpy()
+            if isinstance(radius, torch.Tensor):
+                radius = radius.detach().cpu().numpy()
+            x, y, z = position
+            nx, ny, nz = normal
+            # Convert color from [0..255] to [0..1], or use default
+            if s.color is None:
+                color = (0.2, 0.6, 1.0)  # Light blue
+            else:
+                r, g, b = s.color
+                color = (r/255.0, g/255.0, b/255.0)
+
+            # --- Build local coordinate axes for the disk ---
+            normal = np.array([nx, ny, nz], dtype=float)
+            norm_len = np.linalg.norm(normal)
+            # Skip degenerate normals to avoid nan
+            if norm_len < 1e-12:
+                continue
+            normal /= norm_len
+
+            # Pick an 'up' vector that is not too close to the normal
+            # so we can build a tangent plane
+            up = np.array([0, 0, 1], dtype=float)
+            if abs(normal.dot(up)) > 0.9:
+                up = np.array([0, 1, 0], dtype=float)
+
+            # xAxis = normal x up
+            xAxis = np.cross(normal, up)
+            xAxis /= np.linalg.norm(xAxis)
+            # yAxis = normal x xAxis
+            yAxis = np.cross(normal, xAxis)
+            yAxis /= np.linalg.norm(yAxis)
+
+            # --- Create a circle of 'disk_resolution' segments in local 2D coords ---
+            angles = np.linspace(0, 2*np.pi, disk_resolution, endpoint=False)
+            circle_points_3d = []
+            for theta in angles:
+                # local 2D circle: (r*cosθ, r*sinθ)
+                px = radius * np.cos(theta)
+                py = radius * np.sin(theta)
+
+                # transform to 3D world space: position + px*xAxis + py*yAxis
+                world_pt = np.array([x, y, z]) + px * xAxis + py * yAxis
+                circle_points_3d.append(world_pt)
+
+            # We have a list of [x, y, z]. For a filled polygon, Poly3DCollection
+            # wants them as a single Nx3 array.
+            circle_points_3d = np.array(circle_points_3d)
+            polygons.append(circle_points_3d)
+            polygon_colors.append(color)
+
+            # Collect positions and normals for quiver (if used)
+            positions.append([x, y, z])
+            normals.append(normal)
+
+        # --- Draw the disks as polygons ---
+        poly_collection = Poly3DCollection(
+            polygons,
+            facecolors=polygon_colors,
+            edgecolors='k',  # black edge
+            linewidths=0.5,
+            alpha=disk_alpha
+        )
+        ax.add_collection3d(poly_collection)
+
+        # --- Optionally draw normal vectors (quiver) ---
+        if draw_normals and len(positions) > 0:
+            X = [p[0] for p in positions]
+            Y = [p[1] for p in positions]
+            Z = [p[2] for p in positions]
+
+            Nx = [n[0] for n in normals]
+            Ny = [n[1] for n in normals]
+            Nz = [n[2] for n in normals]
+
+            # Note: If your scene is large, you may want to increase `length`.
+            ax.quiver(
+                X, Y, Z,
+                Nx, Ny, Nz,
+                length=normal_scale,
+                color='red',
+                normalize=True
+            )
+
+        # --- Axis labels, aspect ratio, etc. ---
+        ax.set_xlabel('X')
+        ax.set_ylabel('Y')
+        ax.set_zlabel('Z')
+        try:
+            ax.set_box_aspect((1, 1, 1))
+        except AttributeError:
+            pass  # older MPL versions
+
+        plt.title("Surfels as Disks (Oriented by Normal)")
+        plt.show()
+    
+def visualize_pointcloud(
+        points,
+        colors=None,
+        title='Point Cloud',
+        point_size=1,
+        alpha=1.0
+    ):
+        """
+        Show a 3D point cloud with Matplotlib (``plt.show()``; nothing is returned), with an
+        option to provide per-point RGB or RGBA colors and equal scaling for the x, y, and z axes.
+
+        Parameters
+        ----------
+        points : np.ndarray or torch.Tensor
+            Array (or Tensor) of shape [N, 3], one 3D point (x, y, z) per row; higher-rank input is reshaped to [-1, 3].
+        colors : None, str, np.ndarray, or torch.Tensor
+            - If None, a default single color ('blue') is used.
+            - If a string, that color will be used for all points.
+            - If an array or Tensor, it should have shape [N, 3] or [N, 4], where each row
+            corresponds to the color of the matching point in `points`.
+            Values should be in the range [0, 1] if using floats.
+        title : str, optional
+            The title of the plot. Default is 'Point Cloud'.
+        point_size : float, optional
+            The size of the points in the scatter plot. Default is 1.
+        alpha : float, optional
+            The overall alpha (transparency) value for the points. Default is 1.0.
+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> # Generate random points
+        >>> pts = np.random.rand(1000, 3)
+        >>> # Generate random colors in [0,1]
+        >>> cols = np.random.rand(1000, 3)
+        >>> visualize_pointcloud(pts, colors=cols, title="Random Point Cloud with Colors")
+        """
+
+        # Convert Torch tensors to NumPy arrays if needed
+        if isinstance(points, torch.Tensor):
+            points = points.detach().cpu().numpy()
+        if isinstance(colors, torch.Tensor):
+            colors = colors.detach().cpu().numpy()
+
+        # Flatten points (and array colors) if they are in a higher-dimensional array
+        if len(points.shape) > 2:
+            points = points.reshape(-1, 3)
+        if colors is not None and isinstance(colors, np.ndarray) and len(colors.shape) > 2:
+            colors = colors.reshape(-1, colors.shape[-1])
+
+        # Validate shape of points
+        if points.shape[1] != 3:
+            raise ValueError("`points` array must have shape [N, 3].")
+
+        # Validate or set colors (a color string is passed to scatter unchecked)
+        if colors is None:
+            colors = 'blue'
+        elif isinstance(colors, np.ndarray):
+            colors = np.asarray(colors)
+            if colors.shape[0] != points.shape[0]:
+                raise ValueError(
+                    "Colors array length must match the number of points."
+                )
+            if colors.shape[1] not in [3, 4]:
+                raise ValueError(
+                    "Colors array must have shape [N, 3] or [N, 4]."
+                )
+
+        # Extract coordinates
+        x = points[:, 0]
+        y = points[:, 1]
+        z = points[:, 2]
+
+        # Create a 3D figure
+        fig = plt.figure(figsize=(8, 6))
+        ax = fig.add_subplot(111, projection='3d')
+
+        # Scatter plot with specified or per-point colors
+        ax.scatter(x, y, z, c=colors, s=point_size, alpha=alpha)
+
+        # Set labels and title
+        ax.set_xlabel('X')
+        ax.set_ylabel('Y')
+        ax.set_zlabel('Z')
+        ax.set_title(title)
+
+        # Same scale on all axes: half the largest extent around each axis midpoint
+        max_range = np.array([x.max() - x.min(), 
+                            y.max() - y.min(),
+                            z.max() - z.min()]).max() / 2.0
+        mid_x = (x.max() + x.min()) * 0.5
+        mid_y = (y.max() + y.min()) * 0.5
+        mid_z = (z.max() + z.min()) * 0.5
+
+        ax.set_xlim(mid_x - max_range, mid_x + max_range)
+        ax.set_ylim(mid_y - max_range, mid_y + max_range)
+        ax.set_zlim(mid_z - max_range, mid_z + max_range)
+
+        # Adjust viewing angle for better visibility
+        ax.view_init(elev=20., azim=30)
+
+        plt.tight_layout()
+        plt.show()
+
+def visualize_depth(depth_image,
+                    file_name="rendered_depth.png",
+                    visualization_dir="visualization",
+                    size=(512, 288)):
+    """
+    Save a depth map as an 8-bit grayscale image and return it.
+
+    Parameters
+    ----------
+    depth_image : np.ndarray
+        A 2D array of depth values.
+    file_name : str
+        Name of the image file written inside ``visualization_dir``.
+    visualization_dir : str
+        The directory to save the visualization image; created if missing.
+    size : tuple
+        Size passed to ``Image.resize`` (nearest-neighbour resampling).
+
+    Returns
+    -------
+    PIL.Image
+        The visualization image.
+    """
+    # Normalize the depth values for visualization
+    depth_min = depth_image.min()
+    depth_max = depth_image.max()
+    print(f"Depth min: {depth_min}, max: {depth_max}")
+    depth_image = np.clip(depth_image, 0, depth_max)
+    # A constant map has zero range: divide by 1 instead of producing NaNs
+    depth_range = (depth_max - depth_min) or 1.0
+    depth_vis = ((depth_image - depth_min) / depth_range * 255).astype(np.uint8)
+    depth_vis_img = Image.fromarray(depth_vis, mode='L').resize(size, Image.NEAREST)
+    os.makedirs(visualization_dir or ".", exist_ok=True)
+    depth_vis_img.save(os.path.join(visualization_dir, file_name))
+    return depth_vis_img
+
+class Surfel:
+    """Surface element holding a position, a normal, a radius and an optional colour."""
+
+    def __init__(self, position, normal, radius=1.0, color=None):
+        """
+        position: (x, y, z)
+        normal:   (nx, ny, nz)
+        radius:   scalar
+        color:    (r, g, b) or None
+        """
+        self.position, self.normal = position, normal
+        self.radius, self.color = radius, color
+
+    def __repr__(self):
+        fields = ("position", "normal", "radius", "color")
+        shown = ", ".join(f"{name}={getattr(self, name)}" for name in fields)
+        return f"Surfel({shown})"
+
+
+
+class Octree:
+    """Octree over a fixed point set, supporting radius (ball) queries.
+
+    Each point is stored in exactly one leaf, so a query reports an index at
+    most once. A node holding more than ``max_points`` points is split unless
+    all of its points coincide (they could never be separated).
+    """
+
+    def __init__(self, points, indices=None, bbox=None, max_points=10):
+        """
+        points:     (N, 3) array of coordinates shared by all nodes
+        indices:    rows of ``points`` owned by this node (default: all rows)
+        bbox:       (center, half_size) cube of this node, expected to enclose
+                    its points (default: cube around all of ``points``)
+        max_points: leaf capacity before a node is subdivided
+        """
+        self.points = points
+        if indices is None:
+            indices = np.arange(points.shape[0])
+        self.indices = np.asarray(indices)
+
+        if bbox is None:
+            min_bound = points.min(axis=0)
+            max_bound = points.max(axis=0)
+            center = (min_bound + max_bound) / 2
+            half_size = np.max(max_bound - min_bound) / 2
+            bbox = (center, half_size)
+        self.center, self.half_size = bbox
+
+        self.children = []  # child nodes; stays empty for a leaf
+        self.max_points = max_points
+
+        if len(self.indices) > self.max_points and self._is_splittable():
+            self.subdivide()
+
+    def _is_splittable(self):
+        """Return True unless all points of this node are identical."""
+        return bool(np.ptp(self.points[self.indices], axis=0).any())
+
+    def subdivide(self):
+        """Distribute this node's points over up to eight child octants."""
+        hs = self.half_size / 2
+        # Strict partition by half-space per axis (True = upper half), so a point
+        # on an octant boundary lands in exactly one child.
+        upper = self.points[self.indices] >= self.center
+        for octant in np.ndindex(2, 2, 2):
+            member = np.all(upper == np.array(octant, dtype=bool), axis=1)
+            if member.any():
+                child_center = self.center + np.where(octant, hs, -hs)
+                self.children.append(Octree(self.points, indices=self.indices[member],
+                                            bbox=(child_center, hs), max_points=self.max_points))
+        self.indices = None  # inner nodes keep no points of their own
+
+    def sphere_intersects_node(self, center, r):
+        """Return True if the ball (center, r) touches this node's cube."""
+        # Per-axis gap between the ball centre and the cube, zero when inside
+        gap = np.maximum(np.abs(center - self.center) - self.half_size, 0)
+        return np.sum(gap**2) <= r*r
+
+    def query_ball_point(self, point, r):
+        """Return a list with the indices of all points within ``r`` of ``point``."""
+        if not self.sphere_intersects_node(point, r):
+            return []
+        if self.children:
+            return [idx for child in self.children for idx in child.query_ball_point(point, r)]
+        if self.indices is None:
+            return []
+        dist = np.linalg.norm(self.points[self.indices] - point, axis=1)
+        return list(self.indices[dist <= r])
+        
diff --git a/utils/vis/__pycache__/surfel_viewer.cpython-310.pyc b/utils/vis/__pycache__/surfel_viewer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eb623ca5c185b3c6a60ae74d02517a69562b46c0
Binary files /dev/null and b/utils/vis/__pycache__/surfel_viewer.cpython-310.pyc differ
diff --git a/utils/vis/get_visualization_surfels.py b/utils/vis/get_visualization_surfels.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6a897a91247664b09a0f8ef3efd104135d6fcf
--- /dev/null
+++ b/utils/vis/get_visualization_surfels.py
@@ -0,0 +1,837 @@
+import sys
+import json
+import numpy as np
+from PIL import Image
+
+
+from torch.amp import autocast
+import torch
+import copy
+from torch.nn import functional as F
+
+import matplotlib.pyplot as plt
+from mpl_toolkits.mplot3d.art3d import Poly3DCollection
+
+sys.path.append("./extern/dust3r")
+from dust3r.inference import inference, load_model
+from dust3r.utils.image import load_images
+from dust3r.image_pairs import make_pairs
+from dust3r.cloud_opt import global_aligner, GlobalAlignerMode
+
+
+
+def visualize_surfels(
+        surfels,
+        draw_normals=False,
+        normal_scale=20,
+        disk_resolution=16,
+        disk_alpha=0.5
+    ):
+    """
+    Visualize surfels as flat disks oriented by their normals in a 3D matplotlib figure.
+
+    Args:
+        surfels (list of Surfel): Each surfel exposes
+            - position: (x, y, z), as a sequence, NumPy array or torch tensor
+            - normal: (nx, ny, nz)
+            - radius: scalar
+            - color: (R, G, B) in [0..255], or None for a default light blue
+        draw_normals (bool): If True, draws the surfel normals as quiver arrows.
+        normal_scale (float): Length passed to ``quiver`` for the normal arrows.
+        disk_resolution (int): Number of segments used to approximate each disk.
+        disk_alpha (float): Alpha (transparency) of the filled disks.
+
+    Surfels whose normal is (numerically) zero are skipped. The function
+    opens an interactive window via ``plt.show()`` and returns nothing.
+    """
+    fig = plt.figure()
+    ax = fig.add_subplot(111, projection='3d')
+
+    # Disk centres and unit normals, kept for the optional quiver plot.
+    centers = []
+    unit_normals = []
+
+    # One (disk_resolution, 3) vertex array and one colour per drawn disk.
+    polygons = []
+    polygon_colors = []
+
+    # The unit circle is identical for every disk, so sample it once.
+    angles = np.linspace(0, 2*np.pi, disk_resolution, endpoint=False)
+    cos_t = np.cos(angles)[:, None]
+    sin_t = np.sin(angles)[:, None]
+
+    for s in surfels:
+        # --- Extract surfel data ---
+        position, normal, radius = s.position, s.normal, s.radius
+        if isinstance(position, torch.Tensor):
+            position = position.detach().cpu().numpy()
+            normal = normal.detach().cpu().numpy()
+            radius = radius.detach().cpu().numpy()
+        center = np.array(position, dtype=float).reshape(3)
+        normal = np.array(normal, dtype=float).reshape(3)
+
+        # Convert color from [0..255] to [0..1], or use the default.
+        if s.color is None:
+            color = (0.2, 0.6, 1.0)  # Light blue
+        else:
+            red, green, blue = s.color
+            color = (red/255.0, green/255.0, blue/255.0)
+
+        # --- Build an orthonormal basis (x_axis, y_axis) of the disk plane ---
+        norm_len = np.linalg.norm(normal)
+        # Skip degenerate normals to avoid NaN.
+        if norm_len < 1e-12:
+            continue
+        normal /= norm_len
+
+        # Pick an 'up' vector that is not nearly parallel to the normal,
+        # otherwise the cross product below would be ill-conditioned.
+        up = np.array([0, 0, 1], dtype=float)
+        if abs(normal.dot(up)) > 0.9:
+            up = np.array([0, 1, 0], dtype=float)
+
+        x_axis = np.cross(normal, up)
+        x_axis /= np.linalg.norm(x_axis)
+        y_axis = np.cross(normal, x_axis)
+        y_axis /= np.linalg.norm(y_axis)
+
+        # --- Disk outline: center + r*cos(t)*x_axis + r*sin(t)*y_axis ---
+        ring = center + radius * (cos_t * x_axis + sin_t * y_axis)
+        polygons.append(ring)
+        polygon_colors.append(color)
+
+        centers.append(center)
+        unit_normals.append(normal)
+
+    if polygons:
+        # --- Draw the disks as polygons ---
+        poly_collection = Poly3DCollection(
+            polygons,
+            facecolors=polygon_colors,
+            edgecolors='k',  # black edge
+            linewidths=0.5,
+            alpha=disk_alpha
+        )
+        ax.add_collection3d(poly_collection)
+
+        # Adding a collection does not reliably extend the view limits, so
+        # fit a cube around all disk vertices explicitly (equal scale on
+        # every axis). Non-finite data leaves the limits untouched.
+        vertices = np.concatenate(polygons, axis=0)
+        lower = vertices.min(axis=0)
+        upper = vertices.max(axis=0)
+        middle = (lower + upper) / 2.0
+        half_extent = (upper - lower).max() / 2.0
+        if np.all(np.isfinite(middle)) and np.isfinite(half_extent):
+            if half_extent == 0:
+                half_extent = 0.5
+            ax.set_xlim(middle[0] - half_extent, middle[0] + half_extent)
+            ax.set_ylim(middle[1] - half_extent, middle[1] + half_extent)
+            ax.set_zlim(middle[2] - half_extent, middle[2] + half_extent)
+
+    # --- Optionally draw normal vectors (quiver) ---
+    if draw_normals and len(centers) > 0:
+        center_arr = np.array(centers)
+        normal_arr = np.array(unit_normals)
+        # Note: If your scene is large, you may want to increase `length`.
+        ax.quiver(
+            center_arr[:, 0], center_arr[:, 1], center_arr[:, 2],
+            normal_arr[:, 0], normal_arr[:, 1], normal_arr[:, 2],
+            length=normal_scale,
+            color='red',
+            normalize=True
+        )
+
+    # --- Axis labels, aspect ratio, etc. ---
+    ax.set_xlabel('X')
+    ax.set_ylabel('Y')
+    ax.set_zlabel('Z')
+    try:
+        ax.set_box_aspect((1, 1, 1))
+    except AttributeError:
+        pass  # older MPL versions
+
+    plt.title("Surfels as Disks (Oriented by Normal)")
+    plt.show()
+    
+
+
+def visualize_pointcloud(
+        points,
+        colors=None,
+        title='Point Cloud',
+        point_size=1,
+        alpha=1.0,
+        bg_color=(240/255, 223/255, 223/255)  # light pink background by default
+    ):
+    """
+    Show a 3D point cloud with optional per-point RGB/RGBA colors, using the
+    same scale on the x, y and z axes.
+
+    Parameters
+    ----------
+    points : np.ndarray or torch.Tensor
+        Points of shape [N, 3]; inputs with more dimensions are flattened to
+        [-1, 3].
+    colors : None, str, np.ndarray or torch.Tensor
+        - None: every point is drawn in 'blue'.
+        - str: a single matplotlib color used for every point.
+        - array: shape [N, 3] or [N, 4] (higher-rank arrays are flattened on
+          all but the last axis); passed to ``scatter`` unchanged.
+    title : str, optional
+        Figure title, 'Point Cloud' by default.
+    point_size : float, optional
+        Marker size, 1 by default.
+    alpha : float, optional
+        Overall opacity of the points, 1.0 by default.
+    bg_color : tuple or list, optional
+        Background color (r, g, b) with components in [0, 1]. Defaults to
+        (240/255, 223/255, 223/255).
+
+    Raises
+    ------
+    ValueError
+        If `points` is not a non-empty [N, 3] array, if a color array does
+        not match the points, or if `bg_color` is not a 3-element tuple/list.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> pts = np.random.rand(1000, 3)
+    >>> cols = np.random.rand(1000, 3)
+    >>> visualize_pointcloud(pts, colors=cols, title="random cloud", bg_color=(0.2, 0.2, 0.3))
+    """
+    # Torch tensors are moved to host memory and converted to NumPy.
+    if isinstance(points, torch.Tensor):
+        points = points.detach().cpu().numpy()
+    if isinstance(colors, torch.Tensor):
+        colors = colors.detach().cpu().numpy()
+    points = np.asarray(points)
+
+    # Flatten image-like inputs ([H, W, 3], [B, H, W, 3], ...) to a point list.
+    if points.ndim > 2:
+        points = points.reshape(-1, 3)
+    if isinstance(colors, np.ndarray) and colors.ndim > 2:
+        colors = colors.reshape(-1, colors.shape[-1])
+
+    # Validate the point array before any figure is created.
+    if points.ndim != 2 or points.shape[1] != 3:
+        raise ValueError("`points` array must have shape [N, 3].")
+    if points.shape[0] == 0:
+        raise ValueError("`points` array must contain at least one point.")
+
+    # Resolve the color argument.
+    if colors is None:
+        colors = 'blue'
+    elif isinstance(colors, np.ndarray):
+        if colors.ndim == 0 or colors.shape[0] != points.shape[0]:
+            raise ValueError("Colors array length must match the number of points.")
+        # Checking ndim first turns a 1-D array into a ValueError instead of
+        # an IndexError on shape[1].
+        if colors.ndim != 2 or colors.shape[1] not in [3, 4]:
+            raise ValueError("Colors array must have shape [N, 3] or [N, 4].")
+
+    # Validate the background color.
+    if not isinstance(bg_color, (tuple, list)) or len(bg_color) != 3:
+        raise ValueError("Background color must be a tuple of (r, g, b) with values between 0 and 1.")
+    bg_color = tuple(bg_color)
+
+    x = points[:, 0]
+    y = points[:, 1]
+    z = points[:, 2]
+
+    # Create the figure with the requested background color.
+    fig = plt.figure(figsize=(8, 6), facecolor=bg_color)
+    ax = fig.add_subplot(111, projection='3d')
+    ax.set_facecolor(bg_color)
+
+    ax.scatter(x, y, z, c=colors, s=point_size, alpha=alpha)
+
+    # Equal scaling: use a cube centred on the data whose edge is the
+    # largest per-axis extent.
+    max_range = np.array([x.max() - x.min(),
+                          y.max() - y.min(),
+                          z.max() - z.min()]).max() / 2.0
+    if max_range == 0:
+        # All points coincide; fall back to a unit cube around them.
+        max_range = 0.5
+    mid_x = (x.max() + x.min()) * 0.5
+    mid_y = (y.max() + y.min()) * 0.5
+    mid_z = (z.max() + z.min()) * 0.5
+    ax.set_xlim(mid_x - max_range, mid_x + max_range)
+    ax.set_ylim(mid_y - max_range, mid_y + max_range)
+    ax.set_zlim(mid_z - max_range, mid_z + max_range)
+
+    # Hide ticks, labels and the grid.
+    ax.set_xticks([])
+    ax.set_yticks([])
+    ax.set_zticks([])
+    ax.set_xlabel('')
+    ax.set_ylabel('')
+    ax.set_zlabel('')
+    ax.grid(False)
+
+    # Hide the three axis panes so only the points remain visible.
+    ax.xaxis.pane.set_visible(False)
+    ax.yaxis.pane.set_visible(False)
+    ax.zaxis.pane.set_visible(False)
+
+    ax.set_title(title)
+
+    plt.tight_layout()
+    plt.show()
+
+# def visualize_pointcloud(
+#         points,
+#         colors=None,
+#         title='Point Cloud',
+#         point_size=1,
+#         alpha=1.0
+#     ):
+#     """
+#     可视化3D点云，同时支持每个点的RGB或RGBA颜色，并保证x, y, z三个轴等比例缩放。
+    
+#     参数
+#     ----------
+#     points : np.ndarray 或 torch.Tensor
+#         形状为 [N, 3] 的数组或张量，每行表示一个3D点 (x, y, z)。
+#     colors : None, str, 或 np.ndarray
+#         - 如果为 None，则使用默认颜色 'blue'。
+#         - 如果为字符串，则所有点均使用该颜色。
+#         - 如果为数组，则形状应为 [N, 3] 或 [N, 4]，表示每个点的颜色，值的范围应为 [0, 1]（若为浮点数）。
+#     title : str, 可选
+#         图像标题，默认 'Point Cloud'。
+#     point_size : float, 可选
+#         点的大小，默认 1。
+#     alpha : float, 可选
+#         点的整体透明度，默认 1.0。
+
+#     示例
+#     --------
+#     >>> import numpy as np
+#     >>> pts = np.random.rand(1000, 3)
+#     >>> cols = np.random.rand(1000, 3)
+#     >>> visualize_pointcloud(pts, colors=cols, title="随机点云")
+#     """
+    
+#     # 如果是 Torch 张量，则转换为 NumPy 数组
+#     if isinstance(points, torch.Tensor):
+#         points = points.detach().cpu().numpy()
+#     if isinstance(colors, torch.Tensor):
+#         colors = colors.detach().cpu().numpy()
+
+#     # 如果点云或颜色数据维度过高，则展平
+#     if len(points.shape) > 2:
+#         points = points.reshape(-1, 3)
+#     if colors is not None and isinstance(colors, np.ndarray) and len(colors.shape) > 2:
+#         colors = colors.reshape(-1, colors.shape[-1])
+        
+#     # 验证点云形状
+#     if points.shape[1] != 3:
+#         raise ValueError("`points` array must have shape [N, 3].")
+        
+#     # 处理颜色参数
+#     if colors is None:
+#         colors = 'blue'
+#     elif isinstance(colors, np.ndarray):
+#         colors = np.asarray(colors)
+#         if colors.shape[0] != points.shape[0]:
+#             raise ValueError("Colors array length must match the number of points.")
+#         if colors.shape[1] not in [3, 4]:
+#             raise ValueError("Colors array must have shape [N, 3] or [N, 4].")
+    
+#     # 提取坐标
+#     x = points[:, 0]
+#     y = points[:, 1]
+#     z = points[:, 2]
+    
+#     # 创建图像，并设置背景为白色
+#     fig = plt.figure(figsize=(8, 6), facecolor='white')
+#     ax = fig.add_subplot(111, projection='3d')
+#     ax.set_facecolor('white')
+    
+#     # 绘制散点图
+#     ax.scatter(x, y, z, c=colors, s=point_size, alpha=alpha)
+    
+#     # 设置等比例缩放
+#     max_range = np.array([x.max() - x.min(), 
+#                           y.max() - y.min(),
+#                           z.max() - z.min()]).max() / 2.0
+#     mid_x = (x.max() + x.min()) * 0.5
+#     mid_y = (y.max() + y.min()) * 0.5
+#     mid_z = (z.max() + z.min()) * 0.5
+#     ax.set_xlim(mid_x - max_range, mid_x + max_range)
+#     ax.set_ylim(mid_y - max_range, mid_y + max_range)
+#     ax.set_zlim(mid_z - max_range, mid_z + max_range)
+    
+#     # 隐藏刻度和标签
+#     ax.set_xticks([])
+#     ax.set_yticks([])
+#     ax.set_zticks([])
+#     ax.set_xlabel('')
+#     ax.set_ylabel('')
+#     ax.set_zlabel('')
+#     ax.grid(False)
+    
+#     # 隐藏3D坐标轴的面板（pane）来去除轴的显示
+#     ax.xaxis.pane.set_visible(False)
+#     ax.yaxis.pane.set_visible(False)
+#     ax.zaxis.pane.set_visible(False)
+    
+#     # 设置标题（如果需要显示标题）
+#     ax.set_title(title)
+    
+#     plt.tight_layout()
+#     plt.show()
+class Surfel:
+    """Container for one surfel: position, normal, radius and optional color."""
+
+    def __init__(self, position, normal, radius=1.0, color=None):
+        """
+        Store the surfel attributes unchanged.
+
+        position: (x, y, z)
+        normal:   (nx, ny, nz)
+        radius:   scalar
+        color:    (r, g, b) or None
+        """
+        self.position, self.normal = position, normal
+        self.radius, self.color = radius, color
+
+    def __repr__(self):
+        # Render every attribute as "name=value" in declaration order.
+        shown = ", ".join(
+            f"{field}={getattr(self, field)}"
+            for field in ("position", "normal", "radius", "color")
+        )
+        return f"Surfel({shown})"
+
+
+
+
+class Octree:
+    """Octree over a fixed (N, 3) point array, supporting radius queries.
+
+    Leaf nodes keep ``indices`` (indices into ``points``); internal nodes set
+    ``indices`` to None and keep up to eight ``children``.
+    """
+
+    def __init__(self, points, indices=None, bbox=None, max_points=10,
+                 max_depth=32, _depth=0):
+        """
+        Build the (sub)tree.
+
+          - points: NumPy array of all points, shape (N, 3)
+          - indices: indices of the points that belong to this node
+            (all points when None)
+          - bbox: node bounding cube as (center, half_size), where half_size
+            is half the cube edge; computed from ``points`` when None
+          - max_points: a node holding more points than this is subdivided
+          - max_depth: subdivision stops at this depth even if a node still
+            holds more than ``max_points`` points; this bounds the recursion
+            when many points coincide
+          - _depth: depth of this node; internal, set by ``subdivide``
+        """
+        self.points = points
+        if indices is None:
+            indices = np.arange(points.shape[0])
+        self.indices = np.asarray(indices, dtype=np.intp)
+
+        # Without an explicit box, use the smallest centred cube that
+        # contains every point.
+        if bbox is None:
+            if points.shape[0] == 0:
+                bbox = (np.zeros(points.shape[1:]), 0.0)
+            else:
+                min_bound = points.min(axis=0)
+                max_bound = points.max(axis=0)
+                center = (min_bound + max_bound) / 2
+                half_size = np.max(max_bound - min_bound) / 2
+                bbox = (center, half_size)
+        self.center, self.half_size = bbox
+
+        self.children = []  # child nodes (empty for a leaf)
+        self.max_points = max_points
+        self.max_depth = max_depth
+        self._depth = _depth
+
+        if len(self.indices) > self.max_points and self._depth < self.max_depth:
+            self.subdivide()
+
+    def subdivide(self):
+        """Split this node into up to eight children, one per non-empty octant."""
+        if self.indices is None:
+            return  # already an internal node
+        hs = self.half_size / 2
+        node_points = self.points[self.indices]
+        # Each point goes to exactly one octant, chosen by which side of the
+        # centre it lies on along every axis. A containment test against the
+        # child cubes would assign boundary points to several children and
+        # could lose points to rounding.
+        upper = node_points >= self.center
+        for x_up in (False, True):
+            for y_up in (False, True):
+                for z_up in (False, True):
+                    in_octant = ((upper[:, 0] == x_up)
+                                 & (upper[:, 1] == y_up)
+                                 & (upper[:, 2] == z_up))
+                    if not in_octant.any():
+                        continue
+                    signs = np.where([x_up, y_up, z_up], 1.0, -1.0)
+                    child = Octree(self.points,
+                                   indices=self.indices[in_octant],
+                                   bbox=(self.center + signs * hs, hs),
+                                   max_points=self.max_points,
+                                   max_depth=self.max_depth,
+                                   _depth=self._depth + 1)
+                    self.children.append(child)
+        # Internal nodes no longer store point indices themselves.
+        self.indices = None
+
+    def sphere_intersects_node(self, center, r):
+        """
+        Return whether the sphere (center, r) overlaps this node's cube.
+
+        The distance from the sphere centre to the cube only counts the part
+        of each coordinate that lies beyond the cube faces; the sphere
+        overlaps when that distance is at most r.
+        """
+        diff = np.abs(center - self.center)
+        max_diff = diff - self.half_size
+        max_diff = np.maximum(max_diff, 0)
+        dist_sq = np.sum(max_diff**2)
+        return dist_sq <= r*r
+
+    def query_ball_point(self, point, r):
+        """
+        Return the indices (a list of ints, each at most once) of all points
+        whose distance to ``point`` is at most ``r``.
+        """
+        if not self.sphere_intersects_node(point, r):
+            return []
+        if self.children:
+            results = []
+            for child in self.children:
+                results.extend(child.query_ball_point(point, r))
+            return results
+        # Leaf node: test all stored points in one vectorised pass.
+        if self.indices is None or len(self.indices) == 0:
+            return []
+        distances = np.linalg.norm(self.points[self.indices] - point, axis=1)
+        return self.indices[distances <= r].tolist()
+        
+
+def estimate_normal_from_pointmap(pointmap: torch.Tensor) -> torch.Tensor:
+    """
+    Estimate surface normals from a 3D point map by crossing the normalised
+    differences to the right-hand and lower neighbour of every pixel.
+
+    Parameters
+    ----------
+    pointmap : torch.Tensor
+        Tensor of shape [H, W, 3] holding one 3D point (X, Y, Z) per pixel.
+        It may live on CPU or GPU.
+
+    Returns
+    -------
+    torch.Tensor
+        Tensor of shape [H, W, 3] with the same device and dtype as the
+        input. Each computable normal is a unit vector. Pixels for which no
+        normal can be computed keep the zero vector: the last row and column
+        (no neighbour), pixels where a neighbour coincides with the centre
+        point, pixels where the two edges are (nearly) parallel, and pixels
+        with non-finite input.
+    """
+    h, w = pointmap.shape[:2]
+    normal_map = torch.zeros((h, w, 3), device=pointmap.device, dtype=pointmap.dtype)
+    if h < 2 or w < 2:
+        return normal_map
+
+    def _unit(vectors):
+        # Normalise along the last axis; zero-length vectors are divided by 1
+        # (not 0) and reported through the returned validity mask.
+        lengths = torch.linalg.norm(vectors, dim=-1, keepdim=True)
+        nonzero = lengths > 0
+        safe = torch.where(nonzero, lengths, torch.ones_like(lengths))
+        return vectors / safe, nonzero
+
+    # Edge vectors for every pixel that has both a right and a lower neighbour.
+    origin = pointmap[:-1, :-1]
+    unit_right, right_ok = _unit(pointmap[:-1, 1:] - origin)
+    unit_down, down_ok = _unit(pointmap[1:, :-1] - origin)
+
+    cross = torch.cross(unit_right, unit_down, dim=-1)
+    cross_len = torch.linalg.norm(cross, dim=-1, keepdim=True)
+
+    # `>=` is False for NaN, so non-finite results are rejected as well.
+    valid = right_ok & down_ok & (cross_len >= 1e-8)
+    safe_len = torch.where(valid, cross_len, torch.ones_like(cross_len))
+    normal_map[:-1, :-1] = torch.where(valid, cross / safe_len, torch.zeros_like(cross))
+
+    return normal_map
+
+
+def load_multiple_images(image_names, image_size=512, dtype=torch.float32):
+    """Load several images through ``load_images`` and return them with a reference view of the first.
+
+    Returns:
+        (images, img_ori): the list returned by ``load_images`` and the first
+        entry's ``'img_ori'`` with its leading axis squeezed, permuted by
+        (1, 2, 0) and mapped through ``(x + 1) / 2``.
+    """
+    loaded = load_images(image_names, size=image_size, force_1024=True, dtype=dtype)
+    # assumes 'img_ori' is [1, 3, H, W] in [-1, 1] - TODO confirm against load_images
+    first_image = loaded[0]['img_ori'].squeeze(0)
+    reference = (first_image.permute(1, 2, 0) + 1.) / 2.  # Just for reference
+    return loaded, reference
+
+
+def load_initial_images(image_name):
+    """Load one image and duplicate it so that two entries are returned.
+
+    Returns:
+        (images, img_ori): when ``load_images`` yields a single entry, the
+        list holds that entry plus a deep copy whose ``'idx'`` is set to 1;
+        ``img_ori`` is the first entry's ``'img_ori'`` squeezed, permuted by
+        (1, 2, 0) and mapped through ``(x + 1) / 2``.
+    """
+    loaded = load_images([image_name], size=512, force_1024=True)
+    # per the original note the result is [H, W, 3] in [0, 1] - TODO confirm
+    first_image = loaded[0]['img_ori'].squeeze(0)
+    reference = (first_image.permute(1, 2, 0) + 1.) / 2.
+    if len(loaded) != 1:
+        return loaded, reference
+    # Pair the image with an independent copy of itself, tagged as index 1.
+    twin = copy.deepcopy(loaded[0])
+    twin['idx'] = 1
+    return [loaded[0], twin], reference
+
+
+def merge_surfels(
+    new_surfels: list,
+    current_timestamp: str,
+    existing_surfels: list,
+    existing_surfel_to_timestamp: dict,
+    position_threshold: float = 0.025,
+    normal_threshold: float = 0.7,
+    max_points_per_node: int = 10  # unused; kept for interface compatibility
+):
+    """
+    Merge new surfels into an existing surfel set.
+
+    Every new surfel is compared against all existing surfels (exhaustive
+    search, vectorised with NumPy; no spatial index is used). It is merged
+    into the first existing surfel, in index order, that is closer than
+    `position_threshold` and whose normal has a dot product greater than
+    `normal_threshold` with the new normal. Merging only appends
+    `current_timestamp` to that surfel's timestamp list.
+
+    Args:
+        new_surfels (list[Surfel]): Surfels to merge.
+        current_timestamp (str): Timestamp recorded for merged surfels.
+        existing_surfels (list[Surfel]): Surfels already in the set.
+        existing_surfel_to_timestamp (dict): Maps each existing surfel index
+            to its list of timestamps; updated in place.
+        position_threshold (float): Maximum (exclusive) distance for a match.
+        normal_threshold (float): Minimum (exclusive) normal dot product for
+            a match.
+        max_points_per_node (int): Ignored by the current implementation.
+
+    Returns:
+        (list[Surfel], dict):
+            - The new surfels that matched nothing and still have to be
+              appended to the existing set.
+            - The updated `existing_surfel_to_timestamp` mapping (the same
+              object that was passed in).
+    """
+    # Sanity check
+    assert len(existing_surfels) == len(existing_surfel_to_timestamp), (
+        "existing_surfels 和 existing_surfel_to_timestamp 长度不匹配。"
+    )
+
+    # Positions and normals of the existing surfels, shape (N, 3). The
+    # reshape keeps the arrays two-dimensional when N == 0.
+    positions = np.array([s.position for s in existing_surfels], dtype=float).reshape(-1, 3)
+    normals = np.array([s.normal for s in existing_surfels], dtype=float).reshape(-1, 3)
+
+    # New surfels that did not match any existing surfel.
+    filtered_surfels = []
+
+    merge_count = 0
+    for new_surfel in new_surfels:
+        new_position = np.asarray(new_surfel.position, dtype=float).reshape(3)
+        new_normal = np.asarray(new_surfel.normal, dtype=float).reshape(3)
+
+        close = np.linalg.norm(positions - new_position, axis=1) < position_threshold
+        aligned = normals @ new_normal > normal_threshold
+        matches = np.flatnonzero(close & aligned)
+
+        if matches.size:
+            # Lowest matching index, as a linear scan with `break` would find.
+            existing_surfel_to_timestamp[int(matches[0])].append(current_timestamp)
+            merge_count += 1
+        else:
+            filtered_surfels.append(new_surfel)
+
+    print(f"merge_count: {merge_count}")
+    return filtered_surfels, existing_surfel_to_timestamp
+
+def pointmap_to_surfels(pointmap: torch.Tensor,
+                        focal_lengths: torch.Tensor,
+                        depth_map: torch.Tensor,
+                        poses: torch.Tensor, # shape: (4, 4)
+                        radius_scale: float = 0.5,
+                        depth_threshold: float = 1.0,
+                        estimate_normals: bool = True):
+    """
+    Convert a point map into a list of surfels, one per pixel.
+
+    Args:
+        pointmap: [H, W, 3] tensor of 3D points (global coordinates per the
+            original comments).
+        focal_lengths: Focal length tensor; a tensor of length 2 is averaged
+            over its first axis.
+        depth_map: [H, W] tensor of per-pixel depths.
+        poses: (4, 4) camera pose; its translation column ``poses[0:3, 3]``
+            is used as the camera centre.
+        radius_scale: Factor applied to the surfel radius.
+        depth_threshold: Pixels whose depth exceeds this value are skipped.
+        estimate_normals: If True, normals come from
+            ``estimate_normal_from_pointmap``; otherwise they are all zero.
+
+    Returns:
+        list[Surfel]: One surfel per kept pixel; position, normal and radius
+        are NumPy values, the color is left unset. The last row and column
+        of the map are never visited.
+    """
+    surfels = []
+    # A 0-d tensor has no len(); only average a genuine pair of values.
+    if focal_lengths.dim() > 0 and len(focal_lengths) == 2:
+        focal_lengths = torch.mean(focal_lengths, dim=0)
+    H, W = pointmap.shape[:2]
+    # 1) Estimate normals
+    if estimate_normals:
+        normal_map = estimate_normal_from_pointmap(pointmap)
+    else:
+        normal_map = torch.zeros_like(pointmap)
+
+    # 2) Move everything to host memory once instead of once per pixel.
+    positions_np = pointmap.detach().cpu().numpy()
+    normals_np = normal_map.detach().cpu().numpy()
+    depths_np = depth_map.detach().cpu().numpy()
+    camera_center = poses[0:3, 3].detach().cpu().numpy()
+    focal_host = focal_lengths.detach().cpu()
+
+    depth_remove_count = 0
+    for v in range(H-1):
+        for u in range(W-1):
+            if depths_np[v, u] > depth_threshold:
+                depth_remove_count += 1
+                continue
+            # Copies keep each surfel independent of the shared maps.
+            position = positions_np[v, u].copy()
+            normal = normals_np[v, u].copy()
+            depth = float(depths_np[v, u])
+
+            view_direction = position - camera_center
+            view_direction = view_direction / np.linalg.norm(view_direction)
+            # Orient the normal so its dot product with the viewing ray is
+            # non-negative.
+            if np.dot(view_direction, normal) < 0:
+                normal = -normal
+            # Ranges from 0.2 (normal perpendicular to the ray) to 1.0
+            # (parallel); dividing by it enlarges obliquely seen surfels.
+            adjustment_value = 0.2 + 0.8 * float(np.abs(np.dot(view_direction, normal)))
+            radius = (radius_scale * depth / focal_host / adjustment_value).numpy()
+            surfels.append(Surfel(position, normal, radius))
+    print(f"depth_remove_count: {depth_remove_count}")
+    return surfels
+
+
+
+def run_dust3r(input_images,
+               dust3r,
+               batch_size = 1,
+               niter = 1000,
+               lr = 0.01,
+               schedule = 'linear',
+               clean_pc = False,
+               focal_lengths = None,
+               poses = None,
+               device = 'cuda',
+               background_mask = None,
+               use_amp = False
+               ):
+    """Run pairwise DUSt3R inference followed by global alignment.
+
+    Args:
+        input_images: Images handed to ``make_pairs``.
+        dust3r: Model passed to ``inference``.
+        batch_size: Batch size for ``inference``.
+        niter, lr, schedule: Forwarded to ``scene.compute_global_alignment``.
+        clean_pc: If True, ``scene.clean_pointcloud()`` replaces the scene.
+        focal_lengths: If given, applied through ``scene.preset_focal``.
+        poses: If given, applied through ``scene.preset_pose``.
+        device: Device passed to ``inference`` and ``global_aligner``.
+        background_mask: Not used by this function.
+        use_amp: Enables the float16 CUDA autocast context that wraps both
+            inference and alignment.
+
+    Returns:
+        (scene, loss): the aligned scene and the value returned by
+        ``compute_global_alignment``.
+    """
+    aligner_mode = GlobalAlignerMode.PointCloudDifferentFocalOptimizer
+
+    # Everything below runs under autocast, so forward passes and any
+    # backward passes inside the aligner use mixed precision when enabled.
+    with autocast(device_type='cuda', dtype=torch.float16, enabled=use_amp):
+        image_pairs = make_pairs(input_images, scene_graph='complete', prefilter=None, symmetrize=True)
+        predictions = inference(image_pairs, dust3r, device, batch_size=batch_size)
+
+        scene = global_aligner(predictions, device=device, mode=aligner_mode)
+        if focal_lengths is not None:
+            scene.preset_focal(focal_lengths)
+        if poses is not None:
+            scene.preset_pose(poses)
+
+        # No gradient scaling is applied here; if the optimiser needs it, a
+        # GradScaler would have to be wired in manually.
+        loss = None
+        if aligner_mode == GlobalAlignerMode.PointCloudDifferentFocalOptimizer:
+            loss = scene.compute_global_alignment(init='mst', niter=niter, schedule=schedule, lr=lr)
+
+        # Optional clean-up of the point cloud after alignment.
+        if clean_pc:
+            scene = scene.clean_pointcloud()
+
+    return scene, loss
+
+
+if __name__ == "__main__":
+    # Script: reconstruct the selected frames with DUSt3R, turn the LAST
+    # frame into surfels, visualise them and save them to disk.
+    load_image_size = 512
+    load_dtype = torch.float16
+    device = 'cuda'
+    model_path = "checkpoints/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth"
+    selected_frame_paths = ["assets/jesus/jesus_0.jpg",
+                            "assets/jesus/jesus_1.jpg",
+                            "assets/jesus/jesus_2.jpg"
+                            ]
+
+    # pil_image = Image.open("./assets/radcliffe_camera_bg.png").resize((512, 288))
+    # r, g, b, a = pil_image.split()
+    # background_mask = a
+    # background_mask = (1 - torch.tensor(np.array(background_mask))).unsqueeze(0).repeat(2, 1, 1).bool()
+
+    all_surfels = []
+    surfel_to_timestamp = {}
+
+    dust3r = load_model(model_path, device=device)
+    dust3r.eval()
+    dust3r = dust3r.to(device)
+    dust3r = dust3r.half()
+    # A single frame is duplicated before it is handed to DUSt3R.
+    if len(selected_frame_paths) == 1:
+        selected_frame_paths = selected_frame_paths * 2
+    frame_images, frame_img_ori = load_multiple_images(selected_frame_paths,
+                                                       image_size=load_image_size,
+                                                       dtype=load_dtype)
+
+    scene, loss = run_dust3r(frame_images, dust3r, device=device, use_amp=True)
+
+    # --- 1) Extract outputs ---
+    # Every per-frame quantity below is sliced with [-1:] so that index 0
+    # always refers to the last input frame.
+    shrink_factor = 0.15
+    # pointcloud shape: [N, H, W, 3]
+    pointcloud = torch.stack(scene.get_pts3d())
+    # poses shape: [N, 4, 4]
+    # optimized_poses = scene.get_im_poses()
+    # Keep only the last frame's focal length so that it matches the sliced
+    # point cloud, depth map and pose; indexing the unsliced tensor with the
+    # loop index would pick the first frame's focal length instead.
+    focal_lengths = scene.get_focals()[-1:]
+
+    # adjustion_transformation_matrix = SpatialConstructor.estimate_pose_alignment(optimized_poses, original_camera_poses) # optimized_poses -> original_camera_poses matrix
+    # adjusted_optimized_poses = adjustion_transformation_matrix @ optimized_poses
+
+    # --- 2) Resize pointcloud ---
+    # Permute for resizing -> [N, 3, H, W]
+    pointcloud = pointcloud.permute(0, 3, 1, 2)
+
+    # Resize using bilinear interpolation
+    pointcloud = F.interpolate(
+        pointcloud,
+        scale_factor=shrink_factor,
+        mode='bilinear'
+    )
+    # Permute back -> [N, H', W', 3] and keep the last frame only
+    pointcloud = pointcloud.permute(0, 2, 3, 1)[-1:]
+    # transform pointcloud
+    # pointcloud = torch.stack([SpatialConstructor.transform_pointmap(pointcloud[i], adjustion_transformation_matrix) for i in range(pointcloud.shape[0])])
+
+    rgbs = scene.imgs
+    rgbs = torch.tensor(np.array(rgbs))
+    rgbs = rgbs.permute(0, 3, 1, 2)
+    rgbs = F.interpolate(rgbs, scale_factor=shrink_factor, mode='bilinear')
+    rgbs = rgbs.permute(0, 2, 3, 1)[-1:]
+    visualize_pointcloud(pointcloud, rgbs, point_size=4)
+
+    # --- 3) Resize depth map ---
+    # depth_map shape: [N, H, W]
+    depth_map = torch.stack(scene.get_depthmaps())
+
+    # Add channel dimension -> [N, 1, H, W]
+    depth_map = depth_map.unsqueeze(1)
+
+    depth_map = F.interpolate(
+        depth_map,
+        scale_factor=shrink_factor,
+        mode='bilinear'
+    )
+
+    poses = scene.get_im_poses()[-1:]
+
+    # Remove channel dimension -> [N, H', W'] and keep the last frame only
+    depth_map = depth_map.squeeze(1)[-1:]
+
+    for frame_idx in range(len(pointcloud)):
+        # Create surfels for the current frame. The focal length is scaled
+        # by the same factor as the resized maps.
+        surfels = pointmap_to_surfels(
+            pointmap=pointcloud[frame_idx],
+            focal_lengths=focal_lengths[frame_idx] * shrink_factor,
+            depth_map=depth_map[frame_idx],
+            poses=poses[frame_idx],
+            estimate_normals=True,
+            radius_scale=0.5,
+            depth_threshold=0.48
+        )
+
+        # Merge with existing surfels if not the first frame
+        if frame_idx > 0:
+            surfels, surfel_to_timestamp = merge_surfels(
+                new_surfels=surfels,
+                current_timestamp=frame_idx,
+                existing_surfels=all_surfels,
+                existing_surfel_to_timestamp=surfel_to_timestamp,
+                position_threshold=0.01,
+                normal_threshold=0.7
+            )
+
+        # Update timestamp mapping
+        num_surfels = len(surfels)
+        surfel_start_index = len(all_surfels)
+        for surfel_index in range(num_surfels):
+            # NOTE(review): the timestamp is hard-coded to 2, presumably the
+            # index of the last of the three input frames - confirm before
+            # changing the frame list.
+            # surfel_to_timestamp[surfel_start_index + surfel_index] = [frame_idx]
+            surfel_to_timestamp[surfel_start_index + surfel_index] = [2]
+        all_surfels.extend(surfels)
+
+    positions = np.array([s.position for s in all_surfels], dtype=np.float32)
+    normals   = np.array([s.normal   for s in all_surfels], dtype=np.float32)
+    radii     = np.array([s.radius   for s in all_surfels], dtype=np.float32)
+    colors    = np.array([s.color    for s in all_surfels], dtype=np.float32)
+
+    visualize_surfels(all_surfels)
+
+    # np.savez(f"./surfels_added_first2.npz",
+    #         positions=positions,
+    #         normals=normals,
+    #         radii=radii,
+    #         colors=colors)
+    # with open("surfel_to_timestamp_first2.json", "w") as f:
+    #     json.dump(surfel_to_timestamp, f)
+
+    np.savez("./surfels_added_only3.npz",
+            positions=positions,
+            normals=normals,
+            radii=radii,
+            colors=colors)
+    with open("surfel_to_timestamp_only3.json", "w") as f:
+        json.dump(surfel_to_timestamp, f)
\ No newline at end of file