Spaces: Running on Zero

zixinz committed
Commit 5a0778e · 1 Parent(s): f1483c5

Add application file

This view is limited to 50 files because the commit contains too many changes.
- app.py +56 -0
- code_depth/LICENSE +201 -0
- code_depth/README.md +120 -0
- code_depth/app.py +152 -0
- code_depth/assets/example_videos/Tokyo-Walk_rgb.mp4 +3 -0
- code_depth/assets/example_videos/davis_rollercoaster.mp4 +3 -0
- code_depth/assets/teaser_video_v2.png +3 -0
- code_depth/benchmark/README.md +34 -0
- code_depth/benchmark/__init__.py +0 -0
- code_depth/benchmark/dataset_extract/dataset_extract_bonn.py +86 -0
- code_depth/benchmark/dataset_extract/dataset_extract_kitti.py +84 -0
- code_depth/benchmark/dataset_extract/dataset_extract_nyuv2.py +76 -0
- code_depth/benchmark/dataset_extract/dataset_extract_scannet.py +124 -0
- code_depth/benchmark/dataset_extract/dataset_extract_sintel.py +110 -0
- code_depth/benchmark/dataset_extract/eval_utils.py +140 -0
- code_depth/benchmark/eval/eval.py +265 -0
- code_depth/benchmark/eval/eval.sh +30 -0
- code_depth/benchmark/eval/eval_500.sh +30 -0
- code_depth/benchmark/eval/eval_tae.py +295 -0
- code_depth/benchmark/eval/eval_tae.sh +18 -0
- code_depth/benchmark/eval/metric.py +117 -0
- code_depth/benchmark/infer/infer.py +65 -0
- code_depth/get_weights.sh +6 -0
- code_depth/large_files.txt +2 -0
- code_depth/requirements.txt +14 -0
- code_depth/run.py +81 -0
- code_depth/run_images_rord.py +112 -0
- code_depth/run_single_image.py +69 -0
- code_depth/utils/dc_utils.py +86 -0
- code_depth/utils/util.py +74 -0
- code_depth/video_depth_anything/dinov2.py +415 -0
- code_depth/video_depth_anything/dinov2_layers/__init__.py +11 -0
- code_depth/video_depth_anything/dinov2_layers/attention.py +83 -0
- code_depth/video_depth_anything/dinov2_layers/block.py +252 -0
- code_depth/video_depth_anything/dinov2_layers/drop_path.py +35 -0
- code_depth/video_depth_anything/dinov2_layers/layer_scale.py +28 -0
- code_depth/video_depth_anything/dinov2_layers/mlp.py +41 -0
- code_depth/video_depth_anything/dinov2_layers/patch_embed.py +89 -0
- code_depth/video_depth_anything/dinov2_layers/swiglu_ffn.py +63 -0
- code_depth/video_depth_anything/dpt.py +160 -0
- code_depth/video_depth_anything/dpt_temporal.py +114 -0
- code_depth/video_depth_anything/motion_module/attention.py +429 -0
- code_depth/video_depth_anything/motion_module/motion_module.py +297 -0
- code_depth/video_depth_anything/util/blocks.py +162 -0
- code_depth/video_depth_anything/util/transform.py +158 -0
- code_depth/video_depth_anything/video_depth.py +156 -0
- code_edit/.gradio/certificate.pem +31 -0
- code_edit/Flux_fill_d2i.py +53 -0
- code_edit/Flux_fill_infer_depth.py +64 -0
- code_edit/README.md +93 -0
app.py
ADDED
@@ -0,0 +1,56 @@
import os
import pathlib
import subprocess
import gradio as gr
import spaces
import torch

# ---------- Weight download: force the script to run inside code_depth ----------
BASE_DIR = pathlib.Path(__file__).resolve().parent
SCRIPT_DIR = BASE_DIR / "code_depth"
GET_WEIGHTS_SH = SCRIPT_DIR / "get_weights.sh"

def ensure_executable(path: pathlib.Path):
    if not path.exists():
        raise FileNotFoundError(f"Download script not found: {path}")
    os.chmod(path, os.stat(path).st_mode | 0o111)

def ensure_weights() -> str:
    """
    Run get_weights.sh inside the code_depth directory.
    The script creates checkpoints/ under code_depth/ and downloads the weights.
    Returns the absolute path: <repo_root>/code_depth/checkpoints
    """
    ensure_executable(GET_WEIGHTS_SH)
    # The script expects code_depth as its working directory
    subprocess.run(
        ["bash", str(GET_WEIGHTS_SH)],
        check=True,
        cwd=str(SCRIPT_DIR),
        env={**os.environ, "HF_HUB_DISABLE_TELEMETRY": "1"},
    )
    ckpt_dir = SCRIPT_DIR / "checkpoints"
    return str(ckpt_dir)

# Pull the weights at startup (without Persistent Storage the environment is wiped on rebuild; after a restart they are pulled again automatically)
try:
    CKPT_DIR = ensure_weights()
    print(f"✅ Weights ready in: {CKPT_DIR}")
except Exception as e:
    print(f"⚠️ Failed to prepare weights: {e}")
    CKPT_DIR = str(SCRIPT_DIR / "checkpoints")  # still provide a path; its existence can be checked later

# ---------- Gradio inference function ----------
@spaces.GPU
def greet(n: float):
    # Get the device inside the GPU worker
    device = "cuda" if torch.cuda.is_available() else "cpu"
    zero = torch.tensor([0.0], device=device)
    # Example output only; CKPT_DIR can be used here to load your model
    print(f"Device in greet(): {device}")
    print(f"Using checkpoints from: {CKPT_DIR}")
    return f"Hello {(zero + n).item()} Tensor (device={device})"

demo = gr.Interface(fn=greet, inputs=gr.Number(label="n"), outputs=gr.Text())
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
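The `greet()` stub above only prints `CKPT_DIR`. As a rough illustration of how a GPU worker could actually consume those checkpoints, the sketch below reuses the loading pattern from `code_depth/app.py` later in this commit. It is not part of the commit: the `load_depth_model` helper is hypothetical, and it assumes `code_depth/` is importable from the Space root and that `get_weights.sh` has already populated `checkpoints/`.

```python
# Minimal sketch (assumption, not part of the commit): build the Video Depth
# Anything model from the checkpoints that ensure_weights() downloaded.
import os
import pathlib
import sys

import torch

# Make the code_depth package importable from the Space root (assumption).
sys.path.insert(0, str(pathlib.Path(__file__).resolve().parent / "code_depth"))
from video_depth_anything.video_depth import VideoDepthAnything

# Same encoder configs as code_depth/app.py in this commit.
MODEL_CONFIGS = {
    'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
    'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
}

def load_depth_model(ckpt_dir: str, encoder: str = 'vitl', device: str = 'cpu') -> VideoDepthAnything:
    """Hypothetical helper: load the weights fetched by get_weights.sh into the model."""
    model = VideoDepthAnything(**MODEL_CONFIGS[encoder])
    ckpt_path = os.path.join(ckpt_dir, f'video_depth_anything_{encoder}.pth')
    model.load_state_dict(torch.load(ckpt_path, map_location='cpu'), strict=True)
    return model.to(device).eval()
```

Inside a `@spaces.GPU` function this would be called with `device="cuda"`, mirroring how `greet()` already picks the device in the worker.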
code_depth/LICENSE
ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
code_depth/README.md
ADDED
@@ -0,0 +1,120 @@
<div align="center">
<h1>Video Depth Anything</h1>

[**Sili Chen**](https://github.com/SiliChen321) · [**Hengkai Guo**](https://guohengkai.github.io/)<sup>†</sup> · [**Shengnan Zhu**](https://github.com/Shengnan-Zhu) · [**Feihu Zhang**](https://github.com/zhizunhu)
<br>
[**Zilong Huang**](http://speedinghzl.github.io/) · [**Jiashi Feng**](https://scholar.google.com.sg/citations?user=Q8iay0gAAAAJ&hl=en) · [**Bingyi Kang**](https://bingykang.github.io/)<sup>†</sup>
<br>
ByteDance
<br>
†Corresponding author

<a href="https://arxiv.org/abs/2501.12375"><img src='https://img.shields.io/badge/arXiv-Video Depth Anything-red' alt='Paper PDF'></a>
<a href='https://videodepthanything.github.io'><img src='https://img.shields.io/badge/Project_Page-Video Depth Anything-green' alt='Project Page'></a>
<a href='https://huggingface.co/spaces/depth-anything/Video-Depth-Anything'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Demo-blue'></a>
</div>

This work presents **Video Depth Anything** based on [Depth Anything V2](https://github.com/DepthAnything/Depth-Anything-V2), which can be applied to arbitrarily long videos without compromising quality, consistency, or generalization ability. Compared with other diffusion-based models, it enjoys faster inference speed, fewer parameters, and higher consistent depth accuracy.



## News
- **2025-03-11:** Add full dataset inference and evaluation scripts.
- **2025-02-08:** Enable autocast inference. Support grayscale video, NPZ and EXR output formats.
- **2025-01-21:** Paper, project page, code, models, and demo are all released.


## Release Notes
- **2025-02-08:** 🚀🚀🚀 Inference speed and memory usage improvement
<table>
  <thead>
    <tr>
      <th rowspan="2" style="text-align: center;">Model</th>
      <th colspan="2">Latency (ms)</th>
      <th colspan="2">GPU VRAM (GB)</th>
    </tr>
    <tr>
      <th>FP32</th>
      <th>FP16</th>
      <th>FP32</th>
      <th>FP16</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>Video-Depth-Anything-V2-Small</td>
      <td>9.1</td>
      <td><strong>7.5</strong></td>
      <td>7.3</td>
      <td><strong>6.8</strong></td>
    </tr>
    <tr>
      <td>Video-Depth-Anything-V2-Large</td>
      <td>67</td>
      <td><strong>14</strong></td>
      <td>26.7</td>
      <td><strong>23.6</strong></td>
    </tr>
  </tbody>
</table>

The latency and GPU VRAM results are obtained on a single A100 GPU with an input of shape 1 x 32 x 518 x 518.

## Pre-trained Models
We provide **two models** of varying scales for robust and consistent video depth estimation:

| Model | Params | Checkpoint |
|:-|-:|:-:|
| Video-Depth-Anything-V2-Small | 28.4M | [Download](https://huggingface.co/depth-anything/Video-Depth-Anything-Small/resolve/main/video_depth_anything_vits.pth?download=true) |
| Video-Depth-Anything-V2-Large | 381.8M | [Download](https://huggingface.co/depth-anything/Video-Depth-Anything-Large/resolve/main/video_depth_anything_vitl.pth?download=true) |

## Usage

### Preparation

```bash
git clone https://github.com/DepthAnything/Video-Depth-Anything
cd Video-Depth-Anything
pip install -r requirements.txt
```

Download the checkpoints listed [here](#pre-trained-models) and put them under the `checkpoints` directory:
```bash
bash get_weights.sh
```

### Inference on a video
```bash
python3 run.py --input_video ./assets/example_videos/davis_rollercoaster.mp4 --output_dir ./outputs --encoder vitl
```

Options:
- `--input_video`: path of the input video
- `--output_dir`: path to save the output results
- `--input_size` (optional): By default, we use input size `518` for model inference.
- `--max_res` (optional): By default, we use maximum resolution `1280` for model inference.
- `--encoder` (optional): `vits` for Video-Depth-Anything-V2-Small, `vitl` for Video-Depth-Anything-V2-Large.
- `--max_len` (optional): maximum length of the input video; `-1` means no limit.
- `--target_fps` (optional): target fps of the input video; `-1` means the original fps.
- `--fp32` (optional): Use `fp32` precision for inference. By default, we use `fp16`.
- `--grayscale` (optional): Save the grayscale depth map, without applying a color palette.
- `--save_npz` (optional): Save the depth map in `npz` format.
- `--save_exr` (optional): Save the depth map in `exr` format.

## Citation

If you find this project useful, please consider citing:

```bibtex
@article{video_depth_anything,
  title={Video Depth Anything: Consistent Depth Estimation for Super-Long Videos},
  author={Chen, Sili and Guo, Hengkai and Zhu, Shengnan and Zhang, Feihu and Huang, Zilong and Feng, Jiashi and Kang, Bingyi},
  journal={arXiv:2501.12375},
  year={2025}
}
```


## LICENSE
Video-Depth-Anything-Small model is under the Apache-2.0 license. Video-Depth-Anything-Large model is under the CC-BY-NC-4.0 license. For business cooperation, please send an email to Hengkai Guo at [email protected].
code_depth/app.py
ADDED
@@ -0,0 +1,152 @@
# Copyright (2025) Bytedance Ltd. and/or its affiliates

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gradio as gr

import numpy as np
import os
import torch

from video_depth_anything.video_depth import VideoDepthAnything
from utils.dc_utils import read_video_frames, save_video

examples = [
    ['assets/example_videos/davis_rollercoaster.mp4', -1, -1, 1280],
]

model_configs = {
    'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
    'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
}

encoder = 'vitl'

video_depth_anything = VideoDepthAnything(**model_configs[encoder])
video_depth_anything.load_state_dict(torch.load(f'./checkpoints/video_depth_anything_{encoder}.pth', map_location='cpu'), strict=True)
video_depth_anything = video_depth_anything.to('cuda').eval()


def infer_video_depth(
    input_video: str,
    max_len: int = -1,
    target_fps: int = -1,
    max_res: int = 1280,
    output_dir: str = './outputs',
    input_size: int = 518,
):
    frames, target_fps = read_video_frames(input_video, max_len, target_fps, max_res)
    depths, fps = video_depth_anything.infer_video_depth(frames, target_fps, input_size=input_size, device='cuda')

    video_name = os.path.basename(input_video)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    processed_video_path = os.path.join(output_dir, os.path.splitext(video_name)[0] + '_src.mp4')
    depth_vis_path = os.path.join(output_dir, os.path.splitext(video_name)[0] + '_vis.mp4')
    save_video(frames, processed_video_path, fps=fps)
    save_video(depths, depth_vis_path, fps=fps, is_depths=True)

    return [processed_video_path, depth_vis_path]


def construct_demo():
    with gr.Blocks(analytics_enabled=False) as demo:
        gr.Markdown(
            f"""
            blablabla
            """
        )

        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                input_video = gr.Video(label="Input Video")

            # with gr.Tab(label="Output"):
            with gr.Column(scale=2):
                with gr.Row(equal_height=True):
                    processed_video = gr.Video(
                        label="Preprocessed video",
                        interactive=False,
                        autoplay=True,
                        loop=True,
                        show_share_button=True,
                        scale=5,
                    )
                    depth_vis_video = gr.Video(
                        label="Generated Depth Video",
                        interactive=False,
                        autoplay=True,
                        loop=True,
                        show_share_button=True,
                        scale=5,
                    )

        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                with gr.Row(equal_height=False):
                    with gr.Accordion("Advanced Settings", open=False):
                        max_len = gr.Slider(
                            label="max process length",
                            minimum=-1,
                            maximum=1000,
                            value=-1,
                            step=1,
                        )
                        target_fps = gr.Slider(
                            label="target FPS",
                            minimum=-1,
                            maximum=30,
                            value=15,
                            step=1,
                        )
                        max_res = gr.Slider(
                            label="max side resolution",
                            minimum=480,
                            maximum=1920,
                            value=1280,
                            step=1,
                        )
                    generate_btn = gr.Button("Generate")
            with gr.Column(scale=2):
                pass

        gr.Examples(
            examples=examples,
            inputs=[
                input_video,
                max_len,
                target_fps,
                max_res
            ],
            outputs=[processed_video, depth_vis_video],
            fn=infer_video_depth,
            cache_examples="lazy",
        )

        generate_btn.click(
            fn=infer_video_depth,
            inputs=[
                input_video,
                max_len,
                target_fps,
                max_res
            ],
            outputs=[processed_video, depth_vis_video],
        )

    return demo

if __name__ == "__main__":
    demo = construct_demo()
    demo.queue()
    demo.launch(server_name="0.0.0.0")
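For clarity, `infer_video_depth` above can also be exercised without the Gradio UI. The snippet below is only an illustrative sketch, not part of the commit: it assumes it is run from inside `code_depth/` with a CUDA device available and `checkpoints/` already populated (importing `app` builds the model at module level), and it reuses the example video shipped in `assets/example_videos/`.

```python
# Hypothetical smoke test for code_depth/app.py (assumption: run from code_depth/,
# CUDA available, checkpoints downloaded by get_weights.sh).
from app import infer_video_depth  # module-level import also loads the model onto the GPU

if __name__ == "__main__":
    # Same defaults as the Gradio "Advanced Settings" sliders.
    src_path, depth_path = infer_video_depth(
        input_video="assets/example_videos/davis_rollercoaster.mp4",
        max_len=-1,       # process the whole clip
        target_fps=-1,    # keep the original frame rate
        max_res=1280,     # cap the longer side at 1280 px
        output_dir="./outputs",
    )
    print("preprocessed video:", src_path)
    print("depth visualization:", depth_path)
```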
code_depth/assets/example_videos/Tokyo-Walk_rgb.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:097f16c33dd8c8d1d2a24d9ea31a90b76bd0ee324b958a47385183e3547a63a8
size 2251450
code_depth/assets/example_videos/davis_rollercoaster.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7268cbecd9806a1e90a416de50dc02e50b4ae01428d5971837cf679dd0c87cb8
size 1809560
code_depth/assets/teaser_video_v2.png
ADDED
Git LFS Details
code_depth/benchmark/README.md
ADDED
@@ -0,0 +1,34 @@
# BENCHMARK

## Prepare Dataset
Download the datasets from the following links:
[sintel](http://sintel.is.tue.mpg.de/) [kitti](https://www.cvlibs.net/datasets/kitti/) [bonn](https://www.ipb.uni-bonn.de/data/rgbd-dynamic-dataset/index.html) [scannet](http://www.scan-net.org/) [nyuv2](https://cs.nyu.edu/~fergus/datasets/nyu_depth_v2.html)

```bash
pip3 install natsort
cd benchmark/dataset_extract
python3 dataset_extract_${dataset}.py
```
This script extracts the dataset to the `benchmark/dataset_extract/dataset` folder and also generates the JSON file for the dataset.

## Run inference
```bash
python3 benchmark/infer/infer.py \
    --infer_path ${out_path} \
    --json_file ${json_path} \
    --datasets ${dataset}
```
Options:
- `--infer_path`: path to save the output results
- `--json_file`: path to the JSON file for the dataset
- `--datasets`: dataset name, chosen from `sintel`, `kitti`, `bonn`, `scannet`, `nyuv2`

## Run evaluation
```bash
## tae
bash benchmark/eval/eval_tae.sh ${out_path} benchmark/dataset_extract/dataset
## ~110 frames like DepthCrafter
bash benchmark/eval/eval.sh ${out_path} benchmark/dataset_extract/dataset
## ~500 frames
bash benchmark/eval/eval_500.sh ${out_path} benchmark/dataset_extract/dataset
```
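For reference, the JSON file passed to `--json_file` is produced by `gen_json` in `benchmark/dataset_extract/eval_utils.py` (added later in this commit). The sketch below shows its shape as a Python literal; the sequence name and file paths are made-up placeholders, not real dataset entries.

```python
# Shape of the per-dataset JSON written by gen_json (eval_utils.py).
# The sequence name "example_sequence" and the frame paths are hypothetical.
example_bonn_json = {
    "bonn": [                          # top-level key is the dataset name
        {
            "example_sequence": [      # one entry per sequence directory
                {
                    "image": "example_sequence/rgb/00030.png",      # path relative to the dataset root
                    "gt_depth": "example_sequence/depth/00030.png",
                    "factor": 5000.0,  # scale used to convert the GT PNG to metric depth
                },
                # ... one dict per frame selected by [start_id:end_id:step]
            ]
        },
        # ... one dict per sequence
    ]
}
```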
code_depth/benchmark/__init__.py
ADDED
File without changes
code_depth/benchmark/dataset_extract/dataset_extract_bonn.py
ADDED
@@ -0,0 +1,86 @@
import os
import numpy as np
import os.path as osp
from PIL import Image
from tqdm import tqdm
import cv2
import csv
import json
import glob
import shutil
from natsort import natsorted

from eval_utils import gen_json, get_sorted_files, even_or_odd, copy_crop_files

def extract_bonn(
    root,
    depth_root,
    saved_dir,
    sample_len,
    datatset_name,
):
    scenes_names = os.listdir(depth_root)
    all_samples = []
    for i, seq_name in enumerate(tqdm(scenes_names)):
        # load all images
        all_img_names = get_sorted_files(
            root=osp.join(depth_root, seq_name, "rgb"), suffix=".png"
        )
        all_depth_names = get_sorted_files(
            root=osp.join(depth_root, seq_name, "depth"), suffix=".png"
        )

        seq_len = len(all_img_names)
        step = sample_len if sample_len > 0 else seq_len

        for ref_idx in range(0, seq_len, step):
            print(f"Progress: {seq_name}, {ref_idx // step + 1} / {seq_len//step}")

            if (ref_idx + step) <= seq_len:
                ref_e = ref_idx + step
            else:
                continue

            for idx in range(ref_idx, ref_e):
                im_path = osp.join(
                    root, seq_name, "rgb", all_img_names[idx]
                )
                depth_path = osp.join(
                    depth_root, seq_name, "depth", all_depth_names[idx]
                )
                out_img_path = osp.join(
                    saved_dir, datatset_name, seq_name, "rgb", all_img_names[idx]
                )
                out_depth_path = osp.join(
                    saved_dir, datatset_name, seq_name, "depth", all_depth_names[idx]
                )

                copy_crop_files(
                    im_path=im_path,
                    depth_path=depth_path,
                    out_img_path=out_img_path,
                    out_depth_path=out_depth_path,
                    dataset=datatset_name,
                )

    # 110 frames like DepthCrafter
    out_json_path = osp.join(saved_dir, datatset_name, "bonn_video.json")
    gen_json(
        root_path=osp.join(saved_dir, datatset_name), dataset=datatset_name,
        start_id=30, end_id=140, step=1, save_path=out_json_path)

    # ~500 frames in paper
    out_json_path = osp.join(saved_dir, datatset_name, "bonn_video_500.json")
    gen_json(
        root_path=osp.join(saved_dir, datatset_name), dataset=datatset_name,
        start_id=0, end_id=500, step=1, save_path=out_json_path)


if __name__ == "__main__":
    extract_bonn(
        root="path/to/Bonn-RGBD",
        depth_root="path/to/Bonn-RGBD",
        saved_dir="./benchmark/datasets/",
        sample_len=-1,
        datatset_name="bonn",
    )
code_depth/benchmark/dataset_extract/dataset_extract_kitti.py
ADDED
@@ -0,0 +1,84 @@
import os
import numpy as np
import os.path as osp
from PIL import Image
from tqdm import tqdm
import csv
import cv2
import json
import glob
import shutil
from natsort import natsorted

from eval_utils import even_or_odd
from eval_utils import gen_json, get_sorted_files, copy_crop_files

def extract_kitti(
    root,
    depth_root,
    sample_len=-1,
    saved_dir="",
    datatset_name="",
):
    scenes_names = os.listdir(depth_root)
    all_samples = []
    for i, seq_name in enumerate(tqdm(scenes_names)):

        all_img_names = get_sorted_files(
            osp.join(depth_root, seq_name, "proj_depth/groundtruth/image_02"), suffix=".png"
        )

        seq_len = len(all_img_names)
        step = sample_len if sample_len > 0 else seq_len

        for ref_idx in range(0, seq_len, step):
            print(f"Progress: {seq_name}, {ref_idx // step + 1} / {seq_len//step}")
            video_imgs = []
            video_depths = []

            if (ref_idx + step) <= seq_len:
                ref_e = ref_idx + step
            else:
                continue

            for idx in range(ref_idx, ref_e):
                im_path = osp.join(
                    root, seq_name[0:10], seq_name, "image_02/data", all_img_names[idx]
                )
                depth_path = osp.join(
                    depth_root, seq_name, "proj_depth/groundtruth/image_02", all_img_names[idx],
                )
                out_img_path = osp.join(
                    saved_dir, datatset_name, seq_name, "rgb", all_img_names[idx]
                )
                out_depth_path = osp.join(
                    saved_dir, datatset_name, seq_name, "depth", all_img_names[idx]
                )
                copy_crop_files(
                    im_path=im_path,
                    depth_path=depth_path,
                    out_img_path=out_img_path,
                    out_depth_path=out_depth_path,
                    dataset=datatset_name,
                )

    # 110 frames like DepthCrafter
    out_json_path = osp.join(saved_dir, datatset_name, "kitti_video.json")
    gen_json(
        root_path=osp.join(saved_dir, datatset_name), dataset=datatset_name,
        start_id=0, end_id=110, step=1, save_path=out_json_path)

    # ~500 frames in paper
    out_json_path = osp.join(saved_dir, datatset_name, "kitti_video_500.json")
    gen_json(
        root_path=osp.join(saved_dir, datatset_name), dataset=datatset_name,
        start_id=0, end_id=500, step=1, save_path=out_json_path)

if __name__ == "__main__":
    extract_kitti(
        root="path/to/kitti",
        depth_root="path/to/kitti/val",
        saved_dir="./benchmark/datasets/",
        sample_len=-1,
        datatset_name="kitti",
    )
code_depth/benchmark/dataset_extract/dataset_extract_nyuv2.py
ADDED
@@ -0,0 +1,76 @@
import os
import numpy as np
import os.path as osp
from PIL import Image
from tqdm import tqdm
import csv
import cv2
import json
import glob
from natsort import natsorted
import shutil

from eval_utils import gen_json, get_sorted_files, copy_crop_files

def extract_nyuv2(
    root,
    sample_len=-1,
    datatset_name="",
    saved_dir="",
):
    scenes_names = os.listdir(root)
    scenes_names = sorted(scenes_names)
    all_samples = []
    for i, seq_name in enumerate(tqdm(scenes_names)):
        all_img_names = get_sorted_files(
            osp.join(root, seq_name, "rgb"), suffix=".jpg")

        seq_len = len(all_img_names)
        step = sample_len if sample_len > 0 else seq_len

        for ref_idx in range(0, seq_len, step):
            print(f"Progress: {seq_name}, {ref_idx // step + 1} / {seq_len//step}")

            if (ref_idx + step) <= seq_len:
                ref_e = ref_idx + step
            else:
                continue

            for idx in range(ref_idx, ref_e):
                im_path = osp.join(
                    root, seq_name, "rgb", all_img_names[idx]
                )
                depth_path = osp.join(
                    root, seq_name, "depth", all_img_names[idx][:-3] + "png"
                )
                out_img_path = osp.join(
                    saved_dir, datatset_name, seq_name, "rgb", all_img_names[idx]
                )
                out_depth_path = osp.join(
                    saved_dir, datatset_name, seq_name, "depth", all_img_names[idx][:-3] + "png"
                )

                copy_crop_files(
                    im_path=im_path,
                    depth_path=depth_path,
                    out_img_path=out_img_path,
                    out_depth_path=out_depth_path,
                    dataset=datatset_name,
                )

    # ~500 frames in paper
    out_json_path = osp.join(saved_dir, datatset_name, "nyuv2_video_500.json")
    gen_json(
        root_path=osp.join(saved_dir, datatset_name), dataset=datatset_name,
        start_id=0, end_id=500, step=1,
        save_path=out_json_path)

if __name__ == "__main__":
    # we use matlab to extract 8 scenes from NYUv2
    # --basement_0001a, bookstore_0001a, cafe_0001a, classroom_0001a, kitchen_0003, office_0004, playroom_0002, study_0002
    extract_nyuv2(
        root="path/to/nyuv2",
        saved_dir="./benchmark/datasets/",
        sample_len=-1,
        datatset_name="nyuv2",
    )
code_depth/benchmark/dataset_extract/dataset_extract_scannet.py
ADDED
@@ -0,0 +1,124 @@
import os
import numpy as np
import os.path as osp
from PIL import Image
from tqdm import tqdm
import csv
import cv2
import json
import glob
from natsort import natsorted
import shutil

from eval_utils import gen_json, gen_json_scannet_tae, get_sorted_files, copy_crop_files

def extract_scannet(
    root,
    sample_len=-1,
    datatset_name="",
    saved_dir="",
):
    scenes_names = os.listdir(root)
    scenes_names = sorted(scenes_names)[:100]
    all_samples = []
    for i, seq_name in enumerate(tqdm(scenes_names)):
        all_img_names = get_sorted_files(
            osp.join(root, seq_name, "color"), suffix=".jpg")
        all_img_names = all_img_names[:510]

        seq_len = len(all_img_names)
        step = sample_len if sample_len > 0 else seq_len

        for ref_idx in range(0, seq_len, step):
            print(f"Progress: {seq_name}, {ref_idx // step + 1} / {seq_len//step}")

            video_imgs = []
            video_depths = []

            if (ref_idx + step) <= seq_len:
                ref_e = ref_idx + step
            else:
                continue

            for idx in range(ref_idx, ref_e):
                im_path = osp.join(
                    root, seq_name, "color", all_img_names[idx]
                )
                depth_path = osp.join(
                    root, seq_name, "depth", all_img_names[idx][:-3] + "png"
                )
                pose_path = osp.join(
                    root, seq_name, "pose", all_img_names[idx][:-3] + "txt"
                )
                out_img_path = osp.join(
                    saved_dir, datatset_name, seq_name, "color", all_img_names[idx]
                )
                out_depth_path = osp.join(
                    saved_dir, datatset_name, seq_name, "depth", all_img_names[idx][:-3] + "png"
                )

                copy_crop_files(
                    im_path=im_path,
                    depth_path=depth_path,
                    out_img_path=out_img_path,
                    out_depth_path=out_depth_path,
                    dataset=datatset_name,
                )

                origin_img = np.array(Image.open(im_path))
                out_img_origin_path = osp.join(
                    saved_dir, datatset_name, seq_name, "color_origin", all_img_names[idx]
                )
                out_pose_path = osp.join(
                    saved_dir, datatset_name, seq_name, "pose", all_img_names[idx][:-3] + "txt"
                )

                os.makedirs(osp.dirname(out_img_origin_path), exist_ok=True)
                os.makedirs(osp.dirname(out_pose_path), exist_ok=True)

                cv2.imwrite(
                    out_img_origin_path,
                    origin_img,
                )
                shutil.copyfile(pose_path, out_pose_path)

        intrinsic_path = osp.join(
            root, seq_name, "intrinsic", "intrinsic_depth.txt"
        )
        out_intrinsic_path = osp.join(
            saved_dir, datatset_name, seq_name, "intrinsic", "intrinsic_depth.txt"
        )
        os.makedirs(osp.dirname(out_intrinsic_path), exist_ok=True)
        shutil.copyfile(intrinsic_path, out_intrinsic_path)

    # 90 frames like DepthCrafter
    out_json_path = osp.join(saved_dir, datatset_name, "scannet_video.json")
    gen_json(
        root_path=osp.join(saved_dir, datatset_name), dataset=datatset_name,
        start_id=0, end_id=90*3, step=3,
        save_path=out_json_path,
    )

    # ~500 frames in paper
    out_json_path = osp.join(saved_dir, datatset_name, "scannet_video_500.json")
    gen_json(
        root_path=osp.join(saved_dir, datatset_name), dataset=datatset_name,
        start_id=0, end_id=500, step=1,
        save_path=out_json_path,
    )

    # tae
    out_json_path = osp.join(saved_dir, datatset_name, "scannet_video_tae.json")
    gen_json_scannet_tae(
        root_path=osp.join(saved_dir, datatset_name),
        start_id=0, end_id=192, step=1,
        save_path=out_json_path,
    )

if __name__ == "__main__":
    extract_scannet(
        root="path/to/scannet",
        saved_dir="./benchmark/datasets/",
        sample_len=-1,
        datatset_name="scannet",
    )
code_depth/benchmark/dataset_extract/dataset_extract_sintel.py
ADDED
@@ -0,0 +1,110 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# # Data loading based on https://github.com/NVIDIA/flownet2-pytorch


import os
import numpy as np
import os.path as osp
from PIL import Image
from tqdm import tqdm
import csv
import imageio
import cv2
import json
import glob
import shutil

from eval_utils import gen_json, get_sorted_files

TAG_FLOAT = 202021.25
TAG_CHAR = "PIEH"

def depth_read(filename):
    """Read depth data from file, return as numpy array."""
    f = open(filename, "rb")
    check = np.fromfile(f, dtype=np.float32, count=1)[0]
    assert (
        check == TAG_FLOAT
    ), " depth_read:: Wrong tag in flow file (should be: {0}, is: {1}). Big-endian machine? ".format(
        TAG_FLOAT, check
    )
    width = np.fromfile(f, dtype=np.int32, count=1)[0]
    height = np.fromfile(f, dtype=np.int32, count=1)[0]
    size = width * height
    assert (
        width > 0 and height > 0 and size > 1 and size < 100000000
    ), " depth_read:: Wrong input size (width = {0}, height = {1}).".format(
        width, height
    )
    depth = np.fromfile(f, dtype=np.float32, count=-1).reshape((height, width))
    return depth

def extract_sintel(
    root,
    depth_root,
    sample_len=-1,
    datatset_name="",
    saved_dir="",
):
    scenes_names = os.listdir(root)
    all_samples = []
    for i, seq_name in enumerate(tqdm(scenes_names)):
        all_img_names = get_sorted_files(
            os.path.join(root, seq_name), suffix=".png")

        seq_len = len(all_img_names)
        step = sample_len if sample_len > 0 else seq_len

        for ref_idx in range(0, seq_len, step):
            print(f"Progress: {seq_name}, {ref_idx // step} / {seq_len // step}")

            if (ref_idx + step) <= seq_len:
                ref_e = ref_idx + step
            else:
                continue

            for idx in range(ref_idx, ref_e):
                im_path = osp.join(
                    root, seq_name, all_img_names[idx]
                )
                depth_path = osp.join(
                    depth_root, seq_name, all_img_names[idx][:-3] + "dpt"
                )
                out_img_path = osp.join(
                    saved_dir, datatset_name, 'clean', seq_name, all_img_names[idx]
                )
                out_depth_path = osp.join(
                    saved_dir, datatset_name, 'depth', seq_name, all_img_names[idx][:-3] + "png"
                )
                depth = depth_read(depth_path)
                img = np.array(Image.open(im_path))

                os.makedirs(osp.dirname(out_img_path), exist_ok=True)
                os.makedirs(osp.dirname(out_depth_path), exist_ok=True)

                cv2.imwrite(
                    out_img_path,
                    img,
                )
                cv2.imwrite(
                    out_depth_path,
                    depth.astype(np.uint16)
                )

    gen_json(
        root_path=osp.join(saved_dir, datatset_name), dataset=datatset_name,
        start_id=0, end_id=100, step=1,
        save_path=osp.join(saved_dir, datatset_name, "sintel_video.json"),)

if __name__ == "__main__":
    extract_sintel(
        root="path/to/training/clean",
        depth_root="path/to/depth",
        saved_dir="./benchmark/datasets/",
        sample_len=-1,
        datatset_name="sintel",
    )
code_depth/benchmark/dataset_extract/eval_utils.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import numpy as np
|
| 3 |
+
import os.path as osp
|
| 4 |
+
import json
|
| 5 |
+
import glob
|
| 6 |
+
import cv2
|
| 7 |
+
import shutil
|
| 8 |
+
from PIL import Image
|
| 9 |
+
from natsort import natsorted
|
| 10 |
+
|
| 11 |
+
def even_or_odd(num):
|
| 12 |
+
if num % 2 == 0:
|
| 13 |
+
return num
|
| 14 |
+
else:
|
| 15 |
+
return num - 1
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def gen_json(root_path, dataset, start_id, end_id, step, save_path=None):
|
| 19 |
+
rgb_name = "rgb"
|
| 20 |
+
if dataset == "kitti":
|
| 21 |
+
factor = 256.0
|
| 22 |
+
elif dataset == "nyuv2":
|
| 23 |
+
factor = 6000.0
|
| 24 |
+
elif dataset == "bonn":
|
| 25 |
+
factor = 5000.0
|
| 26 |
+
elif dataset == 'sintel':
|
| 27 |
+
factor = 65535 / 650
|
| 28 |
+
rgb_name = "clean"
|
| 29 |
+
elif dataset == 'scannet':
|
| 30 |
+
factor = 1000.0
|
| 31 |
+
rgb_name = "color"
|
| 32 |
+
else:
|
| 33 |
+
raise NotImplementedError
|
| 34 |
+
|
| 35 |
+
data = {}
|
| 36 |
+
data[dataset] = []
|
| 37 |
+
pieces = glob.glob(osp.join(root_path, "*"))
|
| 38 |
+
count = 0
|
| 39 |
+
for piece in pieces:
|
| 40 |
+
if not osp.isdir(piece):
|
| 41 |
+
continue
|
| 42 |
+
name = piece.split('/')[-1]
|
| 43 |
+
name_dict = {name:[]}
|
| 44 |
+
images = glob.glob(osp.join(piece, rgb_name, "*.png")) + glob.glob(osp.join(piece, rgb_name, "*.jpg"))
|
| 45 |
+
images = natsorted(images)
|
| 46 |
+
depths = glob.glob(osp.join(piece, "depth/*.png"))
|
| 47 |
+
depths = natsorted(depths)
|
| 48 |
+
images = images[start_id:end_id:step]
|
| 49 |
+
depths = depths[start_id:end_id:step]
|
| 50 |
+
|
| 51 |
+
for i in range(len(images)):
|
| 52 |
+
image = images[i]
|
| 53 |
+
xx = image[len(root_path)+1:]
|
| 54 |
+
depth = depths[i][len(root_path)+1:]
|
| 55 |
+
tmp = {}
|
| 56 |
+
tmp["image"] = xx
|
| 57 |
+
tmp["gt_depth"] = depth
|
| 58 |
+
tmp["factor"] = factor
|
| 59 |
+
name_dict[name].append(tmp)
|
| 60 |
+
data[dataset].append(name_dict)
|
| 61 |
+
with open(save_path, "w") as f:
|
| 62 |
+
json.dump(data, f, indent=4)
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def gen_json_scannet_tae(root_path, start_id, end_id, step, save_path=None):
|
| 66 |
+
data = {}
|
| 67 |
+
data["scannet"] = []
|
| 68 |
+
pieces = glob.glob(osp.join(root_path, "*"))
|
| 69 |
+
|
| 70 |
+
color = 'color_origin'
|
| 71 |
+
|
| 72 |
+
for piece in pieces:
|
| 73 |
+
if not osp.isdir(piece):
|
| 74 |
+
continue
|
| 75 |
+
name = piece.split('/')[-1]
|
| 76 |
+
name_dict = {name:[]}
|
| 77 |
+
images = glob.glob(osp.join(piece,color, "*.jpg"))
|
| 78 |
+
images = natsorted(images)
|
| 79 |
+
depths = glob.glob(osp.join(piece, "depth/*.png"))
|
| 80 |
+
depths = natsorted(depths)
|
| 81 |
+
images = images[start_id:end_id:step]
|
| 82 |
+
depths = depths[start_id:end_id:step]
|
| 83 |
+
print(f"sequence frame number: {piece}")
|
| 84 |
+
count = 0
|
| 85 |
+
for i in range(len(images)):
|
| 86 |
+
image = images[i]
|
| 87 |
+
xx = image[len(root_path)+1:]
|
| 88 |
+
depth = depths[i][len(root_path)+1:]
|
| 89 |
+
|
| 90 |
+
base_path = osp.dirname(image)
|
| 91 |
+
base_path = base_path.replace(color, 'intrinsic')
|
| 92 |
+
K = np.loadtxt(base_path + '/intrinsic_depth.txt')
|
| 93 |
+
|
| 94 |
+
pose_path = image.replace(color, 'pose').replace('.jpg', '.txt')
|
| 95 |
+
pose = np.loadtxt(pose_path)
|
| 96 |
+
|
| 97 |
+
tmp = {}
|
| 98 |
+
tmp["image"] = xx
|
| 99 |
+
tmp["gt_depth"] = depth
|
| 100 |
+
tmp["factor"] = 1000.0
|
| 101 |
+
tmp["K"] = K.tolist()
|
| 102 |
+
tmp["pose"] = pose.tolist()
|
| 103 |
+
name_dict[name].append(tmp)
|
| 104 |
+
data["scannet"].append(name_dict)
|
| 105 |
+
|
| 106 |
+
with open(save_path, "w") as f:
|
| 107 |
+
json.dump(data, f, indent=4)
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def get_sorted_files(root_path, suffix):
|
| 111 |
+
all_img_names = os.listdir(root_path)
|
| 112 |
+
all_img_names = [x for x in all_img_names if x.endswith(suffix)]
|
| 113 |
+
print(f"sequence frame number: {len(all_img_names)}")
|
| 114 |
+
|
| 115 |
+
all_img_names.sort()
|
| 116 |
+
all_img_names = sorted(all_img_names, key=lambda x: int(x.split(".")[0][-4:]))
|
| 117 |
+
|
| 118 |
+
return all_img_names
|
| 119 |
+
|
| 120 |
+
def copy_crop_files(im_path, depth_path, out_img_path, out_depth_path, dataset):
|
| 121 |
+
img = np.array(Image.open(im_path))
|
| 122 |
+
|
| 123 |
+
if dataset == "kitti" or dataset == "bonn":
|
| 124 |
+
height, width = img.shape[:2]
|
| 125 |
+
height = even_or_odd(height)
|
| 126 |
+
width = even_or_odd(width)
|
| 127 |
+
img = img[:height, :width]
|
| 128 |
+
elif dataset == "nyuv2":
|
| 129 |
+
img = img[45:471, 41:601, :]
|
| 130 |
+
elif dataset == "scannet":
|
| 131 |
+
img = img[8:-8, 11:-11, :]
|
| 132 |
+
|
| 133 |
+
os.makedirs(osp.dirname(out_img_path), exist_ok=True)
|
| 134 |
+
os.makedirs(osp.dirname(out_depth_path), exist_ok=True)
|
| 135 |
+
cv2.imwrite(
|
| 136 |
+
out_img_path,
|
| 137 |
+
img,
|
| 138 |
+
)
|
| 139 |
+
shutil.copyfile(depth_path, out_depth_path)
|
| 140 |
+
|
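For orientation, a minimal sketch of the JSON layout that gen_json() writes and that eval.py later walks as path_json[dataset]; the sequence and file names are hypothetical:

example = {
    "sintel": [                                   # one key per dataset
        {
            "alley_1": [                          # one dict per sequence
                {
                    "image": "alley_1/clean/frame_0001.png",     # relative to the dataset root
                    "gt_depth": "alley_1/depth/frame_0001.png",
                    "factor": 65535 / 650,
                },
            ]
        },
    ]
}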
code_depth/benchmark/eval/eval.py
ADDED
|
@@ -0,0 +1,265 @@
|
|
| 1 |
+
|
| 2 |
+
import numpy as np
|
| 3 |
+
import cv2
|
| 4 |
+
import matplotlib.pyplot as plt
|
| 5 |
+
import json
|
| 6 |
+
|
| 7 |
+
import argparse
|
| 8 |
+
from scipy.ndimage import map_coordinates
|
| 9 |
+
from tqdm import tqdm
|
| 10 |
+
import os
|
| 11 |
+
import gc
|
| 12 |
+
|
| 13 |
+
import torch
|
| 14 |
+
from metric import *
|
| 15 |
+
import metric
|
| 16 |
+
|
| 17 |
+
device = 'cuda'
|
| 18 |
+
eval_metrics = [
|
| 19 |
+
"abs_relative_difference",
|
| 20 |
+
"rmse_linear",
|
| 21 |
+
"delta1_acc",
|
| 22 |
+
]
|
| 23 |
+
|
| 24 |
+
def get_infer(infer_path,args, target_size = None):
|
| 25 |
+
if infer_path.split('.')[-1] == 'npy':
|
| 26 |
+
img_gray = np.load(infer_path)
|
| 27 |
+
img_gray = img_gray.astype(np.float32)
|
| 28 |
+
infer_factor = 1.0
|
| 29 |
+
else:
|
| 30 |
+
img = cv2.imread(infer_path)
|
| 31 |
+
img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
| 32 |
+
img_gray = img_gray.astype(np.float32)
|
| 33 |
+
infer_factor = 1.0 / 255.0
|
| 34 |
+
|
| 35 |
+
infer = img_gray / infer_factor
|
| 36 |
+
|
| 37 |
+
if target_size is not None:
|
| 38 |
+
if infer.shape[0] != target_size[0] or infer.shape[1] != target_size[1]:
|
| 39 |
+
infer = cv2.resize(infer, (target_size[1], target_size[0]))
|
| 40 |
+
return infer
|
| 41 |
+
|
| 42 |
+
def get_gt(depth_gt_path, gt_factor, args):
|
| 43 |
+
if depth_gt_path.split('.')[-1] == 'npy':
|
| 44 |
+
depth_gt = np.load(depth_gt_path)
|
| 45 |
+
else:
|
| 46 |
+
depth_gt = cv2.imread(depth_gt_path, -1)
|
| 47 |
+
depth_gt = np.array(depth_gt)
|
| 48 |
+
depth_gt = depth_gt / gt_factor
|
| 49 |
+
depth_gt[depth_gt==0] = -1
|
| 50 |
+
return depth_gt
|
| 51 |
+
|
| 52 |
+
def get_flow(flow_path):
|
| 53 |
+
assert os.path.exists(flow_path)
|
| 54 |
+
flow = np.load(flow_path, allow_pickle=True)
|
| 55 |
+
return flow
|
| 56 |
+
def depth2disparity(depth, return_mask=False):
|
| 57 |
+
if isinstance(depth, np.ndarray):
|
| 58 |
+
disparity = np.zeros_like(depth)
|
| 59 |
+
non_negtive_mask = depth > 0
|
| 60 |
+
disparity[non_negtive_mask] = 1.0 / depth[non_negtive_mask]
|
| 61 |
+
if return_mask:
|
| 62 |
+
return disparity, non_negtive_mask
|
| 63 |
+
else:
|
| 64 |
+
return disparity
|
| 65 |
+
|
| 66 |
+
def eval_depthcrafter(infer_paths, depth_gt_paths, factors, args):
|
| 67 |
+
depth_errors = []
|
| 68 |
+
gts = []
|
| 69 |
+
infs = []
|
| 70 |
+
seq_length = args.max_eval_len
|
| 71 |
+
dataset_max_depth = args.max_depth_eval
|
| 72 |
+
for i in range(len(infer_paths)):
|
| 73 |
+
if not os.path.exists(infer_paths[i]):
|
| 74 |
+
continue
|
| 75 |
+
depth_gt = get_gt(depth_gt_paths[i], factors[i], args)
|
| 76 |
+
depth_gt = depth_gt[args.a:args.b, args.c:args.d]
|
| 77 |
+
|
| 78 |
+
infer = get_infer(infer_paths[i], args, target_size=depth_gt.shape)
|
| 79 |
+
gts.append(depth_gt)
|
| 80 |
+
infs.append(infer)
|
| 81 |
+
gts = np.stack(gts, axis=0)
|
| 82 |
+
|
| 83 |
+
infs = np.stack(infs, axis=0)
|
| 84 |
+
infs = infs[:seq_length]
|
| 85 |
+
gts = gts[:seq_length]
|
| 86 |
+
valid_mask = np.logical_and((gts>1e-3), (gts<dataset_max_depth))
|
| 87 |
+
|
| 88 |
+
gt_disp_masked = 1. / (gts[valid_mask].reshape((-1,1)).astype(np.float64) + 1e-8)
|
| 89 |
+
infs = np.clip(infs, a_min=1e-3, a_max=None)
|
| 90 |
+
pred_disp_masked = infs[valid_mask].reshape((-1,1)).astype(np.float64)
|
| 91 |
+
|
| 92 |
+
_ones = np.ones_like(pred_disp_masked)
|
| 93 |
+
A = np.concatenate([pred_disp_masked, _ones], axis=-1)
|
| 94 |
+
X = np.linalg.lstsq(A, gt_disp_masked, rcond=None)[0]
|
| 95 |
+
scale, shift = X
|
| 96 |
+
aligned_pred = scale * infs + shift
|
| 97 |
+
aligned_pred = np.clip(aligned_pred, a_min=1e-3, a_max=None)
|
| 98 |
+
|
| 99 |
+
pred_depth = depth2disparity(aligned_pred)
|
| 100 |
+
gt_depth = gts
|
| 101 |
+
pred_depth = np.clip(
|
| 102 |
+
pred_depth, a_min=1e-3, a_max=dataset_max_depth
|
| 103 |
+
)
|
| 104 |
+
sample_metric = []
|
| 105 |
+
metric_funcs = [getattr(metric, _met) for _met in eval_metrics]
|
| 106 |
+
|
| 107 |
+
pred_depth_ts = torch.from_numpy(pred_depth).to(device)
|
| 108 |
+
gt_depth_ts = torch.from_numpy(gt_depth).to(device)
|
| 109 |
+
valid_mask_ts = torch.from_numpy(valid_mask).to(device)
|
| 110 |
+
|
| 111 |
+
n = valid_mask.sum((-1, -2))
|
| 112 |
+
valid_frame = (n > 0)
|
| 113 |
+
pred_depth_ts = pred_depth_ts[valid_frame]
|
| 114 |
+
gt_depth_ts = gt_depth_ts[valid_frame]
|
| 115 |
+
valid_mask_ts = valid_mask_ts[valid_frame]
|
| 116 |
+
|
| 117 |
+
for met_func in metric_funcs:
|
| 118 |
+
_metric_name = met_func.__name__
|
| 119 |
+
_metric = met_func(pred_depth_ts, gt_depth_ts, valid_mask_ts).item()
|
| 120 |
+
sample_metric.append(_metric)
|
| 121 |
+
return sample_metric
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def main():
|
| 125 |
+
|
| 126 |
+
parser = argparse.ArgumentParser()
|
| 127 |
+
parser.add_argument('--infer_path', type=str, default='')
|
| 128 |
+
parser.add_argument('--infer_type', type=str, default='npy')
|
| 129 |
+
parser.add_argument('--benchmark_path', type=str, default='')
|
| 130 |
+
parser.add_argument('--datasets', type=str, nargs='+', default=['vkitti', 'kitti', 'sintel', 'nyu_v2', 'tartanair', 'bonn', 'ip_lidar'])
|
| 131 |
+
|
| 132 |
+
args = parser.parse_args()
|
| 133 |
+
|
| 134 |
+
results_save_path = os.path.join(args.infer_path, 'results.txt')
|
| 135 |
+
|
| 136 |
+
for dataset in args.datasets:
|
| 137 |
+
|
| 138 |
+
file = open(results_save_path, 'a')
|
| 139 |
+
|
| 140 |
+
if dataset == 'kitti':
|
| 141 |
+
args.json_file = os.path.join(args.benchmark_path,'kitti/kitti_video.json')
|
| 142 |
+
args.root_path = os.path.join(args.benchmark_path,'kitti')
|
| 143 |
+
args.max_depth_eval = 80.0
|
| 144 |
+
args.min_depth_eval = 0.1
|
| 145 |
+
args.max_eval_len = 110
|
| 146 |
+
args.a = 0
|
| 147 |
+
args.b = 374
|
| 148 |
+
args.c = 0
|
| 149 |
+
args.d = 1242
|
| 150 |
+
if dataset == 'kitti_500':
|
| 151 |
+
dataset = 'kitti'
|
| 152 |
+
args.json_file = os.path.join(args.benchmark_path,'kitti/kitti_video_500.json')
|
| 153 |
+
args.root_path = os.path.join(args.benchmark_path,'kitti')
|
| 154 |
+
args.max_depth_eval = 80.0
|
| 155 |
+
args.min_depth_eval = 0.1
|
| 156 |
+
args.max_eval_len = 500
|
| 157 |
+
args.a = 0
|
| 158 |
+
args.b = 374
|
| 159 |
+
args.c = 0
|
| 160 |
+
args.d = 1242
|
| 161 |
+
elif dataset == 'sintel':
|
| 162 |
+
args.json_file = os.path.join(args.benchmark_path,'sintel/sintel_video.json')
|
| 163 |
+
args.root_path = os.path.join(args.benchmark_path,'sintel')
|
| 164 |
+
args.max_depth_eval = 70
|
| 165 |
+
args.min_depth_eval = 0.1
|
| 166 |
+
args.max_eval_len = 100
|
| 167 |
+
args.a = 0
|
| 168 |
+
args.b = 436
|
| 169 |
+
args.c = 0
|
| 170 |
+
args.d = 1024
|
| 171 |
+
elif dataset == 'nyuv2_500':
|
| 172 |
+
dataset = 'nyuv2'
|
| 173 |
+
args.json_file = os.path.join(args.benchmark_path,'nyuv2/nyuv2_video_500.json')
|
| 174 |
+
args.root_path = os.path.join(args.benchmark_path,'nyuv2')
|
| 175 |
+
args.max_depth_eval = 10.0
|
| 176 |
+
args.min_depth_eval = 0.1
|
| 177 |
+
args.max_eval_len = 500
|
| 178 |
+
args.a = 45
|
| 179 |
+
args.b = 471
|
| 180 |
+
args.c = 41
|
| 181 |
+
args.d = 601
|
| 182 |
+
elif dataset == 'bonn':
|
| 183 |
+
args.json_file = os.path.join(args.benchmark_path,'bonn/bonn_video.json')
|
| 184 |
+
args.root_path = os.path.join(args.benchmark_path,'bonn')
|
| 185 |
+
args.max_depth_eval = 10.0
|
| 186 |
+
args.min_depth_eval = 0.1
|
| 187 |
+
args.max_eval_len = 110
|
| 188 |
+
args.a = 0
|
| 189 |
+
args.b = 480
|
| 190 |
+
args.c = 0
|
| 191 |
+
args.d = 640
|
| 192 |
+
elif dataset == 'bonn_500':
|
| 193 |
+
dataset = 'bonn'
|
| 194 |
+
args.json_file = os.path.join(args.benchmark_path,'bonn/bonn_video_500.json')
|
| 195 |
+
args.root_path = os.path.join(args.benchmark_path,'bonn')
|
| 196 |
+
args.max_depth_eval = 10.0
|
| 197 |
+
args.min_depth_eval = 0.1
|
| 198 |
+
args.max_eval_len = 500
|
| 199 |
+
args.a = 0
|
| 200 |
+
args.b = 480
|
| 201 |
+
args.c = 0
|
| 202 |
+
args.d = 640
|
| 203 |
+
elif dataset == 'scannet':
|
| 204 |
+
args.json_file = os.path.join(args.benchmark_path,'scannet/scannet_video.json')
|
| 205 |
+
args.root_path = os.path.join(args.benchmark_path,'scannet')
|
| 206 |
+
args.max_depth_eval = 10.0
|
| 207 |
+
args.min_depth_eval = 0.1
|
| 208 |
+
args.max_eval_len = 90
|
| 209 |
+
args.a = 8
|
| 210 |
+
args.b = -8
|
| 211 |
+
args.c = 11
|
| 212 |
+
args.d = -11
|
| 213 |
+
elif dataset == 'scannet_500':
|
| 214 |
+
dataset = 'scannet'
|
| 215 |
+
args.json_file = os.path.join(args.benchmark_path,'scannet/scannet_video_500.json')
|
| 216 |
+
args.root_path = os.path.join(args.benchmark_path,'scannet')
|
| 217 |
+
args.max_depth_eval = 10.0
|
| 218 |
+
args.min_depth_eval = 0.1
|
| 219 |
+
args.max_eval_len = 500
|
| 220 |
+
args.a = 8
|
| 221 |
+
args.b = -8
|
| 222 |
+
args.c = 11
|
| 223 |
+
args.d = -11
|
| 224 |
+
|
| 225 |
+
with open(args.json_file, 'r') as fs:
|
| 226 |
+
path_json = json.load(fs)
|
| 227 |
+
|
| 228 |
+
json_data = path_json[dataset]
|
| 229 |
+
scale_stds = shift_stds = stable_result_fulls = stable_result_wins = 0
|
| 230 |
+
depth_result_fulls = np.zeros(5)
|
| 231 |
+
depth_result_wins = np.zeros(5)
|
| 232 |
+
depth_result_onlys = np.zeros(5)
|
| 233 |
+
count = 0
|
| 234 |
+
line = '-' * 50
|
| 235 |
+
print(f'<{line} {dataset} start {line}>')
|
| 236 |
+
file.write(f'<{line} {dataset} start {line}>\n')
|
| 237 |
+
results_all = []
|
| 238 |
+
for data in tqdm(json_data):
|
| 239 |
+
for key in data.keys():
|
| 240 |
+
value = data[key]
|
| 241 |
+
infer_paths = []
|
| 242 |
+
depth_gt_paths = []
|
| 243 |
+
flow_paths = []
|
| 244 |
+
factors = []
|
| 245 |
+
for images in value:
|
| 246 |
+
infer_path = (args.infer_path + '/'+ dataset + '/' + images['image']).replace('.jpg', '.npy').replace('.png', '.npy')
|
| 247 |
+
|
| 248 |
+
infer_paths.append(infer_path)
|
| 249 |
+
depth_gt_paths.append(args.root_path + '/' + images['gt_depth'])
|
| 250 |
+
factors.append(images['factor'])
|
| 251 |
+
infer_paths = infer_paths[:args.max_eval_len]
|
| 252 |
+
depth_gt_paths = depth_gt_paths[:args.max_eval_len]
|
| 253 |
+
factors = factors[:args.max_eval_len]
|
| 254 |
+
results_single = eval_depthcrafter(infer_paths, depth_gt_paths, factors, args)
|
| 255 |
+
results_all.append(results_single)
|
| 256 |
+
final_results = np.array(results_all)
|
| 257 |
+
final_results_mean = np.mean(final_results, axis=0)
|
| 258 |
+
result_dict = { 'name': dataset }
|
| 259 |
+
for i, metric in enumerate(eval_metrics):
|
| 260 |
+
result_dict[metric] = final_results_mean[i]
|
| 261 |
+
print(f"{metric}: {final_results_mean[i]:04f}")
|
| 262 |
+
file.write(f"{metric}: {final_results_mean[i]:04f}\n")
|
| 263 |
+
file.write(f'<{line} {dataset} finish {line}>\n')
|
| 264 |
+
if __name__ == '__main__':
|
| 265 |
+
main()
|
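The heart of eval_depthcrafter() is an affine (scale and shift) alignment of the predicted disparity to the ground-truth disparity before any metric is computed; a condensed, self-contained sketch of that step with synthetic arrays:

import numpy as np

gt_disp = np.array([[0.50], [0.25], [0.10]])      # stand-in for 1 / gt_depth at valid pixels
pred_disp = np.array([[1.00], [0.60], [0.20]])    # stand-in for the network's relative disparity

A = np.concatenate([pred_disp, np.ones_like(pred_disp)], axis=-1)
scale, shift = np.linalg.lstsq(A, gt_disp, rcond=None)[0]
aligned_disp = np.clip(scale * pred_disp + shift, 1e-3, None)
aligned_depth = 1.0 / aligned_disp                # metric depth fed into abs_rel / rmse / delta1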
code_depth/benchmark/eval/eval.sh
ADDED
|
@@ -0,0 +1,30 @@
|
|
| 1 |
+
#!/bin/sh
|
| 2 |
+
set -x
|
| 3 |
+
set -e
|
| 4 |
+
|
| 5 |
+
pred_disp_root=$1 # The parent directory containing the [sintel, scannet, KITTI, bonn, NYUv2] predictions
|
| 6 |
+
benchmark_root=$2 # The parent directory containing the [sintel, scannet, KITTI, bonn, NYUv2] ground truth
|
| 7 |
+
|
| 8 |
+
#eval sintel
|
| 9 |
+
python3 benchmark/eval/eval.py \
|
| 10 |
+
--infer_path $pred_disp_root \
|
| 11 |
+
--benchmark_path $benchmark_root \
|
| 12 |
+
--datasets sintel
|
| 13 |
+
|
| 14 |
+
#eval scannet
|
| 15 |
+
python3 benchmark/eval/eval.py \
|
| 16 |
+
--infer_path $pred_disp_root \
|
| 17 |
+
--benchmark_path $benchmark_root \
|
| 18 |
+
--datasets scannet
|
| 19 |
+
|
| 20 |
+
#eval kitti
|
| 21 |
+
python3 benchmark/eval/eval.py \
|
| 22 |
+
--infer_path $pred_disp_root \
|
| 23 |
+
--benchmark_path $benchmark_root \
|
| 24 |
+
--datasets kitti
|
| 25 |
+
|
| 26 |
+
#eval bonn
|
| 27 |
+
python3 benchmark/eval/eval.py \
|
| 28 |
+
--infer_path $pred_disp_root \
|
| 29 |
+
--benchmark_path $benchmark_root \
|
| 30 |
+
--datasets bonn
|
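Usage note: both arguments are positional, for example "bash benchmark/eval/eval.sh ./outputs/benchmark ./benchmark/datasets" with hypothetical paths; the first directory holds the per-dataset .npy predictions written by benchmark/infer/infer.py and the second holds the extracted ground-truth datasets.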
code_depth/benchmark/eval/eval_500.sh
ADDED
|
@@ -0,0 +1,30 @@
|
|
| 1 |
+
#!/bin/sh
|
| 2 |
+
set -x
|
| 3 |
+
set -e
|
| 4 |
+
|
| 5 |
+
pred_disp_root=$1 # The parent directory containing the [sintel, scannet, KITTI, bonn, NYUv2] predictions
|
| 6 |
+
benchmark_root=$2 # The parent directory containing the [sintel, scannet, KITTI, bonn, NYUv2] ground truth
|
| 7 |
+
|
| 8 |
+
#eval scannet
|
| 9 |
+
python3 benchmark/eval/eval.py \
|
| 10 |
+
--infer_path $pred_disp_root \
|
| 11 |
+
--benchmark_path $benchmark_root \
|
| 12 |
+
--datasets scannet_500
|
| 13 |
+
|
| 14 |
+
#eval kitti
|
| 15 |
+
python3 benchmark/eval/eval.py \
|
| 16 |
+
--infer_path $pred_disp_root \
|
| 17 |
+
--benchmark_path $benchmark_root \
|
| 18 |
+
--datasets kitti_500
|
| 19 |
+
|
| 20 |
+
#eval bonn
|
| 21 |
+
python3 benchmark/eval/eval.py \
|
| 22 |
+
--infer_path $pred_disp_root \
|
| 23 |
+
--benchmark_path $benchmark_root \
|
| 24 |
+
--datasets bonn_500
|
| 25 |
+
|
| 26 |
+
#eval nyu
|
| 27 |
+
python3 benchmark/eval/eval.py \
|
| 28 |
+
--infer_path $pred_disp_root \
|
| 29 |
+
--benchmark_path $benchmark_root \
|
| 30 |
+
--datasets nyuv2_500
|
code_depth/benchmark/eval/eval_tae.py
ADDED
|
@@ -0,0 +1,295 @@
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import cv2
|
| 3 |
+
import matplotlib.pyplot as plt
|
| 4 |
+
import json
|
| 5 |
+
import argparse
|
| 6 |
+
from scipy.ndimage import map_coordinates
|
| 7 |
+
from tqdm import tqdm
|
| 8 |
+
import os
|
| 9 |
+
import gc
|
| 10 |
+
import time
|
| 11 |
+
import torch
|
| 12 |
+
|
| 13 |
+
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 14 |
+
|
| 15 |
+
def compute_errors_torch(gt, pred):
|
| 16 |
+
abs_rel = torch.mean(torch.abs(gt - pred) / gt)
|
| 17 |
+
return abs_rel
|
| 18 |
+
|
| 19 |
+
def get_infer(infer_path,args, target_size = None):
|
| 20 |
+
if infer_path.split('.')[-1] == 'npy':
|
| 21 |
+
img_gray = np.load(infer_path)
|
| 22 |
+
img_gray = img_gray.astype(np.float32)
|
| 23 |
+
infer_factor = 1.0
|
| 24 |
+
else:
|
| 25 |
+
img = cv2.imread(infer_path)
|
| 26 |
+
img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
| 27 |
+
img_gray = img_gray.astype(np.float32)
|
| 28 |
+
infer_factor = 1.0 / 255.0
|
| 29 |
+
|
| 30 |
+
infer = img_gray / infer_factor
|
| 31 |
+
if args.hard_crop:
|
| 32 |
+
infer = infer[args.a:args.b, args.c:args.d]
|
| 33 |
+
|
| 34 |
+
if target_size is not None:
|
| 35 |
+
if infer.shape[0] != target_size[0] or infer.shape[1] != target_size[1]:
|
| 36 |
+
infer = cv2.resize(infer, (target_size[1], target_size[0]))
|
| 37 |
+
return infer
|
| 38 |
+
|
| 39 |
+
def get_gt(depth_gt_path, gt_factor, args):
|
| 40 |
+
if depth_gt_path.split('.')[-1] == 'npy':
|
| 41 |
+
depth_gt = np.load(depth_gt_path)
|
| 42 |
+
else:
|
| 43 |
+
depth_gt = cv2.imread(depth_gt_path, -1)
|
| 44 |
+
depth_gt = np.array(depth_gt)
|
| 45 |
+
depth_gt = depth_gt / gt_factor
|
| 46 |
+
|
| 47 |
+
depth_gt[depth_gt==0] = 0
|
| 48 |
+
return depth_gt
|
| 49 |
+
|
| 50 |
+
def depth2disparity(depth, return_mask=False):
|
| 51 |
+
if isinstance(depth, np.ndarray):
|
| 52 |
+
disparity = np.zeros_like(depth)
|
| 53 |
+
non_negtive_mask = depth > 0
|
| 54 |
+
disparity[non_negtive_mask] = 1.0 / depth[non_negtive_mask]
|
| 55 |
+
if return_mask:
|
| 56 |
+
return disparity, non_negtive_mask
|
| 57 |
+
else:
|
| 58 |
+
return disparity
|
| 59 |
+
|
| 60 |
+
def tae_torch(depth1, depth2, R_2_1, T_2_1, K, mask):
|
| 61 |
+
H, W = depth1.shape
|
| 62 |
+
fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]
|
| 63 |
+
|
| 64 |
+
# Generate meshgrid
|
| 65 |
+
xx, yy = torch.meshgrid(torch.arange(W), torch.arange(H))
|
| 66 |
+
xx, yy = xx.t(), yy.t() # Transpose to match the shape (H, W)
|
| 67 |
+
|
| 68 |
+
# Convert meshgrid to tensor
|
| 69 |
+
xx = xx.to(dtype=depth1.dtype, device=depth1.device)
|
| 70 |
+
yy = yy.to(dtype=depth1.dtype, device=depth1.device)
|
| 71 |
+
# Calculate 3D points in frame 1
|
| 72 |
+
X = (xx - cx) * depth1 / fx
|
| 73 |
+
Y = (yy - cy) * depth1 / fy
|
| 74 |
+
Z = depth1
|
| 75 |
+
points3d = torch.stack((X.flatten(), Y.flatten(), Z.flatten()), dim=1) # Shape (H*W, 3)
|
| 76 |
+
T = torch.tensor(T_2_1, dtype=depth1.dtype, device=depth1.device)
|
| 77 |
+
|
| 78 |
+
# Transform 3D points to frame 2
|
| 79 |
+
points3d_transformed = torch.matmul(points3d, R_2_1.T) + T
|
| 80 |
+
X_world, Y_world, Z_world = points3d_transformed[:, 0], points3d_transformed[:, 1], points3d_transformed[:, 2]
|
| 81 |
+
# Project 3D points to 2D plane using intrinsic matrix
|
| 82 |
+
X_plane = (X_world * fx) / Z_world + cx
|
| 83 |
+
Y_plane = (Y_world * fy) / Z_world + cy
|
| 84 |
+
|
| 85 |
+
# Round and convert to integers
|
| 86 |
+
X_plane = torch.round(X_plane).to(dtype=torch.long)
|
| 87 |
+
Y_plane = torch.round(Y_plane).to(dtype=torch.long)
|
| 88 |
+
|
| 89 |
+
# Filter valid indices
|
| 90 |
+
valid_mask = (X_plane >= 0) & (X_plane < W) & (Y_plane >= 0) & (Y_plane < H)
|
| 91 |
+
if valid_mask.sum() == 0:
|
| 92 |
+
return 0
|
| 93 |
+
|
| 94 |
+
depth_proj = torch.zeros((H, W), dtype=depth1.dtype, device=depth1.device)
|
| 95 |
+
|
| 96 |
+
valid_X = X_plane[valid_mask]
|
| 97 |
+
valid_Y = Y_plane[valid_mask]
|
| 98 |
+
valid_Z = Z_world[valid_mask]
|
| 99 |
+
|
| 100 |
+
depth_proj[valid_Y, valid_X] = valid_Z
|
| 101 |
+
|
| 102 |
+
valid_mask = (depth_proj > 0) & (depth2 > 0) & (mask)
|
| 103 |
+
if valid_mask.sum() == 0:
|
| 104 |
+
return 0
|
| 105 |
+
abs_errors = compute_errors_torch(depth2[valid_mask], depth_proj[valid_mask])
|
| 106 |
+
|
| 107 |
+
return abs_errors
|
| 108 |
+
|
| 109 |
+
def eval_TAE(infer_paths, depth_gt_paths, factors, masks, Ks, poses, args):
|
| 110 |
+
gts = []
|
| 111 |
+
infs = []
|
| 112 |
+
dataset_max_depth = args.max_depth_eval
|
| 113 |
+
gt_paths_cur = []
|
| 114 |
+
Ks_cur = []
|
| 115 |
+
poses_cur = []
|
| 116 |
+
masks_cur = []
|
| 117 |
+
|
| 118 |
+
for i in range(len(infer_paths)):
|
| 119 |
+
# DAV missing some frames
|
| 120 |
+
if not os.path.exists(infer_paths[i]):
|
| 121 |
+
continue
|
| 122 |
+
|
| 123 |
+
depth_gt = get_gt(depth_gt_paths[i], factors[i], args)
|
| 124 |
+
depth_gt = depth_gt[args.a:args.b, args.c:args.d]
|
| 125 |
+
|
| 126 |
+
gt_paths_cur.append(depth_gt_paths[i])
|
| 127 |
+
infer = get_infer(infer_paths[i], args, target_size=depth_gt.shape)
|
| 128 |
+
|
| 129 |
+
gts.append(depth_gt)
|
| 130 |
+
infs.append(infer)
|
| 131 |
+
Ks_cur.append(Ks[i])
|
| 132 |
+
poses_cur.append(poses[i])
|
| 133 |
+
if args.mask:
|
| 134 |
+
masks_cur.append(masks[i])
|
| 135 |
+
|
| 136 |
+
gts = np.stack(gts, axis=0)
|
| 137 |
+
infs = np.stack(infs, axis=0)
|
| 138 |
+
|
| 139 |
+
valid_mask = np.logical_and((gts>1e-3), (gts<dataset_max_depth))
|
| 140 |
+
|
| 141 |
+
gt_disp_masked = 1. / (gts[valid_mask].reshape((-1,1)).astype(np.float64) + 1e-8)
|
| 142 |
+
infs = np.clip(infs, a_min=1e-3, a_max=None)
|
| 143 |
+
pred_disp_masked = infs[valid_mask].reshape((-1,1)).astype(np.float64)
|
| 144 |
+
|
| 145 |
+
_ones = np.ones_like(pred_disp_masked)
|
| 146 |
+
A = np.concatenate([pred_disp_masked, _ones], axis=-1)
|
| 147 |
+
X = np.linalg.lstsq(A, gt_disp_masked, rcond=None)[0]
|
| 148 |
+
scale, shift = X
|
| 149 |
+
|
| 150 |
+
aligned_pred = scale * infs + shift
|
| 151 |
+
aligned_pred = np.clip(aligned_pred, a_min=1e-3, a_max=None)
|
| 152 |
+
|
| 153 |
+
pred_depth = depth2disparity(aligned_pred)
|
| 154 |
+
gt_depth = gts
|
| 155 |
+
pred_depth = np.clip(
|
| 156 |
+
pred_depth, a_min=1e-3, a_max=dataset_max_depth
|
| 157 |
+
)
|
| 158 |
+
|
| 159 |
+
error_sum = 0.
|
| 160 |
+
for i in range(len(gt_paths_cur) -1):
|
| 161 |
+
depth1 = pred_depth[i]
|
| 162 |
+
depth2 = pred_depth[i+1]
|
| 163 |
+
|
| 164 |
+
gt_depth1 = gt_paths_cur[i]
|
| 165 |
+
gt_depth2 = gt_paths_cur[i+1]
|
| 166 |
+
T_1 = poses_cur[i]
|
| 167 |
+
T_2 = poses_cur[i+1]
|
| 168 |
+
|
| 169 |
+
T_2_1 = np.linalg.inv(T_2) @ T_1
|
| 170 |
+
|
| 171 |
+
R_2_1 = T_2_1[:3,:3]
|
| 172 |
+
t_2_1 = T_2_1[:3, 3]
|
| 173 |
+
K = Ks_cur[i]
|
| 174 |
+
|
| 175 |
+
if args.mask:
|
| 176 |
+
mask_path1 = masks_cur[i]
|
| 177 |
+
mask_path2 = masks_cur[i+1]
|
| 178 |
+
mask1 = cv2.imread(mask_path1, -1)
|
| 179 |
+
mask2 = cv2.imread(mask_path2, -1)
|
| 180 |
+
mask1 = mask1[args.a:args.b, args.c:args.d]
|
| 181 |
+
if mask2 is None:
|
| 182 |
+
mask2 = np.ones_like(mask1)
|
| 183 |
+
else:
|
| 184 |
+
mask2 = mask2[args.a:args.b, args.c:args.d]
|
| 185 |
+
|
| 186 |
+
mask1 = mask1 > 0
|
| 187 |
+
mask2 = mask2 > 0
|
| 188 |
+
else:
|
| 189 |
+
mask1 = np.ones_like(depth1)
|
| 190 |
+
mask2 = np.ones_like(depth2)
|
| 191 |
+
|
| 192 |
+
mask1 = mask1 > 0
|
| 193 |
+
mask2 = mask2 > 0
|
| 194 |
+
|
| 195 |
+
depth1 = torch.from_numpy(depth1).to(device=device)
|
| 196 |
+
depth2 = torch.from_numpy(depth2).to(device=device)
|
| 197 |
+
R_2_1 = torch.from_numpy(R_2_1).to(device=device)
|
| 198 |
+
t_2_1 = torch.from_numpy(t_2_1).to(device=device)
|
| 199 |
+
mask1 = torch.from_numpy(mask1).to(device=device)
|
| 200 |
+
mask2 = torch.from_numpy(mask2).to(device=device)
|
| 201 |
+
|
| 202 |
+
error1 = tae_torch(depth1, depth2, R_2_1, t_2_1, K, mask2)
|
| 203 |
+
T_1_2 = np.linalg.inv(T_2_1)
|
| 204 |
+
R_1_2 = T_1_2[:3,:3]
|
| 205 |
+
t_1_2 = T_1_2[:3, 3]
|
| 206 |
+
|
| 207 |
+
R_1_2 = torch.from_numpy(R_1_2).to(device=device)
|
| 208 |
+
t_1_2 = torch.from_numpy(t_1_2).to(device=device)
|
| 209 |
+
|
| 210 |
+
error2 = tae_torch(depth2, depth1, R_1_2, t_1_2, K, mask1)
|
| 211 |
+
|
| 212 |
+
error_sum += error1
|
| 213 |
+
error_sum += error2
|
| 214 |
+
|
| 215 |
+
gc.collect()
|
| 216 |
+
result = error_sum / (2 * (len(gt_paths_cur) -1))
|
| 217 |
+
return result*100
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
if __name__ == '__main__':
|
| 221 |
+
parser = argparse.ArgumentParser()
|
| 222 |
+
parser.add_argument('--infer_path', type=str, default='')
|
| 223 |
+
parser.add_argument('--benchmark_path', type=str, default='')
|
| 224 |
+
|
| 225 |
+
parser.add_argument('--datasets', type=str, nargs='+', default=['scannet', 'sintel'])
|
| 226 |
+
parser.add_argument('--start_idx', type=int, default=0)
|
| 227 |
+
parser.add_argument('--end_idx', type=int, default=180)
|
| 228 |
+
parser.add_argument('--eval_scenes_num', type=int, default=20)
|
| 229 |
+
parser.add_argument('--hard_crop', action='store_true', default=False)
|
| 230 |
+
|
| 231 |
+
args = parser.parse_args()
|
| 232 |
+
|
| 233 |
+
results_save_path = os.path.join(args.infer_path, 'results.txt')
|
| 234 |
+
|
| 235 |
+
for dataset in args.datasets:
|
| 236 |
+
|
| 237 |
+
file = open(results_save_path, 'a')
|
| 238 |
+
if dataset == 'scannet':
|
| 239 |
+
args.json_file = os.path.join(args.benchmark_path,'scannet/scannet_video.json')
|
| 240 |
+
args.root_path = os.path.join(args.benchmark_path, 'scannet/')
|
| 241 |
+
args.max_depth_eval = 10.0
|
| 242 |
+
args.min_depth_eval = 0.1
|
| 243 |
+
args.max_eval_len = 200
|
| 244 |
+
args.mask = False
|
| 245 |
+
# DepthCrafter crop
|
| 246 |
+
args.a = 8
|
| 247 |
+
args.b = -8
|
| 248 |
+
args.c = 11
|
| 249 |
+
args.d = -11
|
| 250 |
+
|
| 251 |
+
with open(args.json_file, 'r') as fs:
|
| 252 |
+
path_json = json.load(fs)
|
| 253 |
+
|
| 254 |
+
json_data = path_json[dataset]
|
| 255 |
+
count = 0
|
| 256 |
+
line = '-' * 50
|
| 257 |
+
print(f'<{line} {dataset} start {line}>')
|
| 258 |
+
file.write(f'<{line} {dataset} start {line}>\n')
|
| 259 |
+
results_all = 0.
|
| 260 |
+
|
| 261 |
+
for data in tqdm(json_data[:args.eval_scenes_num]):
|
| 262 |
+
for scene_name in data.keys():
|
| 263 |
+
value = data[scene_name]
|
| 264 |
+
infer_paths = []
|
| 265 |
+
depth_gt_paths = []
|
| 266 |
+
factors = []
|
| 267 |
+
Ks = []
|
| 268 |
+
poses = []
|
| 269 |
+
masks = []
|
| 270 |
+
for images in value:
|
| 271 |
+
infer_path = (args.infer_path + '/'+ dataset + '/' + images['image']).replace('.jpg', '.npy').replace('.png', '.npy')
|
| 272 |
+
|
| 273 |
+
infer_paths.append(infer_path)
|
| 274 |
+
depth_gt_paths.append(args.root_path + '/' + images['gt_depth'])
|
| 275 |
+
factors.append(images['factor'])
|
| 276 |
+
Ks.append(np.array(images['K']))
|
| 277 |
+
poses.append(np.array(images['pose']))
|
| 278 |
+
|
| 279 |
+
if args.mask:
|
| 280 |
+
masks.append(args.root_path + '/' + images['mask'])
|
| 281 |
+
|
| 282 |
+
infer_paths = infer_paths[args.start_idx:args.end_idx]
|
| 283 |
+
depth_gt_paths = depth_gt_paths[args.start_idx:args.end_idx]
|
| 284 |
+
factors = factors[args.start_idx:args.end_idx]
|
| 285 |
+
poses = poses[args.start_idx:args.end_idx]
|
| 286 |
+
Ks = Ks[args.start_idx:args.end_idx]
|
| 287 |
+
error = eval_TAE(infer_paths, depth_gt_paths, factors,masks,Ks,poses,args)
|
| 288 |
+
results_all += error
|
| 289 |
+
count += 1
|
| 290 |
+
|
| 291 |
+
print(dataset,': ','tae ', results_all / count)
|
| 292 |
+
file.write(f'{dataset}: {results_all / count}\n')
|
| 293 |
+
file.write(f'<{line} {dataset} finish {line}>\n')
|
| 294 |
+
|
| 295 |
+
|
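tae_torch() scores temporal consistency by warping frame 1's aligned depth into frame 2 with the relative camera pose and comparing it against frame 2's depth; the per-pixel geometry reduces to the following self-contained sketch (intrinsics, pose and depths are made up):

import numpy as np

K = np.array([[500.0, 0.0, 320.0], [0.0, 500.0, 240.0], [0.0, 0.0, 1.0]])  # hypothetical intrinsics
R_2_1, t_2_1 = np.eye(3), np.array([0.1, 0.0, 0.0])                        # hypothetical relative pose
depth2 = np.full((480, 640), 2.0)                                          # hypothetical prediction for frame 2

u, v, d1 = 100, 80, 2.0                                 # one pixel of frame 1 and its predicted depth
p1 = d1 * np.linalg.inv(K) @ np.array([u, v, 1.0])      # back-project into frame 1's camera space
p2 = R_2_1 @ p1 + t_2_1                                 # move the point into frame 2's camera space
u2, v2 = np.round(K @ p2 / p2[2])[:2].astype(int)       # project onto frame 2's image plane
abs_rel = abs(depth2[v2, u2] - p2[2]) / depth2[v2, u2]  # per-pixel error that tae_torch() averages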
code_depth/benchmark/eval/eval_tae.sh
ADDED
|
@@ -0,0 +1,18 @@
|
|
| 1 |
+
#!/bin/sh
|
| 2 |
+
set -x
|
| 3 |
+
set -e
|
| 4 |
+
|
| 5 |
+
pred_disp_root=$1 # The parent directory containing the [sintel, scannet, KITTI, bonn, NYUv2] predictions
|
| 6 |
+
benchmark_root=$2 # The parent directory containing the [sintel, scannet, KITTI, bonn, NYUv2] ground truth
|
| 7 |
+
|
| 8 |
+
#eval scannet
|
| 9 |
+
python3 benchmark/eval/eval_tae.py \
|
| 10 |
+
--infer_path $pred_disp_root \
|
| 11 |
+
--benchmark_path $benchmark_root \
|
| 12 |
+
--datasets scannet \
|
| 13 |
+
--start_idx 10 \
|
| 14 |
+
--end_idx 180 \
|
| 15 |
+
--eval_scenes_num 20 \
|
| 16 |
+
--hard_crop
|
| 17 |
+
|
| 18 |
+
|
code_depth/benchmark/eval/metric.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
| 1 |
+
import torch
|
| 2 |
+
|
| 3 |
+
def abs_relative_difference(output, target, valid_mask=None):
|
| 4 |
+
actual_output = output
|
| 5 |
+
actual_target = target
|
| 6 |
+
abs_relative_diff = torch.abs(actual_output - actual_target) / actual_target
|
| 7 |
+
if valid_mask is not None:
|
| 8 |
+
abs_relative_diff[~valid_mask] = 0
|
| 9 |
+
n = valid_mask.sum((-1, -2))
|
| 10 |
+
else:
|
| 11 |
+
n = output.shape[-1] * output.shape[-2]
|
| 12 |
+
abs_relative_diff = torch.sum(abs_relative_diff, (-1, -2)) / n
|
| 13 |
+
return abs_relative_diff.mean()
|
| 14 |
+
|
| 15 |
+
def squared_relative_difference(output, target, valid_mask=None):
|
| 16 |
+
actual_output = output
|
| 17 |
+
actual_target = target
|
| 18 |
+
square_relative_diff = (
|
| 19 |
+
torch.pow(torch.abs(actual_output - actual_target), 2) / actual_target
|
| 20 |
+
)
|
| 21 |
+
if valid_mask is not None:
|
| 22 |
+
square_relative_diff[~valid_mask] = 0
|
| 23 |
+
n = valid_mask.sum((-1, -2))
|
| 24 |
+
else:
|
| 25 |
+
n = output.shape[-1] * output.shape[-2]
|
| 26 |
+
square_relative_diff = torch.sum(square_relative_diff, (-1, -2)) / n
|
| 27 |
+
return square_relative_diff.mean()
|
| 28 |
+
|
| 29 |
+
def rmse_linear(output, target, valid_mask=None):
|
| 30 |
+
actual_output = output
|
| 31 |
+
actual_target = target
|
| 32 |
+
diff = actual_output - actual_target
|
| 33 |
+
if valid_mask is not None:
|
| 34 |
+
diff[~valid_mask] = 0
|
| 35 |
+
n = valid_mask.sum((-1, -2))
|
| 36 |
+
else:
|
| 37 |
+
n = output.shape[-1] * output.shape[-2]
|
| 38 |
+
diff2 = torch.pow(diff, 2)
|
| 39 |
+
mse = torch.sum(diff2, (-1, -2)) / n
|
| 40 |
+
rmse = torch.sqrt(mse)
|
| 41 |
+
return rmse.mean()
|
| 42 |
+
|
| 43 |
+
def rmse_log(output, target, valid_mask=None):
|
| 44 |
+
diff = torch.log(output) - torch.log(target)
|
| 45 |
+
if valid_mask is not None:
|
| 46 |
+
diff[~valid_mask] = 0
|
| 47 |
+
n = valid_mask.sum((-1, -2))
|
| 48 |
+
else:
|
| 49 |
+
n = output.shape[-1] * output.shape[-2]
|
| 50 |
+
diff2 = torch.pow(diff, 2)
|
| 51 |
+
mse = torch.sum(diff2, (-1, -2)) / n # [B]
|
| 52 |
+
rmse = torch.sqrt(mse)
|
| 53 |
+
return rmse.mean()
|
| 54 |
+
|
| 55 |
+
def log10(output, target, valid_mask=None):
|
| 56 |
+
if valid_mask is not None:
|
| 57 |
+
diff = torch.abs(
|
| 58 |
+
torch.log10(output[valid_mask]) - torch.log10(target[valid_mask])
|
| 59 |
+
)
|
| 60 |
+
else:
|
| 61 |
+
diff = torch.abs(torch.log10(output) - torch.log10(target))
|
| 62 |
+
return diff.mean()
|
| 63 |
+
|
| 64 |
+
# adapt from: https://github.com/imran3180/depth-map-prediction/blob/master/main.py
|
| 65 |
+
def threshold_percentage(output, target, threshold_val, valid_mask=None):
|
| 66 |
+
d1 = output / target
|
| 67 |
+
d2 = target / output
|
| 68 |
+
max_d1_d2 = torch.max(d1, d2)
|
| 69 |
+
zero = torch.zeros(*output.shape)
|
| 70 |
+
one = torch.ones(*output.shape)
|
| 71 |
+
bit_mat = torch.where(max_d1_d2.cpu() < threshold_val, one, zero)
|
| 72 |
+
if valid_mask is not None:
|
| 73 |
+
bit_mat[~valid_mask] = 0
|
| 74 |
+
n = valid_mask.sum((-1, -2))
|
| 75 |
+
else:
|
| 76 |
+
n = output.shape[-1] * output.shape[-2]
|
| 77 |
+
count_mat = torch.sum(bit_mat, (-1, -2))
|
| 78 |
+
threshold_mat = count_mat / n.cpu()
|
| 79 |
+
return threshold_mat.mean()
|
| 80 |
+
|
| 81 |
+
def delta1_acc(pred, gt, valid_mask):
|
| 82 |
+
return threshold_percentage(pred, gt, 1.25, valid_mask)
|
| 83 |
+
|
| 84 |
+
def delta2_acc(pred, gt, valid_mask):
|
| 85 |
+
return threshold_percentage(pred, gt, 1.25**2, valid_mask)
|
| 86 |
+
|
| 87 |
+
def delta3_acc(pred, gt, valid_mask):
|
| 88 |
+
return threshold_percentage(pred, gt, 1.25**3, valid_mask)
|
| 89 |
+
|
| 90 |
+
def i_rmse(output, target, valid_mask=None):
|
| 91 |
+
output_inv = 1.0 / output
|
| 92 |
+
target_inv = 1.0 / target
|
| 93 |
+
diff = output_inv - target_inv
|
| 94 |
+
if valid_mask is not None:
|
| 95 |
+
diff[~valid_mask] = 0
|
| 96 |
+
n = valid_mask.sum((-1, -2))
|
| 97 |
+
else:
|
| 98 |
+
n = output.shape[-1] * output.shape[-2]
|
| 99 |
+
diff2 = torch.pow(diff, 2)
|
| 100 |
+
mse = torch.sum(diff2, (-1, -2)) / n # [B]
|
| 101 |
+
rmse = torch.sqrt(mse)
|
| 102 |
+
return rmse.mean()
|
| 103 |
+
|
| 104 |
+
def silog_rmse(depth_pred, depth_gt, valid_mask=None):
|
| 105 |
+
diff = torch.log(depth_pred) - torch.log(depth_gt)
|
| 106 |
+
if valid_mask is not None:
|
| 107 |
+
diff[~valid_mask] = 0
|
| 108 |
+
n = valid_mask.sum((-1, -2))
|
| 109 |
+
else:
|
| 110 |
+
n = depth_gt.shape[-2] * depth_gt.shape[-1]
|
| 111 |
+
|
| 112 |
+
diff2 = torch.pow(diff, 2)
|
| 113 |
+
|
| 114 |
+
first_term = torch.sum(diff2, (-1, -2)) / n
|
| 115 |
+
second_term = torch.pow(torch.sum(diff, (-1, -2)), 2) / (n**2)
|
| 116 |
+
loss = torch.sqrt(torch.mean(first_term - second_term)) * 100
|
| 117 |
+
return loss
|
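A small, self-contained example of calling these metrics the way eval.py does, here with random tensors shaped like a short clip of depth maps (values are synthetic):

import torch
from metric import abs_relative_difference, rmse_linear, delta1_acc

pred = torch.rand(4, 48, 64) * 10 + 0.1   # predicted depth, [frames, H, W]
gt = torch.rand(4, 48, 64) * 10 + 0.1     # ground-truth depth
mask = gt > 0.5                           # valid-pixel mask

print(abs_relative_difference(pred, gt, mask))
print(rmse_linear(pred, gt, mask))
print(delta1_acc(pred, gt, mask))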
code_depth/benchmark/infer/infer.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
| 1 |
+
import argparse
|
| 2 |
+
import os
|
| 3 |
+
import cv2
|
| 4 |
+
import json
|
| 5 |
+
import torch
|
| 6 |
+
from tqdm import tqdm
|
| 7 |
+
import numpy as np
|
| 8 |
+
|
| 9 |
+
from video_depth_anything.video_depth import VideoDepthAnything
|
| 10 |
+
from utils.dc_utils import read_video_frames
|
| 11 |
+
|
| 12 |
+
if __name__ == '__main__':
|
| 13 |
+
parser = argparse.ArgumentParser()
|
| 14 |
+
parser.add_argument('--infer_path', type=str, default='')
|
| 15 |
+
|
| 16 |
+
parser.add_argument('--json_file', type=str, default='')
|
| 17 |
+
parser.add_argument('--datasets', type=str, nargs='+', default=['scannet', 'nyuv2'])
|
| 18 |
+
|
| 19 |
+
parser.add_argument('--input_size', type=int, default=518)
|
| 20 |
+
parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitl'])
|
| 21 |
+
|
| 22 |
+
args = parser.parse_args()
|
| 23 |
+
|
| 24 |
+
for dataset in args.datasets:
|
| 25 |
+
|
| 26 |
+
with open(args.json_file, 'r') as fs:
|
| 27 |
+
path_json = json.load(fs)
|
| 28 |
+
|
| 29 |
+
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
|
| 30 |
+
|
| 31 |
+
model_configs = {
|
| 32 |
+
'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
|
| 33 |
+
'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
video_depth_anything = VideoDepthAnything(**model_configs[args.encoder])
|
| 37 |
+
video_depth_anything.load_state_dict(torch.load(f'./checkpoints/video_depth_anything_{args.encoder}.pth', map_location='cpu'), strict=True)
|
| 38 |
+
video_depth_anything = video_depth_anything.to(DEVICE).eval()
|
| 39 |
+
|
| 40 |
+
json_data = path_json[dataset]
|
| 41 |
+
root_path = os.path.dirname(args.json_file)
|
| 42 |
+
for data in tqdm(json_data):
|
| 43 |
+
for key in data.keys():
|
| 44 |
+
value = data[key]
|
| 45 |
+
infer_paths = []
|
| 46 |
+
|
| 47 |
+
videos = []
|
| 48 |
+
for images in value:
|
| 49 |
+
|
| 50 |
+
image_path = os.path.join(root_path, images['image'])
|
| 51 |
+
infer_path = (args.infer_path + '/'+ dataset + '/' + images['image']).replace('.jpg', '.npy').replace('.png', '.npy')
|
| 52 |
+
infer_paths.append(infer_path)
|
| 53 |
+
|
| 54 |
+
img = cv2.imread(image_path)
|
| 55 |
+
videos.append(img)
|
| 56 |
+
videos = np.stack(videos, axis=0)
|
| 57 |
+
target_fps=1
|
| 58 |
+
depths, fps = video_depth_anything.infer_video_depth(videos, target_fps, input_size=args.input_size, device=DEVICE, fp32=True)
|
| 59 |
+
|
| 60 |
+
for i in range(len(infer_paths)):
|
| 61 |
+
infer_path = infer_paths[i]
|
| 62 |
+
os.makedirs(os.path.dirname(infer_path), exist_ok=True)
|
| 63 |
+
depth = depths[i]
|
| 64 |
+
np.save(infer_path, depth)
|
| 65 |
+
|
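infer.py saves one .npy prediction per benchmark image by mirroring the image path under --infer_path, which is exactly the substitution eval.py repeats when it loads predictions; a sketch with hypothetical values:

infer_root = "./outputs/benchmark"            # --infer_path
dataset = "scannet"
image_rel = "scene0000_00/color/000000.jpg"   # "image" field from the benchmark JSON

pred_path = (infer_root + '/' + dataset + '/' + image_rel).replace('.jpg', '.npy').replace('.png', '.npy')
# -> ./outputs/benchmark/scannet/scene0000_00/color/000000.npy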
code_depth/get_weights.sh
ADDED
|
@@ -0,0 +1,6 @@
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
mkdir checkpoints
|
| 4 |
+
cd checkpoints
|
| 5 |
+
wget https://huggingface.co/depth-anything/Video-Depth-Anything-Small/resolve/main/video_depth_anything_vits.pth
|
| 6 |
+
wget https://huggingface.co/depth-anything/Video-Depth-Anything-Large/resolve/main/video_depth_anything_vitl.pth
|
code_depth/large_files.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
| 1 |
+
./checkpoints/video_depth_anything_vitl.pth
|
| 2 |
+
./checkpoints/video_depth_anything_vits.pth
|
code_depth/requirements.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
| 1 |
+
numpy==1.23.1
|
| 2 |
+
torch==2.1.1
|
| 3 |
+
torchvision==0.16.1
|
| 4 |
+
opencv-python
|
| 5 |
+
matplotlib
|
| 6 |
+
pillow
|
| 7 |
+
imageio==2.19.3
|
| 8 |
+
imageio-ffmpeg==0.4.7
|
| 9 |
+
decord
|
| 10 |
+
xformers==0.0.23
|
| 11 |
+
einops==0.4.1
|
| 12 |
+
easydict
|
| 13 |
+
tqdm
|
| 14 |
+
OpenEXR==3.3.1
|
code_depth/run.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
| 1 |
+
# Copyright (2025) Bytedance Ltd. and/or its affiliates
|
| 2 |
+
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
import argparse
|
| 15 |
+
import numpy as np
|
| 16 |
+
import os
|
| 17 |
+
import torch
|
| 18 |
+
|
| 19 |
+
from video_depth_anything.video_depth import VideoDepthAnything
|
| 20 |
+
from utils.dc_utils import read_video_frames, save_video
|
| 21 |
+
|
| 22 |
+
if __name__ == '__main__':
|
| 23 |
+
parser = argparse.ArgumentParser(description='Video Depth Anything')
|
| 24 |
+
parser.add_argument('--input_video', type=str, default='./assets/example_videos/davis_rollercoaster.mp4')
|
| 25 |
+
parser.add_argument('--output_dir', type=str, default='./outputs')
|
| 26 |
+
parser.add_argument('--input_size', type=int, default=518)
|
| 27 |
+
parser.add_argument('--max_res', type=int, default=1280)
|
| 28 |
+
parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitl'])
|
| 29 |
+
parser.add_argument('--max_len', type=int, default=-1, help='maximum length of the input video, -1 means no limit')
|
| 30 |
+
parser.add_argument('--target_fps', type=int, default=-1, help='target fps of the input video, -1 means the original fps')
|
| 31 |
+
parser.add_argument('--fp32', action='store_true', help='model infer with torch.float32, default is torch.float16')
|
| 32 |
+
parser.add_argument('--grayscale', action='store_true', help='do not apply colorful palette')
|
| 33 |
+
parser.add_argument('--save_npz', action='store_true', help='save depths as npz')
|
| 34 |
+
parser.add_argument('--save_exr', action='store_true', help='save depths as exr')
|
| 35 |
+
|
| 36 |
+
args = parser.parse_args()
|
| 37 |
+
|
| 38 |
+
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
|
| 39 |
+
|
| 40 |
+
model_configs = {
|
| 41 |
+
'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
|
| 42 |
+
'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
video_depth_anything = VideoDepthAnything(**model_configs[args.encoder])
|
| 46 |
+
video_depth_anything.load_state_dict(torch.load(f'./checkpoints/video_depth_anything_{args.encoder}.pth', map_location='cpu'), strict=True)
|
| 47 |
+
video_depth_anything = video_depth_anything.to(DEVICE).eval()
|
| 48 |
+
|
| 49 |
+
frames, target_fps = read_video_frames(args.input_video, args.max_len, args.target_fps, args.max_res)
|
| 50 |
+
depths, fps = video_depth_anything.infer_video_depth(frames, target_fps, input_size=args.input_size, device=DEVICE, fp32=args.fp32)
|
| 51 |
+
|
| 52 |
+
video_name = os.path.basename(args.input_video)
|
| 53 |
+
if not os.path.exists(args.output_dir):
|
| 54 |
+
os.makedirs(args.output_dir)
|
| 55 |
+
|
| 56 |
+
processed_video_path = os.path.join(args.output_dir, os.path.splitext(video_name)[0]+'_src.mp4')
|
| 57 |
+
depth_vis_path = os.path.join(args.output_dir, os.path.splitext(video_name)[0]+'_vis.mp4')
|
| 58 |
+
save_video(frames, processed_video_path, fps=fps)
|
| 59 |
+
save_video(depths, depth_vis_path, fps=fps, is_depths=True, grayscale=args.grayscale)
|
| 60 |
+
|
| 61 |
+
if args.save_npz:
|
| 62 |
+
depth_npz_path = os.path.join(args.output_dir, os.path.splitext(video_name)[0]+'_depths.npz')
|
| 63 |
+
np.savez_compressed(depth_npz_path, depths=depths)
|
| 64 |
+
if args.save_exr:
|
| 65 |
+
depth_exr_dir = os.path.join(args.output_dir, os.path.splitext(video_name)[0]+'_depths_exr')
|
| 66 |
+
os.makedirs(depth_exr_dir, exist_ok=True)
|
| 67 |
+
import OpenEXR
|
| 68 |
+
import Imath
|
| 69 |
+
for i, depth in enumerate(depths):
|
| 70 |
+
output_exr = f"{depth_exr_dir}/frame_{i:05d}.exr"
|
| 71 |
+
header = OpenEXR.Header(depth.shape[1], depth.shape[0])
|
| 72 |
+
header["channels"] = {
|
| 73 |
+
"Z": Imath.Channel(Imath.PixelType(Imath.PixelType.FLOAT))
|
| 74 |
+
}
|
| 75 |
+
exr_file = OpenEXR.OutputFile(output_exr, header)
|
| 76 |
+
exr_file.writePixels({"Z": depth.tobytes()})
|
| 77 |
+
exr_file.close()
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
|
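With --save_exr, run.py writes one single-channel float EXR per frame through the same OpenEXR and Imath modules it imports; a minimal sketch of reading such a frame back (the path is hypothetical, and the API usage follows the standard OpenEXR Python bindings rather than code from this repo):

import numpy as np
import OpenEXR, Imath

exr = OpenEXR.InputFile("outputs/davis_rollercoaster_depths_exr/frame_00000.exr")
dw = exr.header()['dataWindow']
h, w = dw.max.y - dw.min.y + 1, dw.max.x - dw.min.x + 1
buf = exr.channel("Z", Imath.PixelType(Imath.PixelType.FLOAT))
depth = np.frombuffer(buf, dtype=np.float32).reshape(h, w)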
code_depth/run_images_rord.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
| 1 |
+
# Copyright (2025) Bytedance Ltd. and/or its affiliates
|
| 2 |
+
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
import argparse
|
| 15 |
+
import numpy as np
|
| 16 |
+
import os
|
| 17 |
+
import torch
|
| 18 |
+
import os
|
| 19 |
+
import cv2
|
| 20 |
+
import numpy as np
|
| 21 |
+
import matplotlib.pyplot as plt
|
| 22 |
+
import matplotlib.cm as cm
|
| 23 |
+
from PIL import Image
|
| 24 |
+
from video_depth_anything.video_depth import VideoDepthAnything
|
| 25 |
+
from utils.dc_utils import read_video_frames, save_video
|
| 26 |
+
import tqdm
|
| 27 |
+
|
| 28 |
+
if __name__ == '__main__':
|
| 29 |
+
parser = argparse.ArgumentParser(description='Video Depth Anything')
|
| 30 |
+
parser.add_argument('--input_size', type=int, default=518)
|
| 31 |
+
parser.add_argument('--max_res', type=int, default=1280)
|
| 32 |
+
parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitl'])
|
| 33 |
+
parser.add_argument('--max_len', type=int, default=-1, help='maximum length of the input video, -1 means no limit')
|
| 34 |
+
parser.add_argument('--target_fps', type=int, default=-1, help='target fps of the input video, -1 means the original fps')
|
| 35 |
+
parser.add_argument('--fp32', action='store_true', help='model infer with torch.float32, default is torch.float16')
|
| 36 |
+
parser.add_argument('--grayscale', action='store_true', help='do not apply colorful palette')
|
| 37 |
+
parser.add_argument('--save_npz', action='store_true', help='save depths as npz')
|
| 38 |
+
parser.add_argument('--save_exr', action='store_true', help='save depths as exr')
|
| 39 |
+
|
| 40 |
+
args = parser.parse_args()
|
| 41 |
+
|
| 42 |
+
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
|
| 43 |
+
|
| 44 |
+
model_configs = {
|
| 45 |
+
'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
|
| 46 |
+
'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
video_depth_anything = VideoDepthAnything(**model_configs[args.encoder])
|
| 50 |
+
video_depth_anything.load_state_dict(torch.load(f'./checkpoints/video_depth_anything_{args.encoder}.pth', map_location='cpu'), strict=True)
|
| 51 |
+
video_depth_anything = video_depth_anything.to(DEVICE).eval()
|
| 52 |
+
|
| 53 |
+
# place input dir and out dir here
|
| 54 |
+
root_img_dir = "RORD/train/img"
|
| 55 |
+
root_gt_dir = "RORD/train/gt"
|
| 56 |
+
save_root_img_base = "RORD/val/img_depth"
|
| 57 |
+
save_root_gt_base = "RORD/val/gt_depth"
|
| 58 |
+
|
| 59 |
+
video_ids = sorted(os.listdir(root_img_dir))
|
| 60 |
+
|
| 61 |
+
for video_id in tqdm.tqdm(video_ids):
|
| 62 |
+
frame_dir = os.path.join(root_img_dir, video_id)
|
| 63 |
+
|
| 64 |
+
frame_paths = sorted([
|
| 65 |
+
os.path.join(frame_dir, fname) for fname in os.listdir(frame_dir)
|
| 66 |
+
if fname.endswith(".jpg") or fname.endswith(".png")
|
| 67 |
+
])
|
| 68 |
+
frames = [cv2.imread(p)[:, :, ::-1] for p in frame_paths]
|
| 69 |
+
gt_path = frame_paths[0].replace("/img/", "/gt/")
|
| 70 |
+
|
| 71 |
+
gt_img = cv2.imread(gt_path)[:, :, ::-1] # BGR to RGB
|
| 72 |
+
frames.append(gt_img)
|
| 73 |
+
|
| 74 |
+
resized_frames = []
|
| 75 |
+
max_res = 1280
|
| 76 |
+
for f in frames:
|
| 77 |
+
h, w = f.shape[:2]
|
| 78 |
+
if max(h, w) > max_res:
|
| 79 |
+
scale = max_res / max(h, w)
|
| 80 |
+
f = cv2.resize(f, (int(w * scale), int(h * scale)))
|
| 81 |
+
resized_frames.append(f)
|
| 82 |
+
|
| 83 |
+
resized_frames = np.stack(resized_frames, axis=0)
|
| 84 |
+
|
| 85 |
+
depths, _ = video_depth_anything.infer_video_depth(
|
| 86 |
+
resized_frames, 32, input_size=518, device=DEVICE, fp32=False
|
| 87 |
+
)
|
| 88 |
+
|
| 89 |
+
save_root_img = os.path.join(save_root_img_base, video_id)
|
| 90 |
+
save_root_gt = os.path.join(save_root_gt_base, video_id)
|
| 91 |
+
os.makedirs(save_root_img, exist_ok=True)
|
| 92 |
+
os.makedirs(save_root_gt, exist_ok=True)
|
| 93 |
+
|
| 94 |
+
colormap = np.array(cm.get_cmap("inferno").colors)
|
| 95 |
+
d_min, d_max = depths.min(), depths.max()
|
| 96 |
+
for i, path in enumerate(frame_paths):
|
| 97 |
+
fname = os.path.basename(path)
|
| 98 |
+
|
| 99 |
+
depth = depths[i]
|
| 100 |
+
depth_norm = ((depth - d_min) / (d_max - d_min + 1e-6) * 255).astype(np.uint8)
|
| 101 |
+
depth_vis = (colormap[depth_norm] * 255).astype(np.uint8) # shape: (H, W, 3), uint8
|
| 102 |
+
|
| 103 |
+
img_path = os.path.join(save_root_img, fname)
|
| 104 |
+
Image.fromarray(depth_vis).save(img_path)
|
| 105 |
+
|
| 106 |
+
gt_depth = depths[-1]
|
| 107 |
+
gt_norm = ((gt_depth - d_min) / (d_max - d_min + 1e-6) * 255).astype(np.uint8)
|
| 108 |
+
gt_vis = (colormap[gt_norm] * 255).astype(np.uint8)
|
| 109 |
+
|
| 110 |
+
gt_save_path = os.path.join(save_root_gt, fname)
|
| 111 |
+
Image.fromarray(gt_vis).save(gt_save_path)
|
| 112 |
+
|
code_depth/run_single_image.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
| 1 |
+
# Copyright (2025) Bytedance Ltd. and/or its affiliates
|
| 2 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 3 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import numpy as np
|
| 7 |
+
import torch
|
| 8 |
+
import cv2
|
| 9 |
+
import matplotlib.cm as cm
|
| 10 |
+
from PIL import Image
|
| 11 |
+
from video_depth_anything.video_depth import VideoDepthAnything
|
| 12 |
+
|
| 13 |
+
if __name__ == '__main__':
|
| 14 |
+
|
| 15 |
+
import argparse
|
| 16 |
+
parser = argparse.ArgumentParser(description='Video Depth Anything')
|
| 17 |
+
parser.add_argument('--input_size', type=int, default=518)
|
| 18 |
+
parser.add_argument('--max_res', type=int, default=1280)
|
| 19 |
+
parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitl'])
|
| 20 |
+
parser.add_argument('--max_len', type=int, default=-1)
|
| 21 |
+
parser.add_argument('--target_fps', type=int, default=-1)
|
| 22 |
+
parser.add_argument('--fp32', action='store_true')
|
| 23 |
+
parser.add_argument('--grayscale', action='store_true')
|
| 24 |
+
parser.add_argument('--save_npz', action='store_true')
|
| 25 |
+
parser.add_argument('--save_exr', action='store_true')
|
| 26 |
+
args = parser.parse_args()
|
| 27 |
+
|
| 28 |
+
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
|
| 29 |
+
|
| 30 |
+
model_configs = {
|
| 31 |
+
'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
|
| 32 |
+
'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
video_depth_anything = VideoDepthAnything(**model_configs[args.encoder])
|
| 36 |
+
video_depth_anything.load_state_dict(
|
| 37 |
+
torch.load(f'./checkpoints/video_depth_anything_{args.encoder}.pth', map_location='cpu'),
|
| 38 |
+
strict=True
|
| 39 |
+
)
|
| 40 |
+
video_depth_anything = video_depth_anything.to(DEVICE).eval()
|
| 41 |
+
|
| 42 |
+
# your image input and output path
|
| 43 |
+
input_path = ""
|
| 44 |
+
output_path = ""
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
img = cv2.imread(input_path)[:, :, ::-1]
|
| 48 |
+
h, w = img.shape[:2]
|
| 49 |
+
|
| 50 |
+
if max(h, w) > args.max_res:
|
| 51 |
+
scale = args.max_res / max(h, w)
|
| 52 |
+
img = cv2.resize(img, (int(w * scale), int(h * scale)))
|
| 53 |
+
|
| 54 |
+
frame_tensor = np.stack([img], axis=0)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
depths, _ = video_depth_anything.infer_video_depth(
|
| 58 |
+
frame_tensor, 32, input_size=518, device=DEVICE, fp32=False
|
| 59 |
+
)
|
| 60 |
+
depth = depths[0]
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
colormap = np.array(cm.get_cmap("inferno").colors)
|
| 64 |
+
d_min, d_max = depth.min(), depth.max()
|
| 65 |
+
depth_norm = ((depth - d_min) / (d_max - d_min + 1e-6) * 255).astype(np.uint8)
|
| 66 |
+
depth_vis = (colormap[depth_norm] * 255).astype(np.uint8)
|
| 67 |
+
|
| 68 |
+
Image.fromarray(depth_vis).save(output_path)
|
| 69 |
+
print(f"Saved depth map to: {output_path}")
|
code_depth/utils/dc_utils.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This file is originally from DepthCrafter/depthcrafter/utils.py at main · Tencent/DepthCrafter
|
| 2 |
+
# SPDX-License-Identifier: MIT License license
|
| 3 |
+
#
|
| 4 |
+
# This file may have been modified by ByteDance Ltd. and/or its affiliates on [date of modification]
|
| 5 |
+
# Original file is released under [ MIT License license], with the full license text available at [https://github.com/Tencent/DepthCrafter?tab=License-1-ov-file].
|
| 6 |
+
import numpy as np
|
| 7 |
+
import matplotlib.cm as cm
|
| 8 |
+
import imageio
|
| 9 |
+
try:
|
| 10 |
+
from decord import VideoReader, cpu
|
| 11 |
+
DECORD_AVAILABLE = True
|
| 12 |
+
except:
|
| 13 |
+
import cv2
|
| 14 |
+
DECORD_AVAILABLE = False
|
| 15 |
+
|
| 16 |
+
def ensure_even(value):
|
| 17 |
+
return value if value % 2 == 0 else value + 1
|
| 18 |
+
|
| 19 |
+
def read_video_frames(video_path, process_length, target_fps=-1, max_res=-1):
|
| 20 |
+
if DECORD_AVAILABLE:
|
| 21 |
+
vid = VideoReader(video_path, ctx=cpu(0))
|
| 22 |
+
original_height, original_width = vid.get_batch([0]).shape[1:3]
|
| 23 |
+
height = original_height
|
| 24 |
+
width = original_width
|
| 25 |
+
if max_res > 0 and max(height, width) > max_res:
|
| 26 |
+
scale = max_res / max(original_height, original_width)
|
| 27 |
+
height = ensure_even(round(original_height * scale))
|
| 28 |
+
width = ensure_even(round(original_width * scale))
|
| 29 |
+
|
| 30 |
+
vid = VideoReader(video_path, ctx=cpu(0), width=width, height=height)
|
| 31 |
+
|
| 32 |
+
fps = vid.get_avg_fps() if target_fps == -1 else target_fps
|
| 33 |
+
stride = round(vid.get_avg_fps() / fps)
|
| 34 |
+
stride = max(stride, 1)
|
| 35 |
+
frames_idx = list(range(0, len(vid), stride))
|
| 36 |
+
if process_length != -1 and process_length < len(frames_idx):
|
| 37 |
+
frames_idx = frames_idx[:process_length]
|
| 38 |
+
frames = vid.get_batch(frames_idx).asnumpy()
|
| 39 |
+
else:
|
| 40 |
+
cap = cv2.VideoCapture(video_path)
|
| 41 |
+
original_fps = cap.get(cv2.CAP_PROP_FPS)
|
| 42 |
+
original_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
| 43 |
+
original_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
|
| 44 |
+
|
| 45 |
+
if max_res > 0 and max(original_height, original_width) > max_res:
|
| 46 |
+
scale = max_res / max(original_height, original_width)
|
| 47 |
+
height = round(original_height * scale)
|
| 48 |
+
width = round(original_width * scale)
|
| 49 |
+
|
| 50 |
+
fps = original_fps if target_fps < 0 else target_fps
|
| 51 |
+
|
| 52 |
+
stride = max(round(original_fps / fps), 1)
|
| 53 |
+
|
| 54 |
+
frames = []
|
| 55 |
+
frame_count = 0
|
| 56 |
+
while cap.isOpened():
|
| 57 |
+
ret, frame = cap.read()
|
| 58 |
+
if not ret or (process_length > 0 and frame_count >= process_length):
|
| 59 |
+
break
|
| 60 |
+
if frame_count % stride == 0:
|
| 61 |
+
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) # Convert BGR to RGB
|
| 62 |
+
if max_res > 0 and max(original_height, original_width) > max_res:
|
| 63 |
+
frame = cv2.resize(frame, (width, height)) # Resize frame
|
| 64 |
+
frames.append(frame)
|
| 65 |
+
frame_count += 1
|
| 66 |
+
cap.release()
|
| 67 |
+
frames = np.stack(frames, axis=0)
|
| 68 |
+
|
| 69 |
+
return frames, fps
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def save_video(frames, output_video_path, fps=10, is_depths=False, grayscale=False):
|
| 73 |
+
writer = imageio.get_writer(output_video_path, fps=fps, macro_block_size=1, codec='libx264', ffmpeg_params=['-crf', '18'])
|
| 74 |
+
if is_depths:
|
| 75 |
+
colormap = np.array(cm.get_cmap("inferno").colors)
|
| 76 |
+
d_min, d_max = frames.min(), frames.max()
|
| 77 |
+
for i in range(frames.shape[0]):
|
| 78 |
+
depth = frames[i]
|
| 79 |
+
depth_norm = ((depth - d_min) / (d_max - d_min) * 255).astype(np.uint8)
|
| 80 |
+
depth_vis = (colormap[depth_norm] * 255).astype(np.uint8) if not grayscale else depth_norm
|
| 81 |
+
writer.append_data(depth_vis)
|
| 82 |
+
else:
|
| 83 |
+
for i in range(frames.shape[0]):
|
| 84 |
+
writer.append_data(frames[i])
|
| 85 |
+
|
| 86 |
+
writer.close()
|
code_depth/utils/util.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (2025) Bytedance Ltd. and/or its affiliates
|
| 2 |
+
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
import numpy as np
|
| 15 |
+
|
| 16 |
+
def compute_scale_and_shift(prediction, target, mask, scale_only=False):
|
| 17 |
+
if scale_only:
|
| 18 |
+
return compute_scale(prediction, target, mask), 0
|
| 19 |
+
else:
|
| 20 |
+
return compute_scale_and_shift_full(prediction, target, mask)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def compute_scale(prediction, target, mask):
|
| 24 |
+
# system matrix: A = [[a_00, a_01], [a_10, a_11]]
|
| 25 |
+
prediction = prediction.astype(np.float32)
|
| 26 |
+
target = target.astype(np.float32)
|
| 27 |
+
mask = mask.astype(np.float32)
|
| 28 |
+
|
| 29 |
+
a_00 = np.sum(mask * prediction * prediction)
|
| 30 |
+
a_01 = np.sum(mask * prediction)
|
| 31 |
+
a_11 = np.sum(mask)
|
| 32 |
+
|
| 33 |
+
# right hand side: b = [b_0, b_1]
|
| 34 |
+
b_0 = np.sum(mask * prediction * target)
|
| 35 |
+
|
| 36 |
+
x_0 = b_0 / (a_00 + 1e-6)
|
| 37 |
+
|
| 38 |
+
return x_0
|
| 39 |
+
|
| 40 |
+
def compute_scale_and_shift_full(prediction, target, mask):
|
| 41 |
+
# system matrix: A = [[a_00, a_01], [a_10, a_11]]
|
| 42 |
+
prediction = prediction.astype(np.float32)
|
| 43 |
+
target = target.astype(np.float32)
|
| 44 |
+
mask = mask.astype(np.float32)
|
| 45 |
+
|
| 46 |
+
a_00 = np.sum(mask * prediction * prediction)
|
| 47 |
+
a_01 = np.sum(mask * prediction)
|
| 48 |
+
a_11 = np.sum(mask)
|
| 49 |
+
|
| 50 |
+
b_0 = np.sum(mask * prediction * target)
|
| 51 |
+
b_1 = np.sum(mask * target)
|
| 52 |
+
|
| 53 |
+
x_0 = 1
|
| 54 |
+
x_1 = 0
|
| 55 |
+
|
| 56 |
+
det = a_00 * a_11 - a_01 * a_01
|
| 57 |
+
|
| 58 |
+
if det != 0:
|
| 59 |
+
x_0 = (a_11 * b_0 - a_01 * b_1) / det
|
| 60 |
+
x_1 = (-a_01 * b_0 + a_00 * b_1) / det
|
| 61 |
+
|
| 62 |
+
return x_0, x_1
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def get_interpolate_frames(frame_list_pre, frame_list_post):
|
| 66 |
+
assert len(frame_list_pre) == len(frame_list_post)
|
| 67 |
+
min_w = 0.0
|
| 68 |
+
max_w = 1.0
|
| 69 |
+
step = (max_w - min_w) / (len(frame_list_pre)-1)
|
| 70 |
+
post_w_list = [min_w] + [i * step for i in range(1,len(frame_list_pre)-1)] + [max_w]
|
| 71 |
+
interpolated_frames = []
|
| 72 |
+
for i in range(len(frame_list_pre)):
|
| 73 |
+
interpolated_frames.append(frame_list_pre[i] * (1-post_w_list[i]) + frame_list_post[i] * post_w_list[i])
|
| 74 |
+
return interpolated_frames
|
code_depth/video_depth_anything/dinov2.py
ADDED
|
@@ -0,0 +1,415 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the Apache License, Version 2.0
|
| 4 |
+
# found in the LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
# References:
|
| 7 |
+
# https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
|
| 8 |
+
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
|
| 9 |
+
|
| 10 |
+
from functools import partial
|
| 11 |
+
import math
|
| 12 |
+
import logging
|
| 13 |
+
from typing import Sequence, Tuple, Union, Callable
|
| 14 |
+
|
| 15 |
+
import torch
|
| 16 |
+
import torch.nn as nn
|
| 17 |
+
import torch.utils.checkpoint
|
| 18 |
+
from torch.nn.init import trunc_normal_
|
| 19 |
+
|
| 20 |
+
from .dinov2_layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
logger = logging.getLogger("dinov2")
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
|
| 27 |
+
if not depth_first and include_root:
|
| 28 |
+
fn(module=module, name=name)
|
| 29 |
+
for child_name, child_module in module.named_children():
|
| 30 |
+
child_name = ".".join((name, child_name)) if name else child_name
|
| 31 |
+
named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
|
| 32 |
+
if depth_first and include_root:
|
| 33 |
+
fn(module=module, name=name)
|
| 34 |
+
return module
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class BlockChunk(nn.ModuleList):
|
| 38 |
+
def forward(self, x):
|
| 39 |
+
for b in self:
|
| 40 |
+
x = b(x)
|
| 41 |
+
return x
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
class DinoVisionTransformer(nn.Module):
|
| 45 |
+
def __init__(
|
| 46 |
+
self,
|
| 47 |
+
img_size=224,
|
| 48 |
+
patch_size=16,
|
| 49 |
+
in_chans=3,
|
| 50 |
+
embed_dim=768,
|
| 51 |
+
depth=12,
|
| 52 |
+
num_heads=12,
|
| 53 |
+
mlp_ratio=4.0,
|
| 54 |
+
qkv_bias=True,
|
| 55 |
+
ffn_bias=True,
|
| 56 |
+
proj_bias=True,
|
| 57 |
+
drop_path_rate=0.0,
|
| 58 |
+
drop_path_uniform=False,
|
| 59 |
+
init_values=None, # for layerscale: None or 0 => no layerscale
|
| 60 |
+
embed_layer=PatchEmbed,
|
| 61 |
+
act_layer=nn.GELU,
|
| 62 |
+
block_fn=Block,
|
| 63 |
+
ffn_layer="mlp",
|
| 64 |
+
block_chunks=1,
|
| 65 |
+
num_register_tokens=0,
|
| 66 |
+
interpolate_antialias=False,
|
| 67 |
+
interpolate_offset=0.1,
|
| 68 |
+
):
|
| 69 |
+
"""
|
| 70 |
+
Args:
|
| 71 |
+
img_size (int, tuple): input image size
|
| 72 |
+
patch_size (int, tuple): patch size
|
| 73 |
+
in_chans (int): number of input channels
|
| 74 |
+
embed_dim (int): embedding dimension
|
| 75 |
+
depth (int): depth of transformer
|
| 76 |
+
num_heads (int): number of attention heads
|
| 77 |
+
mlp_ratio (int): ratio of mlp hidden dim to embedding dim
|
| 78 |
+
qkv_bias (bool): enable bias for qkv if True
|
| 79 |
+
proj_bias (bool): enable bias for proj in attn if True
|
| 80 |
+
ffn_bias (bool): enable bias for ffn if True
|
| 81 |
+
drop_path_rate (float): stochastic depth rate
|
| 82 |
+
drop_path_uniform (bool): apply uniform drop rate across blocks
|
| 83 |
+
weight_init (str): weight init scheme
|
| 84 |
+
init_values (float): layer-scale init values
|
| 85 |
+
embed_layer (nn.Module): patch embedding layer
|
| 86 |
+
act_layer (nn.Module): MLP activation layer
|
| 87 |
+
block_fn (nn.Module): transformer block class
|
| 88 |
+
ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
|
| 89 |
+
block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
|
| 90 |
+
num_register_tokens: (int) number of extra cls tokens (so-called "registers")
|
| 91 |
+
interpolate_antialias: (str) flag to apply anti-aliasing when interpolating positional embeddings
|
| 92 |
+
interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
|
| 93 |
+
"""
|
| 94 |
+
super().__init__()
|
| 95 |
+
norm_layer = partial(nn.LayerNorm, eps=1e-6)
|
| 96 |
+
|
| 97 |
+
self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
|
| 98 |
+
self.num_tokens = 1
|
| 99 |
+
self.n_blocks = depth
|
| 100 |
+
self.num_heads = num_heads
|
| 101 |
+
self.patch_size = patch_size
|
| 102 |
+
self.num_register_tokens = num_register_tokens
|
| 103 |
+
self.interpolate_antialias = interpolate_antialias
|
| 104 |
+
self.interpolate_offset = interpolate_offset
|
| 105 |
+
|
| 106 |
+
self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
|
| 107 |
+
num_patches = self.patch_embed.num_patches
|
| 108 |
+
|
| 109 |
+
self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
|
| 110 |
+
self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
|
| 111 |
+
assert num_register_tokens >= 0
|
| 112 |
+
self.register_tokens = (
|
| 113 |
+
nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
|
| 114 |
+
)
|
| 115 |
+
|
| 116 |
+
if drop_path_uniform is True:
|
| 117 |
+
dpr = [drop_path_rate] * depth
|
| 118 |
+
else:
|
| 119 |
+
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
|
| 120 |
+
|
| 121 |
+
if ffn_layer == "mlp":
|
| 122 |
+
logger.info("using MLP layer as FFN")
|
| 123 |
+
ffn_layer = Mlp
|
| 124 |
+
elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
|
| 125 |
+
logger.info("using SwiGLU layer as FFN")
|
| 126 |
+
ffn_layer = SwiGLUFFNFused
|
| 127 |
+
elif ffn_layer == "identity":
|
| 128 |
+
logger.info("using Identity layer as FFN")
|
| 129 |
+
|
| 130 |
+
def f(*args, **kwargs):
|
| 131 |
+
return nn.Identity()
|
| 132 |
+
|
| 133 |
+
ffn_layer = f
|
| 134 |
+
else:
|
| 135 |
+
raise NotImplementedError
|
| 136 |
+
|
| 137 |
+
blocks_list = [
|
| 138 |
+
block_fn(
|
| 139 |
+
dim=embed_dim,
|
| 140 |
+
num_heads=num_heads,
|
| 141 |
+
mlp_ratio=mlp_ratio,
|
| 142 |
+
qkv_bias=qkv_bias,
|
| 143 |
+
proj_bias=proj_bias,
|
| 144 |
+
ffn_bias=ffn_bias,
|
| 145 |
+
drop_path=dpr[i],
|
| 146 |
+
norm_layer=norm_layer,
|
| 147 |
+
act_layer=act_layer,
|
| 148 |
+
ffn_layer=ffn_layer,
|
| 149 |
+
init_values=init_values,
|
| 150 |
+
)
|
| 151 |
+
for i in range(depth)
|
| 152 |
+
]
|
| 153 |
+
if block_chunks > 0:
|
| 154 |
+
self.chunked_blocks = True
|
| 155 |
+
chunked_blocks = []
|
| 156 |
+
chunksize = depth // block_chunks
|
| 157 |
+
for i in range(0, depth, chunksize):
|
| 158 |
+
# this is to keep the block index consistent if we chunk the block list
|
| 159 |
+
chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
|
| 160 |
+
self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
|
| 161 |
+
else:
|
| 162 |
+
self.chunked_blocks = False
|
| 163 |
+
self.blocks = nn.ModuleList(blocks_list)
|
| 164 |
+
|
| 165 |
+
self.norm = norm_layer(embed_dim)
|
| 166 |
+
self.head = nn.Identity()
|
| 167 |
+
|
| 168 |
+
self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))
|
| 169 |
+
|
| 170 |
+
self.init_weights()
|
| 171 |
+
|
| 172 |
+
def init_weights(self):
|
| 173 |
+
trunc_normal_(self.pos_embed, std=0.02)
|
| 174 |
+
nn.init.normal_(self.cls_token, std=1e-6)
|
| 175 |
+
if self.register_tokens is not None:
|
| 176 |
+
nn.init.normal_(self.register_tokens, std=1e-6)
|
| 177 |
+
named_apply(init_weights_vit_timm, self)
|
| 178 |
+
|
| 179 |
+
def interpolate_pos_encoding(self, x, w, h):
|
| 180 |
+
previous_dtype = x.dtype
|
| 181 |
+
npatch = x.shape[1] - 1
|
| 182 |
+
N = self.pos_embed.shape[1] - 1
|
| 183 |
+
if npatch == N and w == h:
|
| 184 |
+
return self.pos_embed
|
| 185 |
+
pos_embed = self.pos_embed.float()
|
| 186 |
+
class_pos_embed = pos_embed[:, 0]
|
| 187 |
+
patch_pos_embed = pos_embed[:, 1:]
|
| 188 |
+
dim = x.shape[-1]
|
| 189 |
+
w0 = w // self.patch_size
|
| 190 |
+
h0 = h // self.patch_size
|
| 191 |
+
# we add a small number to avoid floating point error in the interpolation
|
| 192 |
+
# see discussion at https://github.com/facebookresearch/dino/issues/8
|
| 193 |
+
# DINOv2 with register modify the interpolate_offset from 0.1 to 0.0
|
| 194 |
+
w0, h0 = w0 + self.interpolate_offset, h0 + self.interpolate_offset
|
| 195 |
+
# w0, h0 = w0 + 0.1, h0 + 0.1
|
| 196 |
+
|
| 197 |
+
sqrt_N = math.sqrt(N)
|
| 198 |
+
sx, sy = float(w0) / sqrt_N, float(h0) / sqrt_N
|
| 199 |
+
patch_pos_embed = nn.functional.interpolate(
|
| 200 |
+
patch_pos_embed.reshape(1, int(sqrt_N), int(sqrt_N), dim).permute(0, 3, 1, 2),
|
| 201 |
+
scale_factor=(sx, sy),
|
| 202 |
+
# (int(w0), int(h0)), # to solve the upsampling shape issue
|
| 203 |
+
mode="bicubic",
|
| 204 |
+
antialias=self.interpolate_antialias
|
| 205 |
+
)
|
| 206 |
+
|
| 207 |
+
assert int(w0) == patch_pos_embed.shape[-2]
|
| 208 |
+
assert int(h0) == patch_pos_embed.shape[-1]
|
| 209 |
+
patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
|
| 210 |
+
return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
|
| 211 |
+
|
| 212 |
+
def prepare_tokens_with_masks(self, x, masks=None):
|
| 213 |
+
B, nc, w, h = x.shape
|
| 214 |
+
x = self.patch_embed(x)
|
| 215 |
+
if masks is not None:
|
| 216 |
+
x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
|
| 217 |
+
|
| 218 |
+
x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
|
| 219 |
+
x = x + self.interpolate_pos_encoding(x, w, h)
|
| 220 |
+
|
| 221 |
+
if self.register_tokens is not None:
|
| 222 |
+
x = torch.cat(
|
| 223 |
+
(
|
| 224 |
+
x[:, :1],
|
| 225 |
+
self.register_tokens.expand(x.shape[0], -1, -1),
|
| 226 |
+
x[:, 1:],
|
| 227 |
+
),
|
| 228 |
+
dim=1,
|
| 229 |
+
)
|
| 230 |
+
|
| 231 |
+
return x
|
| 232 |
+
|
| 233 |
+
def forward_features_list(self, x_list, masks_list):
|
| 234 |
+
x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
|
| 235 |
+
for blk in self.blocks:
|
| 236 |
+
x = blk(x)
|
| 237 |
+
|
| 238 |
+
all_x = x
|
| 239 |
+
output = []
|
| 240 |
+
for x, masks in zip(all_x, masks_list):
|
| 241 |
+
x_norm = self.norm(x)
|
| 242 |
+
output.append(
|
| 243 |
+
{
|
| 244 |
+
"x_norm_clstoken": x_norm[:, 0],
|
| 245 |
+
"x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
|
| 246 |
+
"x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
|
| 247 |
+
"x_prenorm": x,
|
| 248 |
+
"masks": masks,
|
| 249 |
+
}
|
| 250 |
+
)
|
| 251 |
+
return output
|
| 252 |
+
|
| 253 |
+
def forward_features(self, x, masks=None):
|
| 254 |
+
if isinstance(x, list):
|
| 255 |
+
return self.forward_features_list(x, masks)
|
| 256 |
+
|
| 257 |
+
x = self.prepare_tokens_with_masks(x, masks)
|
| 258 |
+
|
| 259 |
+
for blk in self.blocks:
|
| 260 |
+
x = blk(x)
|
| 261 |
+
|
| 262 |
+
x_norm = self.norm(x)
|
| 263 |
+
return {
|
| 264 |
+
"x_norm_clstoken": x_norm[:, 0],
|
| 265 |
+
"x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
|
| 266 |
+
"x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
|
| 267 |
+
"x_prenorm": x,
|
| 268 |
+
"masks": masks,
|
| 269 |
+
}
|
| 270 |
+
|
| 271 |
+
def _get_intermediate_layers_not_chunked(self, x, n=1):
|
| 272 |
+
x = self.prepare_tokens_with_masks(x)
|
| 273 |
+
# If n is an int, take the n last blocks. If it's a list, take them
|
| 274 |
+
output, total_block_len = [], len(self.blocks)
|
| 275 |
+
blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
|
| 276 |
+
for i, blk in enumerate(self.blocks):
|
| 277 |
+
x = blk(x)
|
| 278 |
+
if i in blocks_to_take:
|
| 279 |
+
output.append(x)
|
| 280 |
+
assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
|
| 281 |
+
return output
|
| 282 |
+
|
| 283 |
+
def _get_intermediate_layers_chunked(self, x, n=1):
|
| 284 |
+
x = self.prepare_tokens_with_masks(x)
|
| 285 |
+
output, i, total_block_len = [], 0, len(self.blocks[-1])
|
| 286 |
+
# If n is an int, take the n last blocks. If it's a list, take them
|
| 287 |
+
blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
|
| 288 |
+
for block_chunk in self.blocks:
|
| 289 |
+
for blk in block_chunk[i:]: # Passing the nn.Identity()
|
| 290 |
+
x = blk(x)
|
| 291 |
+
if i in blocks_to_take:
|
| 292 |
+
output.append(x)
|
| 293 |
+
i += 1
|
| 294 |
+
assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
|
| 295 |
+
return output
|
| 296 |
+
|
| 297 |
+
def get_intermediate_layers(
|
| 298 |
+
self,
|
| 299 |
+
x: torch.Tensor,
|
| 300 |
+
n: Union[int, Sequence] = 1, # Layers or n last layers to take
|
| 301 |
+
reshape: bool = False,
|
| 302 |
+
return_class_token: bool = False,
|
| 303 |
+
norm=True
|
| 304 |
+
) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
|
| 305 |
+
if self.chunked_blocks:
|
| 306 |
+
outputs = self._get_intermediate_layers_chunked(x, n)
|
| 307 |
+
else:
|
| 308 |
+
outputs = self._get_intermediate_layers_not_chunked(x, n)
|
| 309 |
+
if norm:
|
| 310 |
+
outputs = [self.norm(out) for out in outputs]
|
| 311 |
+
class_tokens = [out[:, 0] for out in outputs]
|
| 312 |
+
outputs = [out[:, 1 + self.num_register_tokens:] for out in outputs]
|
| 313 |
+
if reshape:
|
| 314 |
+
B, _, w, h = x.shape
|
| 315 |
+
outputs = [
|
| 316 |
+
out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
|
| 317 |
+
for out in outputs
|
| 318 |
+
]
|
| 319 |
+
if return_class_token:
|
| 320 |
+
return tuple(zip(outputs, class_tokens))
|
| 321 |
+
return tuple(outputs)
|
| 322 |
+
|
| 323 |
+
def forward(self, *args, is_training=False, **kwargs):
|
| 324 |
+
ret = self.forward_features(*args, **kwargs)
|
| 325 |
+
if is_training:
|
| 326 |
+
return ret
|
| 327 |
+
else:
|
| 328 |
+
return self.head(ret["x_norm_clstoken"])
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
def init_weights_vit_timm(module: nn.Module, name: str = ""):
|
| 332 |
+
"""ViT weight initialization, original timm impl (for reproducibility)"""
|
| 333 |
+
if isinstance(module, nn.Linear):
|
| 334 |
+
trunc_normal_(module.weight, std=0.02)
|
| 335 |
+
if module.bias is not None:
|
| 336 |
+
nn.init.zeros_(module.bias)
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
|
| 340 |
+
model = DinoVisionTransformer(
|
| 341 |
+
patch_size=patch_size,
|
| 342 |
+
embed_dim=384,
|
| 343 |
+
depth=12,
|
| 344 |
+
num_heads=6,
|
| 345 |
+
mlp_ratio=4,
|
| 346 |
+
block_fn=partial(Block, attn_class=MemEffAttention),
|
| 347 |
+
num_register_tokens=num_register_tokens,
|
| 348 |
+
**kwargs,
|
| 349 |
+
)
|
| 350 |
+
return model
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
|
| 354 |
+
model = DinoVisionTransformer(
|
| 355 |
+
patch_size=patch_size,
|
| 356 |
+
embed_dim=768,
|
| 357 |
+
depth=12,
|
| 358 |
+
num_heads=12,
|
| 359 |
+
mlp_ratio=4,
|
| 360 |
+
block_fn=partial(Block, attn_class=MemEffAttention),
|
| 361 |
+
num_register_tokens=num_register_tokens,
|
| 362 |
+
**kwargs,
|
| 363 |
+
)
|
| 364 |
+
return model
|
| 365 |
+
|
| 366 |
+
|
| 367 |
+
def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
|
| 368 |
+
model = DinoVisionTransformer(
|
| 369 |
+
patch_size=patch_size,
|
| 370 |
+
embed_dim=1024,
|
| 371 |
+
depth=24,
|
| 372 |
+
num_heads=16,
|
| 373 |
+
mlp_ratio=4,
|
| 374 |
+
block_fn=partial(Block, attn_class=MemEffAttention),
|
| 375 |
+
num_register_tokens=num_register_tokens,
|
| 376 |
+
**kwargs,
|
| 377 |
+
)
|
| 378 |
+
return model
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
|
| 382 |
+
"""
|
| 383 |
+
Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
|
| 384 |
+
"""
|
| 385 |
+
model = DinoVisionTransformer(
|
| 386 |
+
patch_size=patch_size,
|
| 387 |
+
embed_dim=1536,
|
| 388 |
+
depth=40,
|
| 389 |
+
num_heads=24,
|
| 390 |
+
mlp_ratio=4,
|
| 391 |
+
block_fn=partial(Block, attn_class=MemEffAttention),
|
| 392 |
+
num_register_tokens=num_register_tokens,
|
| 393 |
+
**kwargs,
|
| 394 |
+
)
|
| 395 |
+
return model
|
| 396 |
+
|
| 397 |
+
|
| 398 |
+
def DINOv2(model_name):
|
| 399 |
+
model_zoo = {
|
| 400 |
+
"vits": vit_small,
|
| 401 |
+
"vitb": vit_base,
|
| 402 |
+
"vitl": vit_large,
|
| 403 |
+
"vitg": vit_giant2
|
| 404 |
+
}
|
| 405 |
+
|
| 406 |
+
return model_zoo[model_name](
|
| 407 |
+
img_size=518,
|
| 408 |
+
patch_size=14,
|
| 409 |
+
init_values=1.0,
|
| 410 |
+
ffn_layer="mlp" if model_name != "vitg" else "swiglufused",
|
| 411 |
+
block_chunks=0,
|
| 412 |
+
num_register_tokens=0,
|
| 413 |
+
interpolate_antialias=False,
|
| 414 |
+
interpolate_offset=0.1
|
| 415 |
+
)
|
code_depth/video_depth_anything/dinov2_layers/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
from .mlp import Mlp
|
| 8 |
+
from .patch_embed import PatchEmbed
|
| 9 |
+
from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
|
| 10 |
+
from .block import NestedTensorBlock
|
| 11 |
+
from .attention import MemEffAttention
|
code_depth/video_depth_anything/dinov2_layers/attention.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
# References:
|
| 8 |
+
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
|
| 9 |
+
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
|
| 10 |
+
|
| 11 |
+
import logging
|
| 12 |
+
|
| 13 |
+
from torch import Tensor
|
| 14 |
+
from torch import nn
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger("dinov2")
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
try:
|
| 21 |
+
from xformers.ops import memory_efficient_attention, unbind, fmha
|
| 22 |
+
|
| 23 |
+
XFORMERS_AVAILABLE = True
|
| 24 |
+
except ImportError:
|
| 25 |
+
logger.warning("xFormers not available")
|
| 26 |
+
XFORMERS_AVAILABLE = False
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class Attention(nn.Module):
|
| 30 |
+
def __init__(
|
| 31 |
+
self,
|
| 32 |
+
dim: int,
|
| 33 |
+
num_heads: int = 8,
|
| 34 |
+
qkv_bias: bool = False,
|
| 35 |
+
proj_bias: bool = True,
|
| 36 |
+
attn_drop: float = 0.0,
|
| 37 |
+
proj_drop: float = 0.0,
|
| 38 |
+
) -> None:
|
| 39 |
+
super().__init__()
|
| 40 |
+
self.num_heads = num_heads
|
| 41 |
+
head_dim = dim // num_heads
|
| 42 |
+
self.scale = head_dim**-0.5
|
| 43 |
+
|
| 44 |
+
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
| 45 |
+
self.attn_drop = nn.Dropout(attn_drop)
|
| 46 |
+
self.proj = nn.Linear(dim, dim, bias=proj_bias)
|
| 47 |
+
self.proj_drop = nn.Dropout(proj_drop)
|
| 48 |
+
|
| 49 |
+
def forward(self, x: Tensor) -> Tensor:
|
| 50 |
+
B, N, C = x.shape
|
| 51 |
+
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
|
| 52 |
+
|
| 53 |
+
q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
|
| 54 |
+
attn = q @ k.transpose(-2, -1)
|
| 55 |
+
|
| 56 |
+
attn = attn.softmax(dim=-1)
|
| 57 |
+
attn = self.attn_drop(attn)
|
| 58 |
+
|
| 59 |
+
x = (attn @ v).transpose(1, 2).reshape(B, N, C)
|
| 60 |
+
x = self.proj(x)
|
| 61 |
+
x = self.proj_drop(x)
|
| 62 |
+
return x
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class MemEffAttention(Attention):
|
| 66 |
+
def forward(self, x: Tensor, attn_bias=None) -> Tensor:
|
| 67 |
+
if not XFORMERS_AVAILABLE:
|
| 68 |
+
assert attn_bias is None, "xFormers is required for nested tensors usage"
|
| 69 |
+
return super().forward(x)
|
| 70 |
+
|
| 71 |
+
B, N, C = x.shape
|
| 72 |
+
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
|
| 73 |
+
|
| 74 |
+
q, k, v = unbind(qkv, 2)
|
| 75 |
+
|
| 76 |
+
x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
|
| 77 |
+
x = x.reshape([B, N, C])
|
| 78 |
+
|
| 79 |
+
x = self.proj(x)
|
| 80 |
+
x = self.proj_drop(x)
|
| 81 |
+
return x
|
| 82 |
+
|
| 83 |
+
|
code_depth/video_depth_anything/dinov2_layers/block.py
ADDED
|
@@ -0,0 +1,252 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
# References:
|
| 8 |
+
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
|
| 9 |
+
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
|
| 10 |
+
|
| 11 |
+
import logging
|
| 12 |
+
from typing import Callable, List, Any, Tuple, Dict
|
| 13 |
+
|
| 14 |
+
import torch
|
| 15 |
+
from torch import nn, Tensor
|
| 16 |
+
|
| 17 |
+
from .attention import Attention, MemEffAttention
|
| 18 |
+
from .drop_path import DropPath
|
| 19 |
+
from .layer_scale import LayerScale
|
| 20 |
+
from .mlp import Mlp
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
logger = logging.getLogger("dinov2")
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
try:
|
| 27 |
+
from xformers.ops import fmha
|
| 28 |
+
from xformers.ops import scaled_index_add, index_select_cat
|
| 29 |
+
|
| 30 |
+
XFORMERS_AVAILABLE = True
|
| 31 |
+
except ImportError:
|
| 32 |
+
logger.warning("xFormers not available")
|
| 33 |
+
XFORMERS_AVAILABLE = False
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class Block(nn.Module):
|
| 37 |
+
def __init__(
|
| 38 |
+
self,
|
| 39 |
+
dim: int,
|
| 40 |
+
num_heads: int,
|
| 41 |
+
mlp_ratio: float = 4.0,
|
| 42 |
+
qkv_bias: bool = False,
|
| 43 |
+
proj_bias: bool = True,
|
| 44 |
+
ffn_bias: bool = True,
|
| 45 |
+
drop: float = 0.0,
|
| 46 |
+
attn_drop: float = 0.0,
|
| 47 |
+
init_values=None,
|
| 48 |
+
drop_path: float = 0.0,
|
| 49 |
+
act_layer: Callable[..., nn.Module] = nn.GELU,
|
| 50 |
+
norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
|
| 51 |
+
attn_class: Callable[..., nn.Module] = Attention,
|
| 52 |
+
ffn_layer: Callable[..., nn.Module] = Mlp,
|
| 53 |
+
) -> None:
|
| 54 |
+
super().__init__()
|
| 55 |
+
# print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
|
| 56 |
+
self.norm1 = norm_layer(dim)
|
| 57 |
+
self.attn = attn_class(
|
| 58 |
+
dim,
|
| 59 |
+
num_heads=num_heads,
|
| 60 |
+
qkv_bias=qkv_bias,
|
| 61 |
+
proj_bias=proj_bias,
|
| 62 |
+
attn_drop=attn_drop,
|
| 63 |
+
proj_drop=drop,
|
| 64 |
+
)
|
| 65 |
+
self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
|
| 66 |
+
self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
|
| 67 |
+
|
| 68 |
+
self.norm2 = norm_layer(dim)
|
| 69 |
+
mlp_hidden_dim = int(dim * mlp_ratio)
|
| 70 |
+
self.mlp = ffn_layer(
|
| 71 |
+
in_features=dim,
|
| 72 |
+
hidden_features=mlp_hidden_dim,
|
| 73 |
+
act_layer=act_layer,
|
| 74 |
+
drop=drop,
|
| 75 |
+
bias=ffn_bias,
|
| 76 |
+
)
|
| 77 |
+
self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
|
| 78 |
+
self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
|
| 79 |
+
|
| 80 |
+
self.sample_drop_ratio = drop_path
|
| 81 |
+
|
| 82 |
+
def forward(self, x: Tensor) -> Tensor:
|
| 83 |
+
def attn_residual_func(x: Tensor) -> Tensor:
|
| 84 |
+
return self.ls1(self.attn(self.norm1(x)))
|
| 85 |
+
|
| 86 |
+
def ffn_residual_func(x: Tensor) -> Tensor:
|
| 87 |
+
return self.ls2(self.mlp(self.norm2(x)))
|
| 88 |
+
|
| 89 |
+
if self.training and self.sample_drop_ratio > 0.1:
|
| 90 |
+
# the overhead is compensated only for a drop path rate larger than 0.1
|
| 91 |
+
x = drop_add_residual_stochastic_depth(
|
| 92 |
+
x,
|
| 93 |
+
residual_func=attn_residual_func,
|
| 94 |
+
sample_drop_ratio=self.sample_drop_ratio,
|
| 95 |
+
)
|
| 96 |
+
x = drop_add_residual_stochastic_depth(
|
| 97 |
+
x,
|
| 98 |
+
residual_func=ffn_residual_func,
|
| 99 |
+
sample_drop_ratio=self.sample_drop_ratio,
|
| 100 |
+
)
|
| 101 |
+
elif self.training and self.sample_drop_ratio > 0.0:
|
| 102 |
+
x = x + self.drop_path1(attn_residual_func(x))
|
| 103 |
+
x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2
|
| 104 |
+
else:
|
| 105 |
+
x = x + attn_residual_func(x)
|
| 106 |
+
x = x + ffn_residual_func(x)
|
| 107 |
+
return x
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def drop_add_residual_stochastic_depth(
|
| 111 |
+
x: Tensor,
|
| 112 |
+
residual_func: Callable[[Tensor], Tensor],
|
| 113 |
+
sample_drop_ratio: float = 0.0,
|
| 114 |
+
) -> Tensor:
|
| 115 |
+
# 1) extract subset using permutation
|
| 116 |
+
b, n, d = x.shape
|
| 117 |
+
sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
|
| 118 |
+
brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
|
| 119 |
+
x_subset = x[brange]
|
| 120 |
+
|
| 121 |
+
# 2) apply residual_func to get residual
|
| 122 |
+
residual = residual_func(x_subset)
|
| 123 |
+
|
| 124 |
+
x_flat = x.flatten(1)
|
| 125 |
+
residual = residual.flatten(1)
|
| 126 |
+
|
| 127 |
+
residual_scale_factor = b / sample_subset_size
|
| 128 |
+
|
| 129 |
+
# 3) add the residual
|
| 130 |
+
x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
|
| 131 |
+
return x_plus_residual.view_as(x)
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def get_branges_scales(x, sample_drop_ratio=0.0):
|
| 135 |
+
b, n, d = x.shape
|
| 136 |
+
sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
|
| 137 |
+
brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
|
| 138 |
+
residual_scale_factor = b / sample_subset_size
|
| 139 |
+
return brange, residual_scale_factor
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
|
| 143 |
+
if scaling_vector is None:
|
| 144 |
+
x_flat = x.flatten(1)
|
| 145 |
+
residual = residual.flatten(1)
|
| 146 |
+
x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
|
| 147 |
+
else:
|
| 148 |
+
x_plus_residual = scaled_index_add(
|
| 149 |
+
x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
|
| 150 |
+
)
|
| 151 |
+
return x_plus_residual
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
attn_bias_cache: Dict[Tuple, Any] = {}
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def get_attn_bias_and_cat(x_list, branges=None):
|
| 158 |
+
"""
|
| 159 |
+
this will perform the index select, cat the tensors, and provide the attn_bias from cache
|
| 160 |
+
"""
|
| 161 |
+
batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
|
| 162 |
+
all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
|
| 163 |
+
if all_shapes not in attn_bias_cache.keys():
|
| 164 |
+
seqlens = []
|
| 165 |
+
for b, x in zip(batch_sizes, x_list):
|
| 166 |
+
for _ in range(b):
|
| 167 |
+
seqlens.append(x.shape[1])
|
| 168 |
+
attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
|
| 169 |
+
attn_bias._batch_sizes = batch_sizes
|
| 170 |
+
attn_bias_cache[all_shapes] = attn_bias
|
| 171 |
+
|
| 172 |
+
if branges is not None:
|
| 173 |
+
cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
|
| 174 |
+
else:
|
| 175 |
+
tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
|
| 176 |
+
cat_tensors = torch.cat(tensors_bs1, dim=1)
|
| 177 |
+
|
| 178 |
+
return attn_bias_cache[all_shapes], cat_tensors
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
def drop_add_residual_stochastic_depth_list(
|
| 182 |
+
x_list: List[Tensor],
|
| 183 |
+
residual_func: Callable[[Tensor, Any], Tensor],
|
| 184 |
+
sample_drop_ratio: float = 0.0,
|
| 185 |
+
scaling_vector=None,
|
| 186 |
+
) -> Tensor:
|
| 187 |
+
# 1) generate random set of indices for dropping samples in the batch
|
| 188 |
+
branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
|
| 189 |
+
branges = [s[0] for s in branges_scales]
|
| 190 |
+
residual_scale_factors = [s[1] for s in branges_scales]
|
| 191 |
+
|
| 192 |
+
# 2) get attention bias and index+concat the tensors
|
| 193 |
+
attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)
|
| 194 |
+
|
| 195 |
+
# 3) apply residual_func to get residual, and split the result
|
| 196 |
+
residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias)) # type: ignore
|
| 197 |
+
|
| 198 |
+
outputs = []
|
| 199 |
+
for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
|
| 200 |
+
outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
|
| 201 |
+
return outputs
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
class NestedTensorBlock(Block):
|
| 205 |
+
def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
|
| 206 |
+
"""
|
| 207 |
+
x_list contains a list of tensors to nest together and run
|
| 208 |
+
"""
|
| 209 |
+
assert isinstance(self.attn, MemEffAttention)
|
| 210 |
+
|
| 211 |
+
if self.training and self.sample_drop_ratio > 0.0:
|
| 212 |
+
|
| 213 |
+
def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
|
| 214 |
+
return self.attn(self.norm1(x), attn_bias=attn_bias)
|
| 215 |
+
|
| 216 |
+
def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
|
| 217 |
+
return self.mlp(self.norm2(x))
|
| 218 |
+
|
| 219 |
+
x_list = drop_add_residual_stochastic_depth_list(
|
| 220 |
+
x_list,
|
| 221 |
+
residual_func=attn_residual_func,
|
| 222 |
+
sample_drop_ratio=self.sample_drop_ratio,
|
| 223 |
+
scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
|
| 224 |
+
)
|
| 225 |
+
x_list = drop_add_residual_stochastic_depth_list(
|
| 226 |
+
x_list,
|
| 227 |
+
residual_func=ffn_residual_func,
|
| 228 |
+
sample_drop_ratio=self.sample_drop_ratio,
|
| 229 |
+
scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None,
|
| 230 |
+
)
|
| 231 |
+
return x_list
|
| 232 |
+
else:
|
| 233 |
+
|
| 234 |
+
def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
|
| 235 |
+
return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))
|
| 236 |
+
|
| 237 |
+
def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
|
| 238 |
+
return self.ls2(self.mlp(self.norm2(x)))
|
| 239 |
+
|
| 240 |
+
attn_bias, x = get_attn_bias_and_cat(x_list)
|
| 241 |
+
x = x + attn_residual_func(x, attn_bias=attn_bias)
|
| 242 |
+
x = x + ffn_residual_func(x)
|
| 243 |
+
return attn_bias.split(x)
|
| 244 |
+
|
| 245 |
+
def forward(self, x_or_x_list):
|
| 246 |
+
if isinstance(x_or_x_list, Tensor):
|
| 247 |
+
return super().forward(x_or_x_list)
|
| 248 |
+
elif isinstance(x_or_x_list, list):
|
| 249 |
+
assert XFORMERS_AVAILABLE, "Please install xFormers for nested tensors usage"
|
| 250 |
+
return self.forward_nested(x_or_x_list)
|
| 251 |
+
else:
|
| 252 |
+
raise AssertionError
|
code_depth/video_depth_anything/dinov2_layers/drop_path.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
# References:
|
| 8 |
+
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
|
| 9 |
+
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
from torch import nn
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def drop_path(x, drop_prob: float = 0.0, training: bool = False):
|
| 16 |
+
if drop_prob == 0.0 or not training:
|
| 17 |
+
return x
|
| 18 |
+
keep_prob = 1 - drop_prob
|
| 19 |
+
shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
|
| 20 |
+
random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
|
| 21 |
+
if keep_prob > 0.0:
|
| 22 |
+
random_tensor.div_(keep_prob)
|
| 23 |
+
output = x * random_tensor
|
| 24 |
+
return output
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class DropPath(nn.Module):
|
| 28 |
+
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
|
| 29 |
+
|
| 30 |
+
def __init__(self, drop_prob=None):
|
| 31 |
+
super(DropPath, self).__init__()
|
| 32 |
+
self.drop_prob = drop_prob
|
| 33 |
+
|
| 34 |
+
def forward(self, x):
|
| 35 |
+
return drop_path(x, self.drop_prob, self.training)
|
code_depth/video_depth_anything/dinov2_layers/layer_scale.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
# Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110
|
| 8 |
+
|
| 9 |
+
from typing import Union
|
| 10 |
+
|
| 11 |
+
import torch
|
| 12 |
+
from torch import Tensor
|
| 13 |
+
from torch import nn
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class LayerScale(nn.Module):
|
| 17 |
+
def __init__(
|
| 18 |
+
self,
|
| 19 |
+
dim: int,
|
| 20 |
+
init_values: Union[float, Tensor] = 1e-5,
|
| 21 |
+
inplace: bool = False,
|
| 22 |
+
) -> None:
|
| 23 |
+
super().__init__()
|
| 24 |
+
self.inplace = inplace
|
| 25 |
+
self.gamma = nn.Parameter(init_values * torch.ones(dim))
|
| 26 |
+
|
| 27 |
+
def forward(self, x: Tensor) -> Tensor:
|
| 28 |
+
return x.mul_(self.gamma) if self.inplace else x * self.gamma
|
code_depth/video_depth_anything/dinov2_layers/mlp.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
# References:
|
| 8 |
+
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
|
| 9 |
+
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
from typing import Callable, Optional
|
| 13 |
+
|
| 14 |
+
from torch import Tensor, nn
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class Mlp(nn.Module):
|
| 18 |
+
def __init__(
|
| 19 |
+
self,
|
| 20 |
+
in_features: int,
|
| 21 |
+
hidden_features: Optional[int] = None,
|
| 22 |
+
out_features: Optional[int] = None,
|
| 23 |
+
act_layer: Callable[..., nn.Module] = nn.GELU,
|
| 24 |
+
drop: float = 0.0,
|
| 25 |
+
bias: bool = True,
|
| 26 |
+
) -> None:
|
| 27 |
+
super().__init__()
|
| 28 |
+
out_features = out_features or in_features
|
| 29 |
+
hidden_features = hidden_features or in_features
|
| 30 |
+
self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
|
| 31 |
+
self.act = act_layer()
|
| 32 |
+
self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
|
| 33 |
+
self.drop = nn.Dropout(drop)
|
| 34 |
+
|
| 35 |
+
def forward(self, x: Tensor) -> Tensor:
|
| 36 |
+
x = self.fc1(x)
|
| 37 |
+
x = self.act(x)
|
| 38 |
+
x = self.drop(x)
|
| 39 |
+
x = self.fc2(x)
|
| 40 |
+
x = self.drop(x)
|
| 41 |
+
return x
|
code_depth/video_depth_anything/dinov2_layers/patch_embed.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
# References:
|
| 8 |
+
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
|
| 9 |
+
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
|
| 10 |
+
|
| 11 |
+
from typing import Callable, Optional, Tuple, Union
|
| 12 |
+
|
| 13 |
+
from torch import Tensor
|
| 14 |
+
import torch.nn as nn
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def make_2tuple(x):
|
| 18 |
+
if isinstance(x, tuple):
|
| 19 |
+
assert len(x) == 2
|
| 20 |
+
return x
|
| 21 |
+
|
| 22 |
+
assert isinstance(x, int)
|
| 23 |
+
return (x, x)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class PatchEmbed(nn.Module):
|
| 27 |
+
"""
|
| 28 |
+
2D image to patch embedding: (B,C,H,W) -> (B,N,D)
|
| 29 |
+
|
| 30 |
+
Args:
|
| 31 |
+
img_size: Image size.
|
| 32 |
+
patch_size: Patch token size.
|
| 33 |
+
in_chans: Number of input image channels.
|
| 34 |
+
embed_dim: Number of linear projection output channels.
|
| 35 |
+
norm_layer: Normalization layer.
|
| 36 |
+
"""
|
| 37 |
+
|
| 38 |
+
def __init__(
|
| 39 |
+
self,
|
| 40 |
+
img_size: Union[int, Tuple[int, int]] = 224,
|
| 41 |
+
patch_size: Union[int, Tuple[int, int]] = 16,
|
| 42 |
+
in_chans: int = 3,
|
| 43 |
+
embed_dim: int = 768,
|
| 44 |
+
norm_layer: Optional[Callable] = None,
|
| 45 |
+
flatten_embedding: bool = True,
|
| 46 |
+
) -> None:
|
| 47 |
+
super().__init__()
|
| 48 |
+
|
| 49 |
+
image_HW = make_2tuple(img_size)
|
| 50 |
+
patch_HW = make_2tuple(patch_size)
|
| 51 |
+
patch_grid_size = (
|
| 52 |
+
image_HW[0] // patch_HW[0],
|
| 53 |
+
image_HW[1] // patch_HW[1],
|
| 54 |
+
)
|
| 55 |
+
|
| 56 |
+
self.img_size = image_HW
|
| 57 |
+
self.patch_size = patch_HW
|
| 58 |
+
self.patches_resolution = patch_grid_size
|
| 59 |
+
self.num_patches = patch_grid_size[0] * patch_grid_size[1]
|
| 60 |
+
|
| 61 |
+
self.in_chans = in_chans
|
| 62 |
+
self.embed_dim = embed_dim
|
| 63 |
+
|
| 64 |
+
self.flatten_embedding = flatten_embedding
|
| 65 |
+
|
| 66 |
+
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
|
| 67 |
+
self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
|
| 68 |
+
|
| 69 |
+
def forward(self, x: Tensor) -> Tensor:
|
| 70 |
+
_, _, H, W = x.shape
|
| 71 |
+
patch_H, patch_W = self.patch_size
|
| 72 |
+
|
| 73 |
+
assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
|
| 74 |
+
assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"
|
| 75 |
+
|
| 76 |
+
x = self.proj(x) # B C H W
|
| 77 |
+
H, W = x.size(2), x.size(3)
|
| 78 |
+
x = x.flatten(2).transpose(1, 2) # B HW C
|
| 79 |
+
x = self.norm(x)
|
| 80 |
+
if not self.flatten_embedding:
|
| 81 |
+
x = x.reshape(-1, H, W, self.embed_dim) # B H W C
|
| 82 |
+
return x
|
| 83 |
+
|
| 84 |
+
def flops(self) -> float:
|
| 85 |
+
Ho, Wo = self.patches_resolution
|
| 86 |
+
flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
|
| 87 |
+
if self.norm is not None:
|
| 88 |
+
flops += Ho * Wo * self.embed_dim
|
| 89 |
+
return flops
|
code_depth/video_depth_anything/dinov2_layers/swiglu_ffn.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from typing import Callable, Optional

from torch import Tensor, nn
import torch.nn.functional as F


class SwiGLUFFN(nn.Module):
    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = None,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
        self.w3 = nn.Linear(hidden_features, out_features, bias=bias)

    def forward(self, x: Tensor) -> Tensor:
        x12 = self.w12(x)
        x1, x2 = x12.chunk(2, dim=-1)
        hidden = F.silu(x1) * x2
        return self.w3(hidden)


try:
    from xformers.ops import SwiGLU

    XFORMERS_AVAILABLE = True
except ImportError:
    SwiGLU = SwiGLUFFN
    XFORMERS_AVAILABLE = False


class SwiGLUFFNFused(SwiGLU):
    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = None,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
        super().__init__(
            in_features=in_features,
            hidden_features=hidden_features,
            out_features=out_features,
            bias=bias,
        )
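A quick sanity check of the SwiGLU pieces above, using only the pure-PyTorch fallback path (no xformers). The widths are illustrative assumptions; the one fixed piece of arithmetic is how SwiGLUFFNFused shrinks the hidden width to 2/3 and rounds it up to a multiple of 8 before delegating to the base class.

import torch
from video_depth_anything.dinov2_layers.swiglu_ffn import SwiGLUFFN   # assumed import path

dim = 384
hidden = 4 * dim                                   # a typical transformer MLP width, assumed here
fused_hidden = (int(hidden * 2 / 3) + 7) // 8 * 8
print(fused_hidden)                                # 1024 for hidden = 1536

ffn = SwiGLUFFN(in_features=dim, hidden_features=fused_hidden)
out = ffn(torch.randn(2, 10, dim))                 # (batch, tokens, dim) in, same shape out
print(out.shape)                                   # torch.Size([2, 10, 384])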
code_depth/video_depth_anything/dpt.py
ADDED
|
@@ -0,0 +1,160 @@
| 1 |
+
# Copyright (2025) Bytedance Ltd. and/or its affiliates
|
| 2 |
+
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
import torch
|
| 15 |
+
import torch.nn as nn
|
| 16 |
+
import torch.nn.functional as F
|
| 17 |
+
|
| 18 |
+
from .util.blocks import FeatureFusionBlock, _make_scratch
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _make_fusion_block(features, use_bn, size=None):
|
| 22 |
+
return FeatureFusionBlock(
|
| 23 |
+
features,
|
| 24 |
+
nn.ReLU(False),
|
| 25 |
+
deconv=False,
|
| 26 |
+
bn=use_bn,
|
| 27 |
+
expand=False,
|
| 28 |
+
align_corners=True,
|
| 29 |
+
size=size,
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class ConvBlock(nn.Module):
|
| 34 |
+
def __init__(self, in_feature, out_feature):
|
| 35 |
+
super().__init__()
|
| 36 |
+
|
| 37 |
+
self.conv_block = nn.Sequential(
|
| 38 |
+
nn.Conv2d(in_feature, out_feature, kernel_size=3, stride=1, padding=1),
|
| 39 |
+
nn.BatchNorm2d(out_feature),
|
| 40 |
+
nn.ReLU(True)
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
def forward(self, x):
|
| 44 |
+
return self.conv_block(x)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class DPTHead(nn.Module):
|
| 48 |
+
def __init__(
|
| 49 |
+
self,
|
| 50 |
+
in_channels,
|
| 51 |
+
features=256,
|
| 52 |
+
use_bn=False,
|
| 53 |
+
out_channels=[256, 512, 1024, 1024],
|
| 54 |
+
use_clstoken=False
|
| 55 |
+
):
|
| 56 |
+
super(DPTHead, self).__init__()
|
| 57 |
+
|
| 58 |
+
self.use_clstoken = use_clstoken
|
| 59 |
+
|
| 60 |
+
self.projects = nn.ModuleList([
|
| 61 |
+
nn.Conv2d(
|
| 62 |
+
in_channels=in_channels,
|
| 63 |
+
out_channels=out_channel,
|
| 64 |
+
kernel_size=1,
|
| 65 |
+
stride=1,
|
| 66 |
+
padding=0,
|
| 67 |
+
) for out_channel in out_channels
|
| 68 |
+
])
|
| 69 |
+
|
| 70 |
+
self.resize_layers = nn.ModuleList([
|
| 71 |
+
nn.ConvTranspose2d(
|
| 72 |
+
in_channels=out_channels[0],
|
| 73 |
+
out_channels=out_channels[0],
|
| 74 |
+
kernel_size=4,
|
| 75 |
+
stride=4,
|
| 76 |
+
padding=0),
|
| 77 |
+
nn.ConvTranspose2d(
|
| 78 |
+
in_channels=out_channels[1],
|
| 79 |
+
out_channels=out_channels[1],
|
| 80 |
+
kernel_size=2,
|
| 81 |
+
stride=2,
|
| 82 |
+
padding=0),
|
| 83 |
+
nn.Identity(),
|
| 84 |
+
nn.Conv2d(
|
| 85 |
+
in_channels=out_channels[3],
|
| 86 |
+
out_channels=out_channels[3],
|
| 87 |
+
kernel_size=3,
|
| 88 |
+
stride=2,
|
| 89 |
+
padding=1)
|
| 90 |
+
])
|
| 91 |
+
|
| 92 |
+
if use_clstoken:
|
| 93 |
+
self.readout_projects = nn.ModuleList()
|
| 94 |
+
for _ in range(len(self.projects)):
|
| 95 |
+
self.readout_projects.append(
|
| 96 |
+
nn.Sequential(
|
| 97 |
+
nn.Linear(2 * in_channels, in_channels),
|
| 98 |
+
nn.GELU()))
|
| 99 |
+
|
| 100 |
+
self.scratch = _make_scratch(
|
| 101 |
+
out_channels,
|
| 102 |
+
features,
|
| 103 |
+
groups=1,
|
| 104 |
+
expand=False,
|
| 105 |
+
)
|
| 106 |
+
|
| 107 |
+
self.scratch.stem_transpose = None
|
| 108 |
+
|
| 109 |
+
self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
|
| 110 |
+
self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
|
| 111 |
+
self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
|
| 112 |
+
self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
|
| 113 |
+
|
| 114 |
+
head_features_1 = features
|
| 115 |
+
head_features_2 = 32
|
| 116 |
+
|
| 117 |
+
self.scratch.output_conv1 = nn.Conv2d(head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1)
|
| 118 |
+
self.scratch.output_conv2 = nn.Sequential(
|
| 119 |
+
nn.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1),
|
| 120 |
+
nn.ReLU(True),
|
| 121 |
+
nn.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0),
|
| 122 |
+
nn.ReLU(True),
|
| 123 |
+
nn.Identity(),
|
| 124 |
+
)
|
| 125 |
+
|
| 126 |
+
def forward(self, out_features, patch_h, patch_w):
|
| 127 |
+
out = []
|
| 128 |
+
for i, x in enumerate(out_features):
|
| 129 |
+
if self.use_clstoken:
|
| 130 |
+
x, cls_token = x[0], x[1]
|
| 131 |
+
readout = cls_token.unsqueeze(1).expand_as(x)
|
| 132 |
+
x = self.readout_projects[i](torch.cat((x, readout), -1))
|
| 133 |
+
else:
|
| 134 |
+
x = x[0]
|
| 135 |
+
|
| 136 |
+
x = x.permute(0, 2, 1).reshape((x.shape[0], x.shape[-1], patch_h, patch_w))
|
| 137 |
+
|
| 138 |
+
x = self.projects[i](x)
|
| 139 |
+
x = self.resize_layers[i](x)
|
| 140 |
+
|
| 141 |
+
out.append(x)
|
| 142 |
+
|
| 143 |
+
layer_1, layer_2, layer_3, layer_4 = out
|
| 144 |
+
|
| 145 |
+
layer_1_rn = self.scratch.layer1_rn(layer_1)
|
| 146 |
+
layer_2_rn = self.scratch.layer2_rn(layer_2)
|
| 147 |
+
layer_3_rn = self.scratch.layer3_rn(layer_3)
|
| 148 |
+
layer_4_rn = self.scratch.layer4_rn(layer_4)
|
| 149 |
+
|
| 150 |
+
path_4 = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:])
|
| 151 |
+
path_3 = self.scratch.refinenet3(path_4, layer_3_rn, size=layer_2_rn.shape[2:])
|
| 152 |
+
path_2 = self.scratch.refinenet2(path_3, layer_2_rn, size=layer_1_rn.shape[2:])
|
| 153 |
+
path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
|
| 154 |
+
|
| 155 |
+
out = self.scratch.output_conv1(path_1)
|
| 156 |
+
out = F.interpolate(out, (int(patch_h * 14), int(patch_w * 14)), mode="bilinear", align_corners=True)
|
| 157 |
+
out = self.scratch.output_conv2(out)
|
| 158 |
+
|
| 159 |
+
return out
|
| 160 |
+
|
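To make the tensor flow of DPTHead concrete, here is a minimal sketch with dummy DINOv2-style inputs. The feature shapes, channel widths, and the import path are assumptions for illustration, not values taken from the released configs: the head expects four token maps of shape (B, patch_h*patch_w, in_channels), each wrapped so that element [0] is the token tensor (the class token is only read when use_clstoken=True), and it returns a (B, 1, patch_h*14, patch_w*14) depth map.

import torch
from video_depth_anything.dpt import DPTHead    # assumed import path for the class above

B, patch_h, patch_w, C = 1, 37, 37, 384         # e.g. a 518x518 frame with 14x14 patches
feats = [(torch.randn(B, patch_h * patch_w, C), None) for _ in range(4)]

head = DPTHead(in_channels=C, features=128, out_channels=[96, 192, 384, 768])
depth = head(feats, patch_h, patch_w)
print(depth.shape)                              # torch.Size([1, 1, 518, 518])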
code_depth/video_depth_anything/dpt_temporal.py
ADDED
|
@@ -0,0 +1,114 @@
| 1 |
+
# Copyright (2025) Bytedance Ltd. and/or its affiliates
|
| 2 |
+
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
import torch
|
| 15 |
+
import torch.nn.functional as F
|
| 16 |
+
import torch.nn as nn
|
| 17 |
+
from .dpt import DPTHead
|
| 18 |
+
from .motion_module.motion_module import TemporalModule
|
| 19 |
+
from easydict import EasyDict
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class DPTHeadTemporal(DPTHead):
|
| 23 |
+
def __init__(self,
|
| 24 |
+
in_channels,
|
| 25 |
+
features=256,
|
| 26 |
+
use_bn=False,
|
| 27 |
+
out_channels=[256, 512, 1024, 1024],
|
| 28 |
+
use_clstoken=False,
|
| 29 |
+
num_frames=32,
|
| 30 |
+
pe='ape'
|
| 31 |
+
):
|
| 32 |
+
super().__init__(in_channels, features, use_bn, out_channels, use_clstoken)
|
| 33 |
+
|
| 34 |
+
assert num_frames > 0
|
| 35 |
+
motion_module_kwargs = EasyDict(num_attention_heads = 8,
|
| 36 |
+
num_transformer_block = 1,
|
| 37 |
+
num_attention_blocks = 2,
|
| 38 |
+
temporal_max_len = num_frames,
|
| 39 |
+
zero_initialize = True,
|
| 40 |
+
pos_embedding_type = pe)
|
| 41 |
+
|
| 42 |
+
self.motion_modules = nn.ModuleList([
|
| 43 |
+
TemporalModule(in_channels=out_channels[2],
|
| 44 |
+
**motion_module_kwargs),
|
| 45 |
+
TemporalModule(in_channels=out_channels[3],
|
| 46 |
+
**motion_module_kwargs),
|
| 47 |
+
TemporalModule(in_channels=features,
|
| 48 |
+
**motion_module_kwargs),
|
| 49 |
+
TemporalModule(in_channels=features,
|
| 50 |
+
**motion_module_kwargs)
|
| 51 |
+
])
|
| 52 |
+
|
| 53 |
+
def forward(self, out_features, patch_h, patch_w, frame_length, micro_batch_size=4):
|
| 54 |
+
out = []
|
| 55 |
+
for i, x in enumerate(out_features):
|
| 56 |
+
if self.use_clstoken:
|
| 57 |
+
x, cls_token = x[0], x[1]
|
| 58 |
+
readout = cls_token.unsqueeze(1).expand_as(x)
|
| 59 |
+
x = self.readout_projects[i](torch.cat((x, readout), -1))
|
| 60 |
+
else:
|
| 61 |
+
x = x[0]
|
| 62 |
+
|
| 63 |
+
x = x.permute(0, 2, 1).reshape((x.shape[0], x.shape[-1], patch_h, patch_w)).contiguous()
|
| 64 |
+
|
| 65 |
+
B, T = x.shape[0] // frame_length, frame_length
|
| 66 |
+
x = self.projects[i](x)
|
| 67 |
+
x = self.resize_layers[i](x)
|
| 68 |
+
|
| 69 |
+
out.append(x)
|
| 70 |
+
|
| 71 |
+
layer_1, layer_2, layer_3, layer_4 = out
|
| 72 |
+
|
| 73 |
+
B, T = layer_1.shape[0] // frame_length, frame_length
|
| 74 |
+
|
| 75 |
+
layer_3 = self.motion_modules[0](layer_3.unflatten(0, (B, T)).permute(0, 2, 1, 3, 4), None, None).permute(0, 2, 1, 3, 4).flatten(0, 1)
|
| 76 |
+
layer_4 = self.motion_modules[1](layer_4.unflatten(0, (B, T)).permute(0, 2, 1, 3, 4), None, None).permute(0, 2, 1, 3, 4).flatten(0, 1)
|
| 77 |
+
|
| 78 |
+
layer_1_rn = self.scratch.layer1_rn(layer_1)
|
| 79 |
+
layer_2_rn = self.scratch.layer2_rn(layer_2)
|
| 80 |
+
layer_3_rn = self.scratch.layer3_rn(layer_3)
|
| 81 |
+
layer_4_rn = self.scratch.layer4_rn(layer_4)
|
| 82 |
+
|
| 83 |
+
path_4 = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:])
|
| 84 |
+
path_4 = self.motion_modules[2](path_4.unflatten(0, (B, T)).permute(0, 2, 1, 3, 4), None, None).permute(0, 2, 1, 3, 4).flatten(0, 1)
|
| 85 |
+
path_3 = self.scratch.refinenet3(path_4, layer_3_rn, size=layer_2_rn.shape[2:])
|
| 86 |
+
path_3 = self.motion_modules[3](path_3.unflatten(0, (B, T)).permute(0, 2, 1, 3, 4), None, None).permute(0, 2, 1, 3, 4).flatten(0, 1)
|
| 87 |
+
|
| 88 |
+
batch_size = layer_1_rn.shape[0]
|
| 89 |
+
if batch_size <= micro_batch_size or batch_size % micro_batch_size != 0:
|
| 90 |
+
path_2 = self.scratch.refinenet2(path_3, layer_2_rn, size=layer_1_rn.shape[2:])
|
| 91 |
+
path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
|
| 92 |
+
|
| 93 |
+
out = self.scratch.output_conv1(path_1)
|
| 94 |
+
out = F.interpolate(
|
| 95 |
+
out, (int(patch_h * 14), int(patch_w * 14)), mode="bilinear", align_corners=True
|
| 96 |
+
)
|
| 97 |
+
ori_type = out.dtype
|
| 98 |
+
with torch.autocast(device_type="cuda", enabled=False):
|
| 99 |
+
out = self.scratch.output_conv2(out.float())
|
| 100 |
+
return out.to(ori_type)
|
| 101 |
+
else:
|
| 102 |
+
ret = []
|
| 103 |
+
for i in range(0, batch_size, micro_batch_size):
|
| 104 |
+
path_2 = self.scratch.refinenet2(path_3[i:i + micro_batch_size], layer_2_rn[i:i + micro_batch_size], size=layer_1_rn[i:i + micro_batch_size].shape[2:])
|
| 105 |
+
path_1 = self.scratch.refinenet1(path_2, layer_1_rn[i:i + micro_batch_size])
|
| 106 |
+
out = self.scratch.output_conv1(path_1)
|
| 107 |
+
out = F.interpolate(
|
| 108 |
+
out, (int(patch_h * 14), int(patch_w * 14)), mode="bilinear", align_corners=True
|
| 109 |
+
)
|
| 110 |
+
ori_type = out.dtype
|
| 111 |
+
with torch.autocast(device_type="cuda", enabled=False):
|
| 112 |
+
out = self.scratch.output_conv2(out.float())
|
| 113 |
+
ret.append(out.to(ori_type))
|
| 114 |
+
return torch.cat(ret, dim=0)
|
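The temporal head mostly reuses DPTHead; the new bookkeeping is folding the frame axis into the batch axis and unfolding it again around each TemporalModule call, plus the optional micro-batch split of the last two refinement stages. A minimal sketch of that reshuffle, with arbitrary toy shapes:

import torch

B, T, C, H, W = 2, 8, 64, 24, 24
x = torch.randn(B * T, C, H, W)                        # frames folded into the batch axis
x5d = x.unflatten(0, (B, T)).permute(0, 2, 1, 3, 4)    # (B, C, T, H, W) for temporal attention
x_back = x5d.permute(0, 2, 1, 3, 4).flatten(0, 1)      # back to (B*T, C, H, W) for the 2D conv blocks
print(torch.equal(x, x_back))                          # True: the round trip is lossless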
code_depth/video_depth_anything/motion_module/attention.py
ADDED
|
@@ -0,0 +1,429 @@
| 1 |
+
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
from typing import Optional, Tuple
|
| 15 |
+
|
| 16 |
+
import torch
|
| 17 |
+
import torch.nn.functional as F
|
| 18 |
+
from torch import nn
|
| 19 |
+
|
| 20 |
+
try:
|
| 21 |
+
import xformers
|
| 22 |
+
import xformers.ops
|
| 23 |
+
|
| 24 |
+
XFORMERS_AVAILABLE = True
|
| 25 |
+
except ImportError:
|
| 26 |
+
print("xFormers not available")
|
| 27 |
+
XFORMERS_AVAILABLE = False
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class CrossAttention(nn.Module):
|
| 31 |
+
r"""
|
| 32 |
+
A cross attention layer.
|
| 33 |
+
|
| 34 |
+
Parameters:
|
| 35 |
+
query_dim (`int`): The number of channels in the query.
|
| 36 |
+
cross_attention_dim (`int`, *optional*):
|
| 37 |
+
The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`.
|
| 38 |
+
heads (`int`, *optional*, defaults to 8): The number of heads to use for multi-head attention.
|
| 39 |
+
dim_head (`int`, *optional*, defaults to 64): The number of channels in each head.
|
| 40 |
+
dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
|
| 41 |
+
bias (`bool`, *optional*, defaults to False):
|
| 42 |
+
Set to `True` for the query, key, and value linear layers to contain a bias parameter.
|
| 43 |
+
"""
|
| 44 |
+
|
| 45 |
+
def __init__(
|
| 46 |
+
self,
|
| 47 |
+
query_dim: int,
|
| 48 |
+
cross_attention_dim: Optional[int] = None,
|
| 49 |
+
heads: int = 8,
|
| 50 |
+
dim_head: int = 64,
|
| 51 |
+
dropout: float = 0.0,
|
| 52 |
+
bias=False,
|
| 53 |
+
upcast_attention: bool = False,
|
| 54 |
+
upcast_softmax: bool = False,
|
| 55 |
+
added_kv_proj_dim: Optional[int] = None,
|
| 56 |
+
norm_num_groups: Optional[int] = None,
|
| 57 |
+
):
|
| 58 |
+
super().__init__()
|
| 59 |
+
inner_dim = dim_head * heads
|
| 60 |
+
cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
|
| 61 |
+
self.upcast_attention = upcast_attention
|
| 62 |
+
self.upcast_softmax = upcast_softmax
|
| 63 |
+
self.upcast_efficient_attention = False
|
| 64 |
+
|
| 65 |
+
self.scale = dim_head**-0.5
|
| 66 |
+
|
| 67 |
+
self.heads = heads
|
| 68 |
+
# for slice_size > 0 the attention score computation
|
| 69 |
+
# is split across the batch axis to save memory
|
| 70 |
+
# You can set slice_size with `set_attention_slice`
|
| 71 |
+
self.sliceable_head_dim = heads
|
| 72 |
+
self._slice_size = None
|
| 73 |
+
self._use_memory_efficient_attention_xformers = False
|
| 74 |
+
self.added_kv_proj_dim = added_kv_proj_dim
|
| 75 |
+
|
| 76 |
+
if norm_num_groups is not None:
|
| 77 |
+
self.group_norm = nn.GroupNorm(num_channels=inner_dim, num_groups=norm_num_groups, eps=1e-5, affine=True)
|
| 78 |
+
else:
|
| 79 |
+
self.group_norm = None
|
| 80 |
+
|
| 81 |
+
self.to_q = nn.Linear(query_dim, inner_dim, bias=bias)
|
| 82 |
+
self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
|
| 83 |
+
self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
|
| 84 |
+
|
| 85 |
+
if self.added_kv_proj_dim is not None:
|
| 86 |
+
self.add_k_proj = nn.Linear(added_kv_proj_dim, cross_attention_dim)
|
| 87 |
+
self.add_v_proj = nn.Linear(added_kv_proj_dim, cross_attention_dim)
|
| 88 |
+
|
| 89 |
+
self.to_out = nn.ModuleList([])
|
| 90 |
+
self.to_out.append(nn.Linear(inner_dim, query_dim))
|
| 91 |
+
self.to_out.append(nn.Dropout(dropout))
|
| 92 |
+
|
| 93 |
+
def reshape_heads_to_batch_dim(self, tensor):
|
| 94 |
+
batch_size, seq_len, dim = tensor.shape
|
| 95 |
+
head_size = self.heads
|
| 96 |
+
tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size).contiguous()
|
| 97 |
+
tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size * head_size, seq_len, dim // head_size).contiguous()
|
| 98 |
+
return tensor
|
| 99 |
+
|
| 100 |
+
def reshape_heads_to_4d(self, tensor):
|
| 101 |
+
batch_size, seq_len, dim = tensor.shape
|
| 102 |
+
head_size = self.heads
|
| 103 |
+
tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size).contiguous()
|
| 104 |
+
return tensor
|
| 105 |
+
|
| 106 |
+
def reshape_batch_dim_to_heads(self, tensor):
|
| 107 |
+
batch_size, seq_len, dim = tensor.shape
|
| 108 |
+
head_size = self.heads
|
| 109 |
+
tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim).contiguous()
|
| 110 |
+
tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size).contiguous()
|
| 111 |
+
return tensor
|
| 112 |
+
|
| 113 |
+
def reshape_4d_to_heads(self, tensor):
|
| 114 |
+
batch_size, seq_len, head_size, dim = tensor.shape
|
| 115 |
+
head_size = self.heads
|
| 116 |
+
tensor = tensor.reshape(batch_size, seq_len, dim * head_size).contiguous()
|
| 117 |
+
return tensor
|
| 118 |
+
|
| 119 |
+
def set_attention_slice(self, slice_size):
|
| 120 |
+
if slice_size is not None and slice_size > self.sliceable_head_dim:
|
| 121 |
+
raise ValueError(f"slice_size {slice_size} has to be smaller or equal to {self.sliceable_head_dim}.")
|
| 122 |
+
|
| 123 |
+
self._slice_size = slice_size
|
| 124 |
+
|
| 125 |
+
def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None):
|
| 126 |
+
batch_size, sequence_length, _ = hidden_states.shape
|
| 127 |
+
|
| 128 |
+
encoder_hidden_states = encoder_hidden_states
|
| 129 |
+
|
| 130 |
+
if self.group_norm is not None:
|
| 131 |
+
hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
|
| 132 |
+
|
| 133 |
+
query = self.to_q(hidden_states)
|
| 134 |
+
dim = query.shape[-1]
|
| 135 |
+
query = self.reshape_heads_to_batch_dim(query)
|
| 136 |
+
|
| 137 |
+
if self.added_kv_proj_dim is not None:
|
| 138 |
+
key = self.to_k(hidden_states)
|
| 139 |
+
value = self.to_v(hidden_states)
|
| 140 |
+
encoder_hidden_states_key_proj = self.add_k_proj(encoder_hidden_states)
|
| 141 |
+
encoder_hidden_states_value_proj = self.add_v_proj(encoder_hidden_states)
|
| 142 |
+
|
| 143 |
+
key = self.reshape_heads_to_batch_dim(key)
|
| 144 |
+
value = self.reshape_heads_to_batch_dim(value)
|
| 145 |
+
encoder_hidden_states_key_proj = self.reshape_heads_to_batch_dim(encoder_hidden_states_key_proj)
|
| 146 |
+
encoder_hidden_states_value_proj = self.reshape_heads_to_batch_dim(encoder_hidden_states_value_proj)
|
| 147 |
+
|
| 148 |
+
key = torch.concat([encoder_hidden_states_key_proj, key], dim=1)
|
| 149 |
+
value = torch.concat([encoder_hidden_states_value_proj, value], dim=1)
|
| 150 |
+
else:
|
| 151 |
+
encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
|
| 152 |
+
key = self.to_k(encoder_hidden_states)
|
| 153 |
+
value = self.to_v(encoder_hidden_states)
|
| 154 |
+
|
| 155 |
+
key = self.reshape_heads_to_batch_dim(key)
|
| 156 |
+
value = self.reshape_heads_to_batch_dim(value)
|
| 157 |
+
|
| 158 |
+
if attention_mask is not None:
|
| 159 |
+
if attention_mask.shape[-1] != query.shape[1]:
|
| 160 |
+
target_length = query.shape[1]
|
| 161 |
+
attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
|
| 162 |
+
attention_mask = attention_mask.repeat_interleave(self.heads, dim=0)
|
| 163 |
+
|
| 164 |
+
# attention, what we cannot get enough of
|
| 165 |
+
if XFORMERS_AVAILABLE and self._use_memory_efficient_attention_xformers:
|
| 166 |
+
hidden_states = self._memory_efficient_attention_xformers(query, key, value, attention_mask)
|
| 167 |
+
# Some versions of xformers return output in fp32, cast it back to the dtype of the input
|
| 168 |
+
hidden_states = hidden_states.to(query.dtype)
|
| 169 |
+
else:
|
| 170 |
+
if self._slice_size is None or query.shape[0] // self._slice_size == 1:
|
| 171 |
+
hidden_states = self._attention(query, key, value, attention_mask)
|
| 172 |
+
else:
|
| 173 |
+
hidden_states = self._sliced_attention(query, key, value, sequence_length, dim, attention_mask)
|
| 174 |
+
|
| 175 |
+
# linear proj
|
| 176 |
+
hidden_states = self.to_out[0](hidden_states)
|
| 177 |
+
|
| 178 |
+
# dropout
|
| 179 |
+
hidden_states = self.to_out[1](hidden_states)
|
| 180 |
+
return hidden_states
|
| 181 |
+
|
| 182 |
+
def _attention(self, query, key, value, attention_mask=None):
|
| 183 |
+
if self.upcast_attention:
|
| 184 |
+
query = query.float()
|
| 185 |
+
key = key.float()
|
| 186 |
+
|
| 187 |
+
attention_scores = torch.baddbmm(
|
| 188 |
+
torch.empty(query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device),
|
| 189 |
+
query,
|
| 190 |
+
key.transpose(-1, -2),
|
| 191 |
+
beta=0,
|
| 192 |
+
alpha=self.scale,
|
| 193 |
+
)
|
| 194 |
+
|
| 195 |
+
if attention_mask is not None:
|
| 196 |
+
attention_scores = attention_scores + attention_mask
|
| 197 |
+
|
| 198 |
+
if self.upcast_softmax:
|
| 199 |
+
attention_scores = attention_scores.float()
|
| 200 |
+
|
| 201 |
+
attention_probs = attention_scores.softmax(dim=-1)
|
| 202 |
+
|
| 203 |
+
# cast back to the original dtype
|
| 204 |
+
attention_probs = attention_probs.to(value.dtype)
|
| 205 |
+
|
| 206 |
+
# compute attention output
|
| 207 |
+
hidden_states = torch.bmm(attention_probs, value)
|
| 208 |
+
|
| 209 |
+
# reshape hidden_states
|
| 210 |
+
hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
|
| 211 |
+
return hidden_states
|
| 212 |
+
|
| 213 |
+
def _sliced_attention(self, query, key, value, sequence_length, dim, attention_mask):
|
| 214 |
+
batch_size_attention = query.shape[0]
|
| 215 |
+
hidden_states = torch.zeros(
|
| 216 |
+
(batch_size_attention, sequence_length, dim // self.heads), device=query.device, dtype=query.dtype
|
| 217 |
+
)
|
| 218 |
+
slice_size = self._slice_size if self._slice_size is not None else hidden_states.shape[0]
|
| 219 |
+
for i in range(hidden_states.shape[0] // slice_size):
|
| 220 |
+
start_idx = i * slice_size
|
| 221 |
+
end_idx = (i + 1) * slice_size
|
| 222 |
+
|
| 223 |
+
query_slice = query[start_idx:end_idx]
|
| 224 |
+
key_slice = key[start_idx:end_idx]
|
| 225 |
+
|
| 226 |
+
if self.upcast_attention:
|
| 227 |
+
query_slice = query_slice.float()
|
| 228 |
+
key_slice = key_slice.float()
|
| 229 |
+
|
| 230 |
+
attn_slice = torch.baddbmm(
|
| 231 |
+
torch.empty(slice_size, query.shape[1], key.shape[1], dtype=query_slice.dtype, device=query.device),
|
| 232 |
+
query_slice,
|
| 233 |
+
key_slice.transpose(-1, -2),
|
| 234 |
+
beta=0,
|
| 235 |
+
alpha=self.scale,
|
| 236 |
+
)
|
| 237 |
+
|
| 238 |
+
if attention_mask is not None:
|
| 239 |
+
attn_slice = attn_slice + attention_mask[start_idx:end_idx]
|
| 240 |
+
|
| 241 |
+
if self.upcast_softmax:
|
| 242 |
+
attn_slice = attn_slice.float()
|
| 243 |
+
|
| 244 |
+
attn_slice = attn_slice.softmax(dim=-1)
|
| 245 |
+
|
| 246 |
+
# cast back to the original dtype
|
| 247 |
+
attn_slice = attn_slice.to(value.dtype)
|
| 248 |
+
attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx])
|
| 249 |
+
|
| 250 |
+
hidden_states[start_idx:end_idx] = attn_slice
|
| 251 |
+
|
| 252 |
+
# reshape hidden_states
|
| 253 |
+
hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
|
| 254 |
+
return hidden_states
|
| 255 |
+
|
| 256 |
+
def _memory_efficient_attention_xformers(self, query, key, value, attention_mask):
|
| 257 |
+
if self.upcast_efficient_attention:
|
| 258 |
+
org_dtype = query.dtype
|
| 259 |
+
query = query.float()
|
| 260 |
+
key = key.float()
|
| 261 |
+
value = value.float()
|
| 262 |
+
if attention_mask is not None:
|
| 263 |
+
attention_mask = attention_mask.float()
|
| 264 |
+
hidden_states = self._memory_efficient_attention_split(query, key, value, attention_mask)
|
| 265 |
+
|
| 266 |
+
if self.upcast_efficient_attention:
|
| 267 |
+
hidden_states = hidden_states.to(org_dtype)
|
| 268 |
+
|
| 269 |
+
hidden_states = self.reshape_4d_to_heads(hidden_states)
|
| 270 |
+
return hidden_states
|
| 271 |
+
|
| 272 |
+
# print("Errror: no xformers")
|
| 273 |
+
# raise NotImplementedError
|
| 274 |
+
|
| 275 |
+
def _memory_efficient_attention_split(self, query, key, value, attention_mask):
|
| 276 |
+
batch_size = query.shape[0]
|
| 277 |
+
max_batch_size = 65535
|
| 278 |
+
num_batches = (batch_size + max_batch_size - 1) // max_batch_size
|
| 279 |
+
results = []
|
| 280 |
+
for i in range(num_batches):
|
| 281 |
+
start_idx = i * max_batch_size
|
| 282 |
+
end_idx = min((i + 1) * max_batch_size, batch_size)
|
| 283 |
+
query_batch = query[start_idx:end_idx]
|
| 284 |
+
key_batch = key[start_idx:end_idx]
|
| 285 |
+
value_batch = value[start_idx:end_idx]
|
| 286 |
+
if attention_mask is not None:
|
| 287 |
+
attention_mask_batch = attention_mask[start_idx:end_idx]
|
| 288 |
+
else:
|
| 289 |
+
attention_mask_batch = None
|
| 290 |
+
result = xformers.ops.memory_efficient_attention(query_batch, key_batch, value_batch, attn_bias=attention_mask_batch)
|
| 291 |
+
results.append(result)
|
| 292 |
+
full_result = torch.cat(results, dim=0)
|
| 293 |
+
return full_result
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
class FeedForward(nn.Module):
|
| 297 |
+
r"""
|
| 298 |
+
A feed-forward layer.
|
| 299 |
+
|
| 300 |
+
Parameters:
|
| 301 |
+
dim (`int`): The number of channels in the input.
|
| 302 |
+
dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
|
| 303 |
+
mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
|
| 304 |
+
dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
|
| 305 |
+
activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
|
| 306 |
+
"""
|
| 307 |
+
|
| 308 |
+
def __init__(
|
| 309 |
+
self,
|
| 310 |
+
dim: int,
|
| 311 |
+
dim_out: Optional[int] = None,
|
| 312 |
+
mult: int = 4,
|
| 313 |
+
dropout: float = 0.0,
|
| 314 |
+
activation_fn: str = "geglu",
|
| 315 |
+
):
|
| 316 |
+
super().__init__()
|
| 317 |
+
inner_dim = int(dim * mult)
|
| 318 |
+
dim_out = dim_out if dim_out is not None else dim
|
| 319 |
+
|
| 320 |
+
if activation_fn == "gelu":
|
| 321 |
+
act_fn = GELU(dim, inner_dim)
|
| 322 |
+
elif activation_fn == "geglu":
|
| 323 |
+
act_fn = GEGLU(dim, inner_dim)
|
| 324 |
+
elif activation_fn == "geglu-approximate":
|
| 325 |
+
act_fn = ApproximateGELU(dim, inner_dim)
|
| 326 |
+
|
| 327 |
+
self.net = nn.ModuleList([])
|
| 328 |
+
# project in
|
| 329 |
+
self.net.append(act_fn)
|
| 330 |
+
# project dropout
|
| 331 |
+
self.net.append(nn.Dropout(dropout))
|
| 332 |
+
# project out
|
| 333 |
+
self.net.append(nn.Linear(inner_dim, dim_out))
|
| 334 |
+
|
| 335 |
+
def forward(self, hidden_states):
|
| 336 |
+
for module in self.net:
|
| 337 |
+
hidden_states = module(hidden_states)
|
| 338 |
+
return hidden_states
|
| 339 |
+
|
| 340 |
+
|
| 341 |
+
class GELU(nn.Module):
|
| 342 |
+
r"""
|
| 343 |
+
GELU activation function
|
| 344 |
+
"""
|
| 345 |
+
|
| 346 |
+
def __init__(self, dim_in: int, dim_out: int):
|
| 347 |
+
super().__init__()
|
| 348 |
+
self.proj = nn.Linear(dim_in, dim_out)
|
| 349 |
+
|
| 350 |
+
def gelu(self, gate):
|
| 351 |
+
if gate.device.type != "mps":
|
| 352 |
+
return F.gelu(gate)
|
| 353 |
+
# mps: gelu is not implemented for float16
|
| 354 |
+
return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)
|
| 355 |
+
|
| 356 |
+
def forward(self, hidden_states):
|
| 357 |
+
hidden_states = self.proj(hidden_states)
|
| 358 |
+
hidden_states = self.gelu(hidden_states)
|
| 359 |
+
return hidden_states
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
# feedforward
|
| 363 |
+
class GEGLU(nn.Module):
|
| 364 |
+
r"""
|
| 365 |
+
A variant of the gated linear unit activation function from https://arxiv.org/abs/2002.05202.
|
| 366 |
+
|
| 367 |
+
Parameters:
|
| 368 |
+
dim_in (`int`): The number of channels in the input.
|
| 369 |
+
dim_out (`int`): The number of channels in the output.
|
| 370 |
+
"""
|
| 371 |
+
|
| 372 |
+
def __init__(self, dim_in: int, dim_out: int):
|
| 373 |
+
super().__init__()
|
| 374 |
+
self.proj = nn.Linear(dim_in, dim_out * 2)
|
| 375 |
+
|
| 376 |
+
def gelu(self, gate):
|
| 377 |
+
if gate.device.type != "mps":
|
| 378 |
+
return F.gelu(gate)
|
| 379 |
+
# mps: gelu is not implemented for float16
|
| 380 |
+
return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)
|
| 381 |
+
|
| 382 |
+
def forward(self, hidden_states):
|
| 383 |
+
hidden_states, gate = self.proj(hidden_states).chunk(2, dim=-1)
|
| 384 |
+
return hidden_states * self.gelu(gate)
|
| 385 |
+
|
| 386 |
+
|
| 387 |
+
class ApproximateGELU(nn.Module):
|
| 388 |
+
"""
|
| 389 |
+
The approximate form of Gaussian Error Linear Unit (GELU)
|
| 390 |
+
|
| 391 |
+
For more details, see section 2: https://arxiv.org/abs/1606.08415
|
| 392 |
+
"""
|
| 393 |
+
|
| 394 |
+
def __init__(self, dim_in: int, dim_out: int):
|
| 395 |
+
super().__init__()
|
| 396 |
+
self.proj = nn.Linear(dim_in, dim_out)
|
| 397 |
+
|
| 398 |
+
def forward(self, x):
|
| 399 |
+
x = self.proj(x)
|
| 400 |
+
return x * torch.sigmoid(1.702 * x)
|
| 401 |
+
|
| 402 |
+
|
| 403 |
+
def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
|
| 404 |
+
freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
|
| 405 |
+
t = torch.arange(end, device=freqs.device, dtype=torch.float32)
|
| 406 |
+
freqs = torch.outer(t, freqs)
|
| 407 |
+
freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64
|
| 408 |
+
return freqs_cis
|
| 409 |
+
|
| 410 |
+
|
| 411 |
+
def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
|
| 412 |
+
ndim = x.ndim
|
| 413 |
+
assert 0 <= 1 < ndim
|
| 414 |
+
assert freqs_cis.shape == (x.shape[1], x.shape[-1])
|
| 415 |
+
shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
|
| 416 |
+
return freqs_cis.view(*shape)
|
| 417 |
+
|
| 418 |
+
|
| 419 |
+
def apply_rotary_emb(
|
| 420 |
+
xq: torch.Tensor,
|
| 421 |
+
xk: torch.Tensor,
|
| 422 |
+
freqs_cis: torch.Tensor,
|
| 423 |
+
) -> Tuple[torch.Tensor, torch.Tensor]:
|
| 424 |
+
xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2).contiguous())
|
| 425 |
+
xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2).contiguous())
|
| 426 |
+
freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
|
| 427 |
+
xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(2)
|
| 428 |
+
xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(2)
|
| 429 |
+
return xq_out.type_as(xq), xk_out.type_as(xk)
|
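A standalone check of the RoPE helpers above (precompute_freqs_cis / apply_rotary_emb), with toy shapes assumed for illustration. The rotation is a pure phase: each adjacent channel pair is multiplied by a unit-magnitude complex number, so the Q/K norms are preserved while relative temporal position is encoded.

import torch
from video_depth_anything.motion_module.attention import precompute_freqs_cis, apply_rotary_emb  # assumed import path

dim, seq, B = 64, 8, 2
freqs = precompute_freqs_cis(dim, seq)          # (seq, dim // 2), complex64
q = torch.randn(B, seq, dim)
k = torch.randn(B, seq, dim)
q_rot, k_rot = apply_rotary_emb(q, k, freqs)
print(q_rot.shape)                              # torch.Size([2, 8, 64])
print(torch.allclose(q.norm(dim=-1), q_rot.norm(dim=-1), atol=1e-5))   # True: rotation preserves norms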
code_depth/video_depth_anything/motion_module/motion_module.py
ADDED
|
@@ -0,0 +1,297 @@
| 1 |
+
# This file is originally from AnimateDiff/animatediff/models/motion_module.py at main · guoyww/AnimateDiff
|
| 2 |
+
# SPDX-License-Identifier: Apache-2.0 license
|
| 3 |
+
#
|
| 4 |
+
# This file may have been modified by ByteDance Ltd. and/or its affiliates on [date of modification]
|
| 5 |
+
# Original file was released under [ Apache-2.0 license], with the full license text available at [https://github.com/guoyww/AnimateDiff?tab=Apache-2.0-1-ov-file#readme].
|
| 6 |
+
import torch
|
| 7 |
+
import torch.nn.functional as F
|
| 8 |
+
from torch import nn
|
| 9 |
+
|
| 10 |
+
from .attention import CrossAttention, FeedForward, apply_rotary_emb, precompute_freqs_cis
|
| 11 |
+
|
| 12 |
+
from einops import rearrange, repeat
|
| 13 |
+
import math
|
| 14 |
+
|
| 15 |
+
try:
|
| 16 |
+
import xformers
|
| 17 |
+
import xformers.ops
|
| 18 |
+
|
| 19 |
+
XFORMERS_AVAILABLE = True
|
| 20 |
+
except ImportError:
|
| 21 |
+
print("xFormers not available")
|
| 22 |
+
XFORMERS_AVAILABLE = False
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def zero_module(module):
|
| 26 |
+
# Zero out the parameters of a module and return it.
|
| 27 |
+
for p in module.parameters():
|
| 28 |
+
p.detach().zero_()
|
| 29 |
+
return module
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class TemporalModule(nn.Module):
|
| 33 |
+
def __init__(
|
| 34 |
+
self,
|
| 35 |
+
in_channels,
|
| 36 |
+
num_attention_heads = 8,
|
| 37 |
+
num_transformer_block = 2,
|
| 38 |
+
num_attention_blocks = 2,
|
| 39 |
+
norm_num_groups = 32,
|
| 40 |
+
temporal_max_len = 32,
|
| 41 |
+
zero_initialize = True,
|
| 42 |
+
pos_embedding_type = "ape",
|
| 43 |
+
):
|
| 44 |
+
super().__init__()
|
| 45 |
+
|
| 46 |
+
self.temporal_transformer = TemporalTransformer3DModel(
|
| 47 |
+
in_channels=in_channels,
|
| 48 |
+
num_attention_heads=num_attention_heads,
|
| 49 |
+
attention_head_dim=in_channels // num_attention_heads,
|
| 50 |
+
num_layers=num_transformer_block,
|
| 51 |
+
num_attention_blocks=num_attention_blocks,
|
| 52 |
+
norm_num_groups=norm_num_groups,
|
| 53 |
+
temporal_max_len=temporal_max_len,
|
| 54 |
+
pos_embedding_type=pos_embedding_type,
|
| 55 |
+
)
|
| 56 |
+
|
| 57 |
+
if zero_initialize:
|
| 58 |
+
self.temporal_transformer.proj_out = zero_module(self.temporal_transformer.proj_out)
|
| 59 |
+
|
| 60 |
+
def forward(self, input_tensor, encoder_hidden_states, attention_mask=None):
|
| 61 |
+
hidden_states = input_tensor
|
| 62 |
+
hidden_states = self.temporal_transformer(hidden_states, encoder_hidden_states, attention_mask)
|
| 63 |
+
|
| 64 |
+
output = hidden_states
|
| 65 |
+
return output
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
class TemporalTransformer3DModel(nn.Module):
|
| 69 |
+
def __init__(
|
| 70 |
+
self,
|
| 71 |
+
in_channels,
|
| 72 |
+
num_attention_heads,
|
| 73 |
+
attention_head_dim,
|
| 74 |
+
num_layers,
|
| 75 |
+
num_attention_blocks = 2,
|
| 76 |
+
norm_num_groups = 32,
|
| 77 |
+
temporal_max_len = 32,
|
| 78 |
+
pos_embedding_type = "ape",
|
| 79 |
+
):
|
| 80 |
+
super().__init__()
|
| 81 |
+
|
| 82 |
+
inner_dim = num_attention_heads * attention_head_dim
|
| 83 |
+
|
| 84 |
+
self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
|
| 85 |
+
self.proj_in = nn.Linear(in_channels, inner_dim)
|
| 86 |
+
|
| 87 |
+
self.transformer_blocks = nn.ModuleList(
|
| 88 |
+
[
|
| 89 |
+
TemporalTransformerBlock(
|
| 90 |
+
dim=inner_dim,
|
| 91 |
+
num_attention_heads=num_attention_heads,
|
| 92 |
+
attention_head_dim=attention_head_dim,
|
| 93 |
+
num_attention_blocks=num_attention_blocks,
|
| 94 |
+
temporal_max_len=temporal_max_len,
|
| 95 |
+
pos_embedding_type=pos_embedding_type,
|
| 96 |
+
)
|
| 97 |
+
for d in range(num_layers)
|
| 98 |
+
]
|
| 99 |
+
)
|
| 100 |
+
self.proj_out = nn.Linear(inner_dim, in_channels)
|
| 101 |
+
|
| 102 |
+
def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None):
|
| 103 |
+
assert hidden_states.dim() == 5, f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}."
|
| 104 |
+
video_length = hidden_states.shape[2]
|
| 105 |
+
hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
|
| 106 |
+
|
| 107 |
+
batch, channel, height, width = hidden_states.shape
|
| 108 |
+
residual = hidden_states
|
| 109 |
+
|
| 110 |
+
hidden_states = self.norm(hidden_states)
|
| 111 |
+
inner_dim = hidden_states.shape[1]
|
| 112 |
+
hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim).contiguous()
|
| 113 |
+
hidden_states = self.proj_in(hidden_states)
|
| 114 |
+
|
| 115 |
+
# Transformer Blocks
|
| 116 |
+
for block in self.transformer_blocks:
|
| 117 |
+
hidden_states = block(hidden_states, encoder_hidden_states=encoder_hidden_states, video_length=video_length, attention_mask=attention_mask)
|
| 118 |
+
|
| 119 |
+
# output
|
| 120 |
+
hidden_states = self.proj_out(hidden_states)
|
| 121 |
+
hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
|
| 122 |
+
|
| 123 |
+
output = hidden_states + residual
|
| 124 |
+
output = rearrange(output, "(b f) c h w -> b c f h w", f=video_length)
|
| 125 |
+
|
| 126 |
+
return output
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
class TemporalTransformerBlock(nn.Module):
|
| 130 |
+
def __init__(
|
| 131 |
+
self,
|
| 132 |
+
dim,
|
| 133 |
+
num_attention_heads,
|
| 134 |
+
attention_head_dim,
|
| 135 |
+
num_attention_blocks = 2,
|
| 136 |
+
temporal_max_len = 32,
|
| 137 |
+
pos_embedding_type = "ape",
|
| 138 |
+
):
|
| 139 |
+
super().__init__()
|
| 140 |
+
|
| 141 |
+
self.attention_blocks = nn.ModuleList(
|
| 142 |
+
[
|
| 143 |
+
TemporalAttention(
|
| 144 |
+
query_dim=dim,
|
| 145 |
+
heads=num_attention_heads,
|
| 146 |
+
dim_head=attention_head_dim,
|
| 147 |
+
temporal_max_len=temporal_max_len,
|
| 148 |
+
pos_embedding_type=pos_embedding_type,
|
| 149 |
+
)
|
| 150 |
+
for i in range(num_attention_blocks)
|
| 151 |
+
]
|
| 152 |
+
)
|
| 153 |
+
self.norms = nn.ModuleList(
|
| 154 |
+
[
|
| 155 |
+
nn.LayerNorm(dim)
|
| 156 |
+
for i in range(num_attention_blocks)
|
| 157 |
+
]
|
| 158 |
+
)
|
| 159 |
+
|
| 160 |
+
self.ff = FeedForward(dim, dropout=0.0, activation_fn="geglu")
|
| 161 |
+
self.ff_norm = nn.LayerNorm(dim)
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, video_length=None):
|
| 165 |
+
for attention_block, norm in zip(self.attention_blocks, self.norms):
|
| 166 |
+
norm_hidden_states = norm(hidden_states)
|
| 167 |
+
hidden_states = attention_block(
|
| 168 |
+
norm_hidden_states,
|
| 169 |
+
encoder_hidden_states=encoder_hidden_states,
|
| 170 |
+
video_length=video_length,
|
| 171 |
+
attention_mask=attention_mask,
|
| 172 |
+
) + hidden_states
|
| 173 |
+
|
| 174 |
+
hidden_states = self.ff(self.ff_norm(hidden_states)) + hidden_states
|
| 175 |
+
|
| 176 |
+
output = hidden_states
|
| 177 |
+
return output
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
class PositionalEncoding(nn.Module):
|
| 181 |
+
def __init__(
|
| 182 |
+
self,
|
| 183 |
+
d_model,
|
| 184 |
+
dropout = 0.,
|
| 185 |
+
max_len = 32
|
| 186 |
+
):
|
| 187 |
+
super().__init__()
|
| 188 |
+
self.dropout = nn.Dropout(p=dropout)
|
| 189 |
+
position = torch.arange(max_len).unsqueeze(1)
|
| 190 |
+
div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
|
| 191 |
+
pe = torch.zeros(1, max_len, d_model)
|
| 192 |
+
pe[0, :, 0::2] = torch.sin(position * div_term)
|
| 193 |
+
pe[0, :, 1::2] = torch.cos(position * div_term)
|
| 194 |
+
self.register_buffer('pe', pe)
|
| 195 |
+
|
| 196 |
+
def forward(self, x):
|
| 197 |
+
x = x + self.pe[:, :x.size(1)].to(x.dtype)
|
| 198 |
+
return self.dropout(x)
|
| 199 |
+
|
| 200 |
+
class TemporalAttention(CrossAttention):
|
| 201 |
+
def __init__(
|
| 202 |
+
self,
|
| 203 |
+
temporal_max_len = 32,
|
| 204 |
+
pos_embedding_type = "ape",
|
| 205 |
+
*args, **kwargs
|
| 206 |
+
):
|
| 207 |
+
super().__init__(*args, **kwargs)
|
| 208 |
+
|
| 209 |
+
self.pos_embedding_type = pos_embedding_type
|
| 210 |
+
self._use_memory_efficient_attention_xformers = True
|
| 211 |
+
|
| 212 |
+
self.pos_encoder = None
|
| 213 |
+
self.freqs_cis = None
|
| 214 |
+
if self.pos_embedding_type == "ape":
|
| 215 |
+
self.pos_encoder = PositionalEncoding(
|
| 216 |
+
kwargs["query_dim"],
|
| 217 |
+
dropout=0.,
|
| 218 |
+
max_len=temporal_max_len
|
| 219 |
+
)
|
| 220 |
+
|
| 221 |
+
elif self.pos_embedding_type == "rope":
|
| 222 |
+
self.freqs_cis = precompute_freqs_cis(
|
| 223 |
+
kwargs["query_dim"],
|
| 224 |
+
temporal_max_len
|
| 225 |
+
)
|
| 226 |
+
|
| 227 |
+
else:
|
| 228 |
+
raise NotImplementedError
|
| 229 |
+
|
| 230 |
+
def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, video_length=None):
|
| 231 |
+
d = hidden_states.shape[1]
|
| 232 |
+
hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=video_length)
|
| 233 |
+
|
| 234 |
+
if self.pos_encoder is not None:
|
| 235 |
+
hidden_states = self.pos_encoder(hidden_states)
|
| 236 |
+
|
| 237 |
+
encoder_hidden_states = repeat(encoder_hidden_states, "b n c -> (b d) n c", d=d) if encoder_hidden_states is not None else encoder_hidden_states
|
| 238 |
+
|
| 239 |
+
if self.group_norm is not None:
|
| 240 |
+
hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
|
| 241 |
+
|
| 242 |
+
query = self.to_q(hidden_states)
|
| 243 |
+
dim = query.shape[-1]
|
| 244 |
+
|
| 245 |
+
if self.added_kv_proj_dim is not None:
|
| 246 |
+
raise NotImplementedError
|
| 247 |
+
|
| 248 |
+
encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
|
| 249 |
+
key = self.to_k(encoder_hidden_states)
|
| 250 |
+
value = self.to_v(encoder_hidden_states)
|
| 251 |
+
|
| 252 |
+
if self.freqs_cis is not None:
|
| 253 |
+
seq_len = query.shape[1]
|
| 254 |
+
freqs_cis = self.freqs_cis[:seq_len].to(query.device)
|
| 255 |
+
query, key = apply_rotary_emb(query, key, freqs_cis)
|
| 256 |
+
|
| 257 |
+
if attention_mask is not None:
|
| 258 |
+
if attention_mask.shape[-1] != query.shape[1]:
|
| 259 |
+
target_length = query.shape[1]
|
| 260 |
+
attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
|
| 261 |
+
attention_mask = attention_mask.repeat_interleave(self.heads, dim=0)
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
use_memory_efficient = XFORMERS_AVAILABLE and self._use_memory_efficient_attention_xformers
|
| 265 |
+
if use_memory_efficient and (dim // self.heads) % 8 != 0:
|
| 266 |
+
# print('Warning: the dim {} cannot be divided by 8. Fall into normal attention'.format(dim // self.heads))
|
| 267 |
+
use_memory_efficient = False
|
| 268 |
+
|
| 269 |
+
# attention, what we cannot get enough of
|
| 270 |
+
if use_memory_efficient:
|
| 271 |
+
query = self.reshape_heads_to_4d(query)
|
| 272 |
+
key = self.reshape_heads_to_4d(key)
|
| 273 |
+
value = self.reshape_heads_to_4d(value)
|
| 274 |
+
|
| 275 |
+
hidden_states = self._memory_efficient_attention_xformers(query, key, value, attention_mask)
|
| 276 |
+
# Some versions of xformers return output in fp32, cast it back to the dtype of the input
|
| 277 |
+
hidden_states = hidden_states.to(query.dtype)
|
| 278 |
+
else:
|
| 279 |
+
query = self.reshape_heads_to_batch_dim(query)
|
| 280 |
+
key = self.reshape_heads_to_batch_dim(key)
|
| 281 |
+
value = self.reshape_heads_to_batch_dim(value)
|
| 282 |
+
|
| 283 |
+
if self._slice_size is None or query.shape[0] // self._slice_size == 1:
|
| 284 |
+
hidden_states = self._attention(query, key, value, attention_mask)
|
| 285 |
+
else:
|
| 286 |
+
raise NotImplementedError
|
| 287 |
+
# hidden_states = self._sliced_attention(query, key, value, sequence_length, dim, attention_mask)
|
| 288 |
+
|
| 289 |
+
# linear proj
|
| 290 |
+
hidden_states = self.to_out[0](hidden_states)
|
| 291 |
+
|
| 292 |
+
# dropout
|
| 293 |
+
hidden_states = self.to_out[1](hidden_states)
|
| 294 |
+
|
| 295 |
+
hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=d)
|
| 296 |
+
|
| 297 |
+
return hidden_states
|
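A minimal use of TemporalModule as defined above: it attends along the frame axis of a (B, C, T, H, W) feature map and returns the same shape. The sizes are toy assumptions (in_channels must be divisible by the 32 GroupNorm groups and by num_attention_heads); because proj_out is zero-initialized, the module acts as an exact identity right after construction.

import torch
from video_depth_anything.motion_module.motion_module import TemporalModule   # assumed import path

B, C, T, H, W = 1, 64, 8, 16, 16
tm = TemporalModule(in_channels=C, num_attention_heads=8, temporal_max_len=32)
feat = torch.randn(B, C, T, H, W)
out = tm(feat, None, None)          # encoder_hidden_states / attention_mask are unused here
print(out.shape)                    # torch.Size([1, 64, 8, 16, 16])
print(torch.allclose(out, feat))    # True at init: the zeroed proj_out leaves only the residual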
code_depth/video_depth_anything/util/blocks.py
ADDED
|
@@ -0,0 +1,162 @@
| 1 |
+
import torch.nn as nn
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def _make_scratch(in_shape, out_shape, groups=1, expand=False):
|
| 5 |
+
scratch = nn.Module()
|
| 6 |
+
|
| 7 |
+
out_shape1 = out_shape
|
| 8 |
+
out_shape2 = out_shape
|
| 9 |
+
out_shape3 = out_shape
|
| 10 |
+
if len(in_shape) >= 4:
|
| 11 |
+
out_shape4 = out_shape
|
| 12 |
+
|
| 13 |
+
if expand:
|
| 14 |
+
out_shape1 = out_shape
|
| 15 |
+
out_shape2 = out_shape * 2
|
| 16 |
+
out_shape3 = out_shape * 4
|
| 17 |
+
if len(in_shape) >= 4:
|
| 18 |
+
out_shape4 = out_shape * 8
|
| 19 |
+
|
| 20 |
+
scratch.layer1_rn = nn.Conv2d(
|
| 21 |
+
in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
|
| 22 |
+
)
|
| 23 |
+
scratch.layer2_rn = nn.Conv2d(
|
| 24 |
+
in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
|
| 25 |
+
)
|
| 26 |
+
scratch.layer3_rn = nn.Conv2d(
|
| 27 |
+
in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
|
| 28 |
+
)
|
| 29 |
+
if len(in_shape) >= 4:
|
| 30 |
+
scratch.layer4_rn = nn.Conv2d(
|
| 31 |
+
in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
|
| 32 |
+
)
|
| 33 |
+
|
| 34 |
+
return scratch
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class ResidualConvUnit(nn.Module):
|
| 38 |
+
"""Residual convolution module."""
|
| 39 |
+
|
| 40 |
+
def __init__(self, features, activation, bn):
|
| 41 |
+
"""Init.
|
| 42 |
+
|
| 43 |
+
Args:
|
| 44 |
+
features (int): number of features
|
| 45 |
+
"""
|
| 46 |
+
super().__init__()
|
| 47 |
+
|
| 48 |
+
self.bn = bn
|
| 49 |
+
|
| 50 |
+
self.groups = 1
|
| 51 |
+
|
| 52 |
+
self.conv1 = nn.Conv2d(
|
| 53 |
+
features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
|
| 54 |
+
)
|
| 55 |
+
|
| 56 |
+
self.conv2 = nn.Conv2d(
|
| 57 |
+
features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
|
| 58 |
+
)
|
| 59 |
+
|
| 60 |
+
if self.bn is True:
|
| 61 |
+
self.bn1 = nn.BatchNorm2d(features)
|
| 62 |
+
self.bn2 = nn.BatchNorm2d(features)
|
| 63 |
+
|
| 64 |
+
self.activation = activation
|
| 65 |
+
|
| 66 |
+
self.skip_add = nn.quantized.FloatFunctional()
|
| 67 |
+
|
| 68 |
+
def forward(self, x):
|
| 69 |
+
"""Forward pass.
|
| 70 |
+
|
| 71 |
+
Args:
|
| 72 |
+
x (tensor): input
|
| 73 |
+
|
| 74 |
+
Returns:
|
| 75 |
+
tensor: output
|
| 76 |
+
"""
|
| 77 |
+
|
| 78 |
+
out = self.activation(x)
|
| 79 |
+
out = self.conv1(out)
|
| 80 |
+
if self.bn is True:
|
| 81 |
+
out = self.bn1(out)
|
| 82 |
+
|
| 83 |
+
out = self.activation(out)
|
| 84 |
+
out = self.conv2(out)
|
| 85 |
+
if self.bn is True:
|
| 86 |
+
out = self.bn2(out)
|
| 87 |
+
|
| 88 |
+
if self.groups > 1:
|
| 89 |
+
out = self.conv_merge(out)
|
| 90 |
+
|
| 91 |
+
return self.skip_add.add(out, x)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
class FeatureFusionBlock(nn.Module):
|
| 95 |
+
"""Feature fusion block."""
|
| 96 |
+
|
| 97 |
+
def __init__(
|
| 98 |
+
self,
|
| 99 |
+
features,
|
| 100 |
+
activation,
|
| 101 |
+
deconv=False,
|
| 102 |
+
bn=False,
|
| 103 |
+
expand=False,
|
| 104 |
+
align_corners=True,
|
| 105 |
+
size=None,
|
| 106 |
+
):
|
| 107 |
+
"""Init.
|
| 108 |
+
|
| 109 |
+
Args:
|
| 110 |
+
features (int): number of features
|
| 111 |
+
"""
|
| 112 |
+
super().__init__()
|
| 113 |
+
|
| 114 |
+
self.deconv = deconv
|
| 115 |
+
self.align_corners = align_corners
|
| 116 |
+
|
| 117 |
+
self.groups = 1
|
| 118 |
+
|
| 119 |
+
self.expand = expand
|
| 120 |
+
out_features = features
|
| 121 |
+
if self.expand is True:
|
| 122 |
+
out_features = features // 2
|
| 123 |
+
|
| 124 |
+
self.out_conv = nn.Conv2d(
|
| 125 |
+
features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1
|
| 126 |
+
)
|
| 127 |
+
|
| 128 |
+
self.resConfUnit1 = ResidualConvUnit(features, activation, bn)
|
| 129 |
+
self.resConfUnit2 = ResidualConvUnit(features, activation, bn)
|
| 130 |
+
|
| 131 |
+
self.skip_add = nn.quantized.FloatFunctional()
|
| 132 |
+
|
| 133 |
+
self.size = size
|
| 134 |
+
|
| 135 |
+
def forward(self, *xs, size=None):
|
| 136 |
+
"""Forward pass.
|
| 137 |
+
|
| 138 |
+
Returns:
|
| 139 |
+
tensor: output
|
| 140 |
+
"""
|
| 141 |
+
output = xs[0]
|
| 142 |
+
|
| 143 |
+
if len(xs) == 2:
|
| 144 |
+
res = self.resConfUnit1(xs[1])
|
| 145 |
+
output = self.skip_add.add(output, res)
|
| 146 |
+
|
| 147 |
+
output = self.resConfUnit2(output)
|
| 148 |
+
|
| 149 |
+
if (size is None) and (self.size is None):
|
| 150 |
+
modifier = {"scale_factor": 2}
|
| 151 |
+
elif size is None:
|
| 152 |
+
modifier = {"size": self.size}
|
| 153 |
+
else:
|
| 154 |
+
modifier = {"size": size}
|
| 155 |
+
|
| 156 |
+
output = nn.functional.interpolate(
|
| 157 |
+
output.contiguous(), **modifier, mode="bilinear", align_corners=self.align_corners
|
| 158 |
+
)
|
| 159 |
+
|
| 160 |
+
output = self.out_conv(output)
|
| 161 |
+
|
| 162 |
+
return output
|
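How the pieces in this file compose, as a small hedged sketch (widths are made-up examples): _make_scratch projects each backbone stage to a common channel width, and FeatureFusionBlock adds a same-resolution skip branch to the incoming path, refines it, and upsamples to the requested size for the next stage.

import torch
import torch.nn as nn
from video_depth_anything.util.blocks import _make_scratch, FeatureFusionBlock   # assumed import path

scratch = _make_scratch([96, 192, 384, 768], 128)        # four projection convs, all mapping to 128 channels
proj = scratch.layer1_rn(torch.randn(1, 96, 64, 64))     # (1, 128, 64, 64)

fuse = FeatureFusionBlock(128, nn.ReLU(False))
coarse = torch.randn(1, 128, 16, 16)                     # previous fusion stage, already at the skip's resolution
skip = torch.randn(1, 128, 16, 16)                       # projected shallower feature map
out = fuse(coarse, skip, size=(32, 32))                  # fuse, refine, then upsample toward the next stage
print(proj.shape, out.shape)                             # (1, 128, 64, 64) (1, 128, 32, 32)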
code_depth/video_depth_anything/util/transform.py
ADDED
|
@@ -0,0 +1,158 @@
import numpy as np
import cv2


class Resize(object):
    """Resize sample to given size (width, height)."""

    def __init__(
        self,
        width,
        height,
        resize_target=True,
        keep_aspect_ratio=False,
        ensure_multiple_of=1,
        resize_method="lower_bound",
        image_interpolation_method=cv2.INTER_AREA,
    ):
        """Init.

        Args:
            width (int): desired output width
            height (int): desired output height
            resize_target (bool, optional):
                True: Resize the full sample (image, mask, target).
                False: Resize image only.
                Defaults to True.
            keep_aspect_ratio (bool, optional):
                True: Keep the aspect ratio of the input sample.
                Output sample might not have the given width and height, and
                resize behaviour depends on the parameter 'resize_method'.
                Defaults to False.
            ensure_multiple_of (int, optional):
                Output width and height is constrained to be a multiple of this parameter.
                Defaults to 1.
            resize_method (str, optional):
                "lower_bound": Output will be at least as large as the given size.
                "upper_bound": Output will be at most as large as the given size. (Output size might be smaller than given size.)
                "minimal": Scale as little as possible. (Output size might be smaller than given size.)
                Defaults to "lower_bound".
        """
        self.__width = width
        self.__height = height

        self.__resize_target = resize_target
        self.__keep_aspect_ratio = keep_aspect_ratio
        self.__multiple_of = ensure_multiple_of
        self.__resize_method = resize_method
        self.__image_interpolation_method = image_interpolation_method

    def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
        y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)

        if max_val is not None and y > max_val:
            y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)

        if y < min_val:
            y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)

        return y

    def get_size(self, width, height):
        # determine new height and width
        scale_height = self.__height / height
        scale_width = self.__width / width

        if self.__keep_aspect_ratio:
            if self.__resize_method == "lower_bound":
                # scale such that output size is lower bound
                if scale_width > scale_height:
                    # fit width
                    scale_height = scale_width
                else:
                    # fit height
                    scale_width = scale_height
            elif self.__resize_method == "upper_bound":
                # scale such that output size is upper bound
                if scale_width < scale_height:
                    # fit width
                    scale_height = scale_width
                else:
                    # fit height
                    scale_width = scale_height
            elif self.__resize_method == "minimal":
                # scale as little as possible
                if abs(1 - scale_width) < abs(1 - scale_height):
                    # fit width
                    scale_height = scale_width
                else:
                    # fit height
                    scale_width = scale_height
            else:
                raise ValueError(f"resize_method {self.__resize_method} not implemented")

        if self.__resize_method == "lower_bound":
            new_height = self.constrain_to_multiple_of(scale_height * height, min_val=self.__height)
            new_width = self.constrain_to_multiple_of(scale_width * width, min_val=self.__width)
        elif self.__resize_method == "upper_bound":
            new_height = self.constrain_to_multiple_of(scale_height * height, max_val=self.__height)
            new_width = self.constrain_to_multiple_of(scale_width * width, max_val=self.__width)
        elif self.__resize_method == "minimal":
            new_height = self.constrain_to_multiple_of(scale_height * height)
            new_width = self.constrain_to_multiple_of(scale_width * width)
        else:
            raise ValueError(f"resize_method {self.__resize_method} not implemented")

        return (new_width, new_height)

    def __call__(self, sample):
        width, height = self.get_size(sample["image"].shape[1], sample["image"].shape[0])

        # resize sample
        sample["image"] = cv2.resize(sample["image"], (width, height), interpolation=self.__image_interpolation_method)

        if self.__resize_target:
            if "depth" in sample:
                sample["depth"] = cv2.resize(sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST)

            if "mask" in sample:
                sample["mask"] = cv2.resize(sample["mask"].astype(np.float32), (width, height), interpolation=cv2.INTER_NEAREST)

        return sample


class NormalizeImage(object):
    """Normalize image by given mean and std."""

    def __init__(self, mean, std):
        self.__mean = mean
        self.__std = std

    def __call__(self, sample):
        sample["image"] = (sample["image"] - self.__mean) / self.__std

        return sample


class PrepareForNet(object):
    """Prepare sample for usage as network input."""

    def __init__(self):
        pass

    def __call__(self, sample):
        image = np.transpose(sample["image"], (2, 0, 1))
        sample["image"] = np.ascontiguousarray(image).astype(np.float32)

        if "depth" in sample:
            depth = sample["depth"].astype(np.float32)
            sample["depth"] = np.ascontiguousarray(depth)

        if "mask" in sample:
            sample["mask"] = sample["mask"].astype(np.float32)
            sample["mask"] = np.ascontiguousarray(sample["mask"])

        return sample
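Taken together, these three transforms implement the inference-time preprocessing: `Resize` snaps the frame to the nearest valid ViT input size, `NormalizeImage` applies ImageNet statistics, and `PrepareForNet` converts HWC to CHW float32. A minimal usage sketch (assuming a random 480x640 float image in [0, 1], mirroring how `video_depth.py` composes them):

```python
import numpy as np
import cv2
from torchvision.transforms import Compose
# Resize, NormalizeImage, PrepareForNet are the classes defined above
# (video_depth_anything/util/transform.py).

transform = Compose([
    Resize(
        width=518,
        height=518,
        resize_target=False,
        keep_aspect_ratio=True,
        ensure_multiple_of=14,        # ViT-14 patch size
        resize_method="lower_bound",  # short side is scaled to at least 518
        image_interpolation_method=cv2.INTER_CUBIC,
    ),
    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    PrepareForNet(),
])

sample = {"image": np.random.rand(480, 640, 3).astype(np.float32)}
out = transform(sample)
print(out["image"].shape)  # (3, 518, 686): CHW, both sides multiples of 14
```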
code_depth/video_depth_anything/video_depth.py
ADDED
@@ -0,0 +1,156 @@
# Copyright (2025) Bytedance Ltd. and/or its affiliates

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn.functional as F
import torch.nn as nn
from torchvision.transforms import Compose
import cv2
from tqdm import tqdm
import numpy as np
import gc

from .dinov2 import DINOv2
from .dpt_temporal import DPTHeadTemporal
from .util.transform import Resize, NormalizeImage, PrepareForNet

from utils.util import compute_scale_and_shift, get_interpolate_frames

# infer settings, do not change
INFER_LEN = 32
OVERLAP = 10
KEYFRAMES = [0, 12, 24, 25, 26, 27, 28, 29, 30, 31]
INTERP_LEN = 8


class VideoDepthAnything(nn.Module):
    def __init__(
        self,
        encoder='vitl',
        features=256,
        out_channels=[256, 512, 1024, 1024],
        use_bn=False,
        use_clstoken=False,
        num_frames=32,
        pe='ape'
    ):
        super(VideoDepthAnything, self).__init__()

        self.intermediate_layer_idx = {
            'vits': [2, 5, 8, 11],
            'vitl': [4, 11, 17, 23]
        }

        self.encoder = encoder
        self.pretrained = DINOv2(model_name=encoder)

        self.head = DPTHeadTemporal(self.pretrained.embed_dim, features, use_bn, out_channels=out_channels, use_clstoken=use_clstoken, num_frames=num_frames, pe=pe)

    def forward(self, x):
        B, T, C, H, W = x.shape
        patch_h, patch_w = H // 14, W // 14
        features = self.pretrained.get_intermediate_layers(x.flatten(0, 1), self.intermediate_layer_idx[self.encoder], return_class_token=True)
        depth = self.head(features, patch_h, patch_w, T)
        depth = F.interpolate(depth, size=(H, W), mode="bilinear", align_corners=True)
        depth = F.relu(depth)
        return depth.squeeze(1).unflatten(0, (B, T))  # return shape [B, T, H, W]

    def infer_video_depth(self, frames, target_fps, input_size=518, device='cuda', fp32=False):
        frame_height, frame_width = frames[0].shape[:2]
        ratio = max(frame_height, frame_width) / min(frame_height, frame_width)
        if ratio > 1.78:  # we recommend processing videos with an aspect ratio below 16:9 due to memory limitations
            input_size = int(input_size * 1.777 / ratio)
            input_size = round(input_size / 14) * 14

        transform = Compose([
            Resize(
                width=input_size,
                height=input_size,
                resize_target=False,
                keep_aspect_ratio=True,
                ensure_multiple_of=14,
                resize_method='lower_bound',
                image_interpolation_method=cv2.INTER_CUBIC,
            ),
            NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            PrepareForNet(),
        ])

        frame_list = [frames[i] for i in range(frames.shape[0])]
        frame_step = INFER_LEN - OVERLAP
        org_video_len = len(frame_list)
        append_frame_len = (frame_step - (org_video_len % frame_step)) % frame_step + (INFER_LEN - frame_step)
        frame_list = frame_list + [frame_list[-1].copy()] * append_frame_len

        depth_list = []
        pre_input = None
        for frame_id in tqdm(range(0, org_video_len, frame_step)):
            cur_list = []
            for i in range(INFER_LEN):
                cur_list.append(torch.from_numpy(transform({'image': frame_list[frame_id + i].astype(np.float32) / 255.0})['image']).unsqueeze(0).unsqueeze(0))
            cur_input = torch.cat(cur_list, dim=1).to(device)
            if pre_input is not None:
                cur_input[:, :OVERLAP, ...] = pre_input[:, KEYFRAMES, ...]

            with torch.no_grad():
                with torch.autocast(device_type=device, enabled=(not fp32)):
                    depth = self.forward(cur_input)  # depth shape: [1, T, H, W]

            depth = depth.to(cur_input.dtype)
            depth = F.interpolate(depth.flatten(0, 1).unsqueeze(1), size=(frame_height, frame_width), mode='bilinear', align_corners=True)
            depth_list += [depth[i][0].cpu().numpy() for i in range(depth.shape[0])]

            pre_input = cur_input

        del frame_list
        gc.collect()

        depth_list_aligned = []
        ref_align = []
        align_len = OVERLAP - INTERP_LEN
        kf_align_list = KEYFRAMES[:align_len]

        for frame_id in range(0, len(depth_list), INFER_LEN):
            if len(depth_list_aligned) == 0:
                depth_list_aligned += depth_list[:INFER_LEN]
                for kf_id in kf_align_list:
                    ref_align.append(depth_list[frame_id + kf_id])
            else:
                curr_align = []
                for i in range(len(kf_align_list)):
                    curr_align.append(depth_list[frame_id + i])
                scale, shift = compute_scale_and_shift(np.concatenate(curr_align),
                                                       np.concatenate(ref_align),
                                                       np.concatenate(np.ones_like(ref_align) == 1))

                pre_depth_list = depth_list_aligned[-INTERP_LEN:]
                post_depth_list = depth_list[frame_id + align_len:frame_id + OVERLAP]
                for i in range(len(post_depth_list)):
                    post_depth_list[i] = post_depth_list[i] * scale + shift
                    post_depth_list[i][post_depth_list[i] < 0] = 0
                depth_list_aligned[-INTERP_LEN:] = get_interpolate_frames(pre_depth_list, post_depth_list)

                for i in range(OVERLAP, INFER_LEN):
                    new_depth = depth_list[frame_id + i] * scale + shift
                    new_depth[new_depth < 0] = 0
                    depth_list_aligned.append(new_depth)

                ref_align = ref_align[:1]
                for kf_id in kf_align_list[1:]:
                    new_depth = depth_list[frame_id + kf_id] * scale + shift
                    new_depth[new_depth < 0] = 0
                    ref_align.append(new_depth)

        depth_list = depth_list_aligned

        return np.stack(depth_list[:org_video_len], axis=0), target_fps
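`compute_scale_and_shift` and `get_interpolate_frames` come from `utils/util.py`, which is not part of this excerpt. For reference, a minimal closed-form least-squares version consistent with how it is called above (NumPy arrays for prediction, target, and a mask) might look like the sketch below; the name `compute_scale_and_shift_ls` is a placeholder, not the repository's function.

```python
import numpy as np

def compute_scale_and_shift_ls(prediction, target, mask):
    """Solve min over (s, t) of sum(mask * (s * prediction + t - target)^2) in closed form."""
    mask = mask.astype(np.float64)
    a_00 = np.sum(mask * prediction * prediction)
    a_01 = np.sum(mask * prediction)
    a_11 = np.sum(mask)
    b_0 = np.sum(mask * prediction * target)
    b_1 = np.sum(mask * target)
    det = a_00 * a_11 - a_01 * a_01
    if det == 0:
        return 1.0, 0.0  # degenerate window: leave depths unscaled
    scale = (a_11 * b_0 - a_01 * b_1) / det
    shift = (-a_01 * b_0 + a_00 * b_1) / det
    return scale, shift
```

In `infer_video_depth`, the overlapping keyframe depths of the current window act as the prediction and the already-aligned reference depths as the target, so each new window is mapped into the scale of the previous one before the interpolated blend over the last `INTERP_LEN` frames.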
code_edit/.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
-----BEGIN CERTIFICATE-----
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
-----END CERTIFICATE-----
code_edit/Flux_fill_d2i.py
ADDED
@@ -0,0 +1,53 @@
import torch
from diffusers.pipelines.flux.pipeline_flux_fill_unmasked_image_condition_version import FluxFillPipeline_token12_depth as FluxFillPipeline
from diffusers.utils import load_image
import os, glob
import numpy as np
import cv2
from PIL import Image

image_path = ["example_data/I-210618_I01001_W01_I-210618_I01001_W01_F0153_img.jpg"]

pipe = FluxFillPipeline.from_pretrained("black-forest-labs/FLUX.1-Fill-dev", torch_dtype=torch.bfloat16).to("cuda")
pipe.load_lora_weights("stage2/checkpoint-20000")
for image_ep in image_path:
    image = Image.open(image_ep)
    mask = Image.new("L", image.size, 0)  # placeholder: all-zero mask
    depth_path = image_ep.replace("_img.jpg", "_depth_img.png")
    depth_image = Image.open(depth_path)
    depth = Image.open(depth_path.replace("_img", "_img_fill_in"))
    image_name = os.path.basename(image_ep)

    orig_w, orig_h = image.size
    w, h = image.size
    MAX_SIZE = 1024
    if max(w, h) > MAX_SIZE:
        factor = MAX_SIZE / max(w, h)
        w = int(factor * w)
        h = int(factor * h)
    width, height = map(lambda x: x - x % 64, (w, h))
    # # Resize to 1024 × 1024
    target_size = (width, height)
    # target_size = (1024, 1024)
    # image_resized = image.resize(target_size, Image.BICUBIC)
    # mask_resized = mask.resize(target_size, Image.NEAREST)
    # depth_resized = depth.resize(target_size, Image.BICUBIC)
    # depth_image_resized = depth_image.resize(target_size, Image.BICUBIC)

    image = pipe(
        prompt="A beautiful scene",
        image=image,
        mask_image=mask,
        width=target_size[0],
        height=target_size[1],
        guidance_scale=30,
        num_inference_steps=50,
        max_sequence_length=512,
        generator=torch.Generator("cpu").manual_seed(0),
        depth=depth,
        depth_image=depth_image,
    ).images[0]
    image_final = image.resize((orig_w * 3, orig_h), Image.BICUBIC)
    output_dir = "./test_images/"
    os.makedirs(output_dir, exist_ok=True)
    image_final.save(os.path.join(output_dir, image_name))
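Both FLUX-Fill scripts cap the long side of the input at 1024 pixels and then round both sides down to multiples of 64 before calling the pipeline. A quick worked example of that arithmetic (the numbers are illustrative, not taken from the repository's data):

```python
# 1920x1080 input: cap the long side at 1024, then floor both sides to multiples of 64.
w, h = 1920, 1080
MAX_SIZE = 1024
if max(w, h) > MAX_SIZE:
    factor = MAX_SIZE / max(w, h)            # 1024 / 1920 ≈ 0.533
    w, h = int(factor * w), int(factor * h)  # (1024, 576)
width, height = map(lambda x: x - x % 64, (w, h))
print(width, height)  # 1024 576
```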
code_edit/Flux_fill_infer_depth.py
ADDED
@@ -0,0 +1,64 @@
import torch
from diffusers.pipelines.flux.pipeline_flux_fill_unmasked_image_condition_version import FluxFillPipeline_token12_depth_only as FluxFillPipeline
from diffusers.utils import load_image
import os, glob
import numpy as np
import cv2
from PIL import Image, ImageOps

image_path = ["example_data/I-210618_I01001_W01_I-210618_I01001_W01_F0153_img.jpg"]
pipe = FluxFillPipeline.from_pretrained("black-forest-labs/FLUX.1-Fill-dev", torch_dtype=torch.bfloat16).to("cuda")
pipe.load_lora_weights("stage1/checkpoint-4800")
for image_ep in image_path:
    mask_path = image_ep.replace("_img.jpg", "_mask.png")
    image = Image.open(image_ep)  # placeholder
    depth = Image.open(image_ep.replace("_img.jpg", "_depth_img.png"))
    image_name = os.path.basename(image_ep)
    mask = Image.open(mask_path).convert("L")
    mask = ImageOps.invert(mask)  # invert the RORD mask

    # mask_np = np.array(mask)

    # # mask dilation
    # dilation_px = 32
    # kernel = np.ones((3, 3), np.uint8)
    # iterations = dilation_px // 2
    # dilated_mask = cv2.dilate(mask_np, kernel, iterations=iterations)
    # mask = Image.fromarray(dilated_mask)

    orig_w, orig_h = image.size

    # Resize to 1024 × 1024
    # target_size = (1024, 1024)
    # image_resized = image.resize(target_size, Image.BICUBIC)
    # mask_resized = mask.resize(target_size, Image.NEAREST)
    # depth_resized = depth.resize(target_size, Image.BICUBIC)

    w, h = image.size
    MAX_SIZE = 1024
    if max(w, h) > MAX_SIZE:
        factor = MAX_SIZE / max(w, h)
        w = int(factor * w)
        h = int(factor * h)
    width, height = map(lambda x: x - x % 64, (w, h))
    image_out = pipe(
        prompt="A beautiful scene",
        image=image,
        mask_image=mask,
        width=width,
        height=height,
        guidance_scale=30,
        num_inference_steps=50,
        max_sequence_length=512,
        generator=torch.Generator("cpu").manual_seed(0),
        depth=depth
    ).images[0]

    image_final = image_out.resize((orig_w, orig_h), Image.BICUBIC)

    output_dir = "./depth_fillin_results"
    os.makedirs(output_dir, exist_ok=True)
    image_final.save(os.path.join(output_dir, image_name))
code_edit/README.md
ADDED
@@ -0,0 +1,93 @@
The official implementation of the **NeurIPS 2025** paper:

<div align="center">
<h1>
<b>
GeoRemover: Removing Objects and Their Causal Visual Artifacts, NeurIPS, 2025 (Spotlight)
</b>
</h1>
</div>

<p align="center"><img src="docs/teaser.png" width="800"/></p>

> [**GeoRemover: Removing Objects and Their Causal Visual Artifacts**](https://arxiv.org/abs/2509.18538)
>
> Zixin Zhu, Haoxiang Li, Xuelu Feng, He Wu, Chunming Qiao, Junsong Yuan

> **Abstract:** *Towards intelligent image editing, object removal should eliminate both the target object and its causal visual artifacts, such as shadows and reflections. However, existing image appearance-based methods either follow strictly mask-aligned training and fail to remove these causal effects which are not explicitly masked, or adopt loosely mask-aligned strategies that lack controllability and may unintentionally over-erase other objects. We identify that these limitations stem from ignoring the causal relationship between an object's geometry presence and its visual effects. To address this limitation, we propose a geometry-aware two-stage framework that decouples object removal into (1) geometry removal and (2) appearance rendering. In the first stage, we remove the object directly from the geometry (e.g., depth) using strictly mask-aligned supervision, enabling structure-aware editing with strong geometric constraints. In the second stage, we render a photorealistic RGB image conditioned on the updated geometry, where causal visual effects are considered implicitly as a result of the modified 3D geometry. To guide learning in the geometry removal stage, we introduce a preference-driven objective based on positive and negative sample pairs, encouraging the model to remove objects as well as their causal visual artifacts while avoiding new structural insertions. Extensive experiments demonstrate that our method achieves state-of-the-art performance in removing both objects and their associated artifacts on two popular benchmarks.*

### Installing the dependencies

Before running the scripts, make sure to install the library's training dependencies:

**Important**

```bash
bash env.sh
```

And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:

```bash
accelerate config
```

Or, for a default Accelerate configuration without answering questions about your environment:

```bash
accelerate config default
```

### Data preparation
Download the images from [RORD](https://github.com/Forty-lock/RORD) and generate depth maps with [Video-Depth-Anything v2](https://github.com/DepthAnything/Video-Depth-Anything). (The Video-Depth-Anything v2 code lives in this repository on the `depth` branch; see the [script](https://github.com/buxiangzhiren/GeoRemover/blob/depth/run_images_rord.py).)

### Training
You should build your own *train_images_and_rord_masks.csv* first. The file in this repo is only an example; it is not the full RORD dataset.

For stage 1 (geometry removal):
```bash
bash train_stage1.sh
```
For stage 2 (appearance rendering):
```bash
bash train_stage2.sh
```
### Inference
First, use https://github.com/buxiangzhiren/GeoRemover/blob/depth/run_single_image.py to get the depth map of an image.

For stage 1 (geometry removal):
```bash
python Flux_fill_infer_depth.py
```
For stage 2 (appearance rendering):
```bash
python Flux_fill_d2i.py
```
### Checkpoints
Hugging Face:
[stage 1 (geometry removal) and stage 2 (appearance rendering)](https://huggingface.co/buxiangzhiren/GeoRemover)


Google Drive:
[stage 1 (geometry removal)](https://drive.google.com/file/d/1y6vnxqnFTiO6sxoKDBkvFbAeniHFka89/view?usp=sharing)
and [stage 2 (appearance rendering)](https://drive.google.com/file/d/1U8rp1hqOswQB-0T0fh2aDQu-o1GLfd6E/view?usp=sharing)


### Acknowledgement

This repo is based on [RORD](https://github.com/Forty-lock/RORD), [FLUX.1-Fill-dev](https://huggingface.co/black-forest-labs/FLUX.1-Fill-dev) and [Video-Depth-Anything v2](https://github.com/DepthAnything/Video-Depth-Anything). Thanks for their wonderful work.


### Citation

```
@misc{zhu2025georemoverremovingobjectscausal,
      title={GeoRemover: Removing Objects and Their Causal Visual Artifacts},
      author={Zixin Zhu and Haoxiang Li and Xuelu Feng and He Wu and Chunming Qiao and Junsong Yuan},
      year={2025},
      eprint={2509.18538},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2509.18538},
}
```