Merge branch 'main' of https://huggingface.co/spaces/tencent/Hunyuan3D-2mv
Files changed:
- gradio_app.py +74 -30
- hy3dgen/shapegen/__init__.py +1 -1
- hy3dgen/shapegen/models/__init__.py +1 -1
- hy3dgen/shapegen/models/conditioner.py +12 -104
- hy3dgen/shapegen/models/denoisers/hunyuan3ddit.py +3 -12
- hy3dgen/shapegen/pipelines.py +65 -181
- hy3dgen/shapegen/postprocessors.py +1 -4
- hy3dgen/shapegen/preprocessors.py +6 -55
gradio_app.py
CHANGED
@@ -1,10 +1,23 @@
+# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
+# except for the third-party components listed below.
+# Hunyuan 3D does not impose any additional limitations beyond what is outlined
+# in the repsective licenses of these third-party components.
+# Users must comply with all terms and conditions of original licenses of these third-party
+# components and must ensure that the usage of the third party components adheres to
+# all relevant laws and regulations.
+
+# For avoidance of doubts, Hunyuan 3D means the large language models and
+# their software and algorithms, including trained model weights, parameters (including
+# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
+# fine-tuning enabling code and other elements of the foregoing made publicly available
+# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
+
 import os
 import random
 import shutil
 import time
 from glob import glob
 from pathlib import Path
-import uuid
 
 import gradio as gr
 import torch
@@ -12,6 +25,7 @@ import trimesh
 import uvicorn
 from fastapi import FastAPI
 from fastapi.staticfiles import StaticFiles
+import uuid
 
 from hy3dgen.shapegen.utils import logger
 
@@ -28,6 +42,7 @@ if True:
     print('install custom')
     subprocess.run(shlex.split("pip install custom_rasterizer-0.1-cp310-cp310-linux_x86_64.whl"), check=True)
 
+
 def get_example_img_list():
     print('Loading example img list ...')
     return sorted(glob('./assets/example_images/**/*.png', recursive=True))
@@ -47,7 +62,7 @@ def get_example_mv_list():
     root = './assets/example_mv_images'
     for mv_dir in os.listdir(root):
         view_list = []
-        for view in ['
+        for view in ['front', 'back', 'left', 'right']:
             path = os.path.join(root, mv_dir, f'{view}.png')
             if os.path.exists(path):
                 view_list.append(path)
@@ -57,18 +72,6 @@ def get_example_mv_list():
     return mv_list
 
 
-# def gen_save_folder(max_size=60):
-#     os.makedirs(SAVE_DIR, exist_ok=True)
-#     exists = set(int(_) for _ in os.listdir(SAVE_DIR) if _.isdigit())
-#     cur_id = min(set(range(max_size)) - exists) if len(exists) < max_size else -1
-#     if os.path.exists(f"{SAVE_DIR}/{(cur_id + 1) % max_size}"):
-#         shutil.rmtree(f"{SAVE_DIR}/{(cur_id + 1) % max_size}")
-#         print(f"remove {SAVE_DIR}/{(cur_id + 1) % max_size} success !!!")
-#     save_folder = f"{SAVE_DIR}/{max(0, cur_id)}"
-#     os.makedirs(save_folder, exist_ok=True)
-#     print(f"mkdir {save_folder} suceess !!!")
-#     return save_folder
-
 def gen_save_folder(max_size=200):
     os.makedirs(SAVE_DIR, exist_ok=True)
 
@@ -139,7 +142,7 @@ def build_model_viewer_html(save_folder, height=660, width=790, textured=False):
     </div>
     """
 
-
+
 def _gen_shape(
     caption=None,
     image=None,
@@ -246,7 +249,7 @@ def _gen_shape(
     main_image = image if not MV_MODE else image['front']
     return mesh, main_image, save_folder, stats, seed
 
-
+
 def generation_all(
     caption=None,
     image=None,
@@ -301,7 +304,8 @@ def generation_all(
     path_textured = export_mesh(textured_mesh, save_folder, textured=True)
     model_viewer_html_textured = build_model_viewer_html(save_folder, height=HTML_HEIGHT, width=HTML_WIDTH,
                                                          textured=True)
-
+    if args.low_vram_mode:
+        torch.cuda.empty_cache()
     return (
         gr.update(value=path),
        gr.update(value=path_textured),
@@ -310,7 +314,7 @@
         seed,
     )
 
-
+
 def shape_generation(
     caption=None,
     image=None,
@@ -347,7 +351,8 @@ def shape_generation(
 
     path = export_mesh(mesh, save_folder, textured=False)
     model_viewer_html = build_model_viewer_html(save_folder, height=HTML_HEIGHT, width=HTML_WIDTH)
-
+    if args.low_vram_mode:
+        torch.cuda.empty_cache()
     return (
         gr.update(value=path),
         model_viewer_html,
@@ -362,6 +367,8 @@ def build_app():
     title = 'Hunyuan3D-2mv: Image to 3D Generation with 1-4 Views'
     if 'mini' in args.subfolder:
         title = 'Hunyuan3D-2mini: Strong 0.6B Image to Shape Generator'
+    if TURBO_MODE:
+        title = title.replace(':', '-Turbo: Fast ')
 
     title_html = f"""
    <div style="font-size: 2em; font-weight: bold; text-align: center; margin-bottom: 5px">
@@ -386,11 +393,11 @@ def build_app():
     .mv-image button .wrap {
         font-size: 10px;
     }
-
+
     .mv-image .icon-wrap {
         width: 20px;
     }
-
+
     """
 
     with gr.Blocks(theme=gr.themes.Base(), title='Hunyuan-3D-2.0', analytics_enabled=False, css=custom_css) as demo:
@@ -430,7 +437,15 @@ def build_app():
                 file_out = gr.File(label="File", visible=False)
                 file_out2 = gr.File(label="File", visible=False)
 
-                with gr.Tabs(selected='tab_export'):
+                with gr.Tabs(selected='tab_options' if TURBO_MODE else 'tab_export'):
+                    with gr.Tab("Options", id='tab_options', visible=TURBO_MODE):
+                        gen_mode = gr.Radio(label='Generation Mode',
+                                            info='Recommendation: Turbo for most cases, Fast for very complex cases, Standard seldom use.',
+                                            choices=['Turbo', 'Fast', 'Standard'], value='Turbo')
+                        decode_mode = gr.Radio(label='Decoding Mode',
+                                               info='The resolution for exporting mesh from generated vectset',
+                                               choices=['Low', 'Standard', 'High'],
+                                               value='Standard')
                     with gr.Tab('Advanced Options', id='tab_advanced_options'):
                         with gr.Row():
                             check_box_rembg = gr.Checkbox(value=True, label='Remove Background', min_width=100)
@@ -446,14 +461,13 @@ def build_app():
                        with gr.Row():
                            num_steps = gr.Slider(maximum=100,
                                                  minimum=1,
-                                                  value=30,
+                                                  value=5 if 'turbo' in args.subfolder else 30,
                                                  step=1, label='Inference Steps')
                            octree_resolution = gr.Slider(maximum=512, minimum=16, value=256, label='Octree Resolution')
                        with gr.Row():
                            cfg_scale = gr.Number(value=5.0, label='Guidance Scale', min_width=100)
-                            num_chunks = gr.Slider(maximum=5000000, minimum=1000, value=
+                            num_chunks = gr.Slider(maximum=5000000, minimum=1000, value=8000,
                                                   label='Number of Chunks', min_width=100)
-
                    with gr.Tab("Export", id='tab_export'):
                        with gr.Row():
                            file_type = gr.Dropdown(label='File Type', choices=SUPPORTED_FORMATS,
@@ -573,6 +587,26 @@ def build_app():
            outputs=[tabs_output],
        )
 
+        def on_gen_mode_change(value):
+            if value == 'Turbo':
+                return gr.update(value=5)
+            elif value == 'Fast':
+                return gr.update(value=10)
+            else:
+                return gr.update(value=30)
+
+        gen_mode.change(on_gen_mode_change, inputs=[gen_mode], outputs=[num_steps])
+
+        def on_decode_mode_change(value):
+            if value == 'Low':
+                return gr.update(value=196)
+            elif value == 'Standard':
+                return gr.update(value=256)
+            else:
+                return gr.update(value=384)
+
+        decode_mode.change(on_decode_mode_change, inputs=[decode_mode], outputs=[octree_resolution])
+
        def on_export_click(file_out, file_out2, file_type, reduce_face, export_texture, target_face_num):
            if file_out is None:
                raise gr.Error('Please generate a mesh first.')
@@ -628,18 +662,22 @@ if __name__ == '__main__':
    parser.add_argument('--port', type=int, default=7860)
    parser.add_argument('--host', type=str, default='0.0.0.0')
    parser.add_argument('--device', type=str, default='cuda')
-    parser.add_argument('--mc_algo', type=str, default='
+    parser.add_argument('--mc_algo', type=str, default='mc')
    parser.add_argument('--cache-path', type=str, default='gradio_cache')
    parser.add_argument('--enable_t23d', action='store_true')
    parser.add_argument('--disable_tex', action='store_true')
+    parser.add_argument('--enable_flashvdm', action='store_true')
    parser.add_argument('--compile', action='store_true')
+    parser.add_argument('--low_vram_mode', action='store_true')
    args = parser.parse_args()
 
+    args.enable_flashvdm = True
    SAVE_DIR = args.cache_path
    os.makedirs(SAVE_DIR, exist_ok=True)
 
    CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
    MV_MODE = 'mv' in args.model_path
+    TURBO_MODE = 'turbo' in args.subfolder
 
    HTML_HEIGHT = 690 if MV_MODE else 650
    HTML_WIDTH = 500
@@ -662,14 +700,15 @@ if __name__ == '__main__':
    example_mvs = get_example_mv_list()
 
    SUPPORTED_FORMATS = ['glb', 'obj', 'ply', 'stl']
-
-    args.disable_tex = True
+
    HAS_TEXTUREGEN = False
    if not args.disable_tex:
        try:
            from hy3dgen.texgen import Hunyuan3DPaintPipeline
 
            texgen_worker = Hunyuan3DPaintPipeline.from_pretrained(args.texgen_model_path)
+            if args.low_vram_mode:
+                texgen_worker.enable_model_cpu_offload()
            # Not help much, ignore for now.
            # if args.compile:
            #     texgen_worker.models['delight_model'].pipeline.unet.compile()
@@ -699,9 +738,12 @@ if __name__ == '__main__':
    i23d_worker = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
        args.model_path,
        subfolder=args.subfolder,
-        use_safetensors=
+        use_safetensors=True,
        device=args.device,
    )
+    if args.enable_flashvdm:
+        mc_algo = 'mc' if args.device in ['cpu', 'mps'] else args.mc_algo
+        i23d_worker.enable_flashvdm(mc_algo=mc_algo)
    if args.compile:
        i23d_worker.compile()
 
@@ -718,6 +760,8 @@ if __name__ == '__main__':
    app.mount("/static", StaticFiles(directory=static_dir, html=True), name="static")
    shutil.copytree('./assets/env_maps', os.path.join(static_dir, 'env_maps'), dirs_exist_ok=True)
 
+    if args.low_vram_mode:
+        torch.cuda.empty_cache()
    demo = build_app()
    app = gr.mount_gradio_app(app, demo, path="/")
-    uvicorn.run(app, host=args.host, port=args.port)
+    uvicorn.run(app, host=args.host, port=args.port, workers=1)
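Net effect of the gradio_app.py changes: a 'turbo' model subfolder flips TURBO_MODE, which retitles the app, shows an Options tab, and drops the default step count to 5; --low_vram_mode empties the CUDA cache after each generation; --enable_flashvdm (forced on here) switches the shape pipeline to the FlashVDM decoder. A minimal sketch of the mode-to-parameter mapping wired up by on_gen_mode_change and on_decode_mode_change, using only values visible in the diff:

# Sketch of the UI defaults set by the two radio callbacks above.
GEN_MODE_STEPS = {'Turbo': 5, 'Fast': 10, 'Standard': 30}
DECODE_MODE_OCTREE_RESOLUTION = {'Low': 196, 'Standard': 256, 'High': 384}

def ui_defaults(gen_mode: str, decode_mode: str) -> tuple:
    # Returns (inference steps, octree resolution) for a pair of radio choices.
    return GEN_MODE_STEPS[gen_mode], DECODE_MODE_OCTREE_RESOLUTION[decode_mode]

assert ui_defaults('Turbo', 'Standard') == (5, 256)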
hy3dgen/shapegen/__init__.py
CHANGED
@@ -13,5 +13,5 @@
 # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
 
 from .pipelines import Hunyuan3DDiTPipeline, Hunyuan3DDiTFlowMatchingPipeline
-from .postprocessors import FaceReducer, FloaterRemover, DegenerateFaceRemover
+from .postprocessors import FaceReducer, FloaterRemover, DegenerateFaceRemover
 from .preprocessors import ImageProcessorV2, IMAGE_PROCESSORS, DEFAULT_IMAGEPROCESSOR
hy3dgen/shapegen/models/__init__.py
CHANGED
@@ -25,4 +25,4 @@
 
 from .autoencoders import ShapeVAE
 from .conditioner import DualImageEncoder, SingleImageEncoder, DinoImageEncoder, CLIPImageEncoder
-from .denoisers import Hunyuan3DDiT
+from .denoisers import HunYuanDiTPlain, Hunyuan3DDiT
hy3dgen/shapegen/models/conditioner.py
CHANGED
@@ -22,7 +22,6 @@
 # fine-tuning enabling code and other elements of the foregoing made publicly available
 # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
 
-import numpy as np
 import torch
 import torch.nn as nn
 from torchvision import transforms
@@ -34,26 +33,6 @@ from transformers import (
 )
 
 
-def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
-    """
-    embed_dim: output dimension for each position
-    pos: a list of positions to be encoded: size (M,)
-    out: (M, D)
-    """
-    assert embed_dim % 2 == 0
-    omega = np.arange(embed_dim // 2, dtype=np.float64)
-    omega /= embed_dim / 2.
-    omega = 1. / 10000 ** omega  # (D/2,)
-
-    pos = pos.reshape(-1)  # (M,)
-    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product
-
-    emb_sin = np.sin(out)  # (M, D/2)
-    emb_cos = np.cos(out)  # (M, D/2)
-
-    return np.concatenate([emb_sin, emb_cos], axis=1)
-
-
 class ImageEncoder(nn.Module):
     def __init__(
         self,
@@ -88,7 +67,7 @@ class ImageEncoder(nn.Module):
         ]
         )
 
-    def forward(self, image, mask=None, value_range=(-1, 1)
+    def forward(self, image, mask=None, value_range=(-1, 1)):
         if value_range is not None:
             low, high = value_range
             image = (image - low) / (high - low)
@@ -103,7 +82,7 @@ class ImageEncoder(nn.Module):
 
         return last_hidden_state
 
-    def unconditional_embedding(self, batch_size
+    def unconditional_embedding(self, batch_size):
         device = next(self.model.parameters()).device
         dtype = next(self.model.parameters()).dtype
         zero = torch.zeros(
@@ -131,82 +110,11 @@ class DinoImageEncoder(ImageEncoder):
     std = [0.229, 0.224, 0.225]
 
 
-class DinoImageEncoderMV(DinoImageEncoder):
-    def __init__(
-        self,
-        version=None,
-        config=None,
-        use_cls_token=True,
-        image_size=224,
-        view_num=4,
-        **kwargs,
-    ):
-        super().__init__(version, config, use_cls_token, image_size, **kwargs)
-        self.view_num = view_num
-        self.num_patches = self.num_patches
-        pos = np.arange(self.view_num, dtype=np.float32)
-        view_embedding = torch.from_numpy(
-            get_1d_sincos_pos_embed_from_grid(self.model.config.hidden_size, pos)).float()
-
-        view_embedding = view_embedding.unsqueeze(1).repeat(1, self.num_patches, 1)
-        self.view_embed = view_embedding.unsqueeze(0)
-
-    def forward(self, image, mask=None, value_range=(-1, 1), view_idxs=None):
-        if value_range is not None:
-            low, high = value_range
-            image = (image - low) / (high - low)
-
-        image = image.to(self.model.device, dtype=self.model.dtype)
-
-        bs, num_views, c, h, w = image.shape
-        image = image.view(bs * num_views, c, h, w)
-
-        inputs = self.transform(image)
-        outputs = self.model(inputs)
-
-        last_hidden_state = outputs.last_hidden_state
-        last_hidden_state = last_hidden_state.view(
-            bs, num_views, last_hidden_state.shape[-2],
-            last_hidden_state.shape[-1]
-        )
-
-        view_embedding = self.view_embed.to(last_hidden_state.dtype).to(last_hidden_state.device)
-        if view_idxs is not None:
-            assert len(view_idxs) == bs
-            view_embeddings = []
-            for i in range(bs):
-                view_idx = view_idxs[i]
-                assert num_views == len(view_idx)
-                view_embeddings.append(self.view_embed[:, view_idx, ...])
-            view_embedding = torch.cat(view_embeddings, 0).to(last_hidden_state.dtype).to(last_hidden_state.device)
-
-        if num_views != self.view_num:
-            view_embedding = view_embedding[:, :num_views, ...]
-        last_hidden_state = last_hidden_state + view_embedding
-        last_hidden_state = last_hidden_state.view(bs, num_views * last_hidden_state.shape[-2],
-                                                   last_hidden_state.shape[-1])
-        return last_hidden_state
-
-    def unconditional_embedding(self, batch_size, view_idxs=None, **kwargs):
-        device = next(self.model.parameters()).device
-        dtype = next(self.model.parameters()).dtype
-        zero = torch.zeros(
-            batch_size,
-            self.num_patches * len(view_idxs[0]),
-            self.model.config.hidden_size,
-            device=device,
-            dtype=dtype,
-        )
-        return zero
-
-
 def build_image_encoder(config):
     if config['type'] == 'CLIPImageEncoder':
         return CLIPImageEncoder(**config['kwargs'])
     elif config['type'] == 'DinoImageEncoder':
         return DinoImageEncoder(**config['kwargs'])
-    elif config['type'] == 'DinoImageEncoderMV':
-        return DinoImageEncoderMV(**config['kwargs'])
     else:
         raise ValueError(f'Unknown image encoder type: {config["type"]}')
 
@@ -221,17 +129,17 @@ class DualImageEncoder(nn.Module):
         self.main_image_encoder = build_image_encoder(main_image_encoder)
         self.additional_image_encoder = build_image_encoder(additional_image_encoder)
 
-    def forward(self, image, mask=None
+    def forward(self, image, mask=None):
         outputs = {
-            'main': self.main_image_encoder(image, mask=mask
-            'additional': self.additional_image_encoder(image, mask=mask
+            'main': self.main_image_encoder(image, mask=mask),
+            'additional': self.additional_image_encoder(image, mask=mask),
         }
         return outputs
 
-    def unconditional_embedding(self, batch_size
+    def unconditional_embedding(self, batch_size):
         outputs = {
-            'main': self.main_image_encoder.unconditional_embedding(batch_size
-            'additional': self.additional_image_encoder.unconditional_embedding(batch_size
+            'main': self.main_image_encoder.unconditional_embedding(batch_size),
+            'additional': self.additional_image_encoder.unconditional_embedding(batch_size),
         }
         return outputs
 
@@ -244,14 +152,14 @@ class SingleImageEncoder(nn.Module):
         super().__init__()
         self.main_image_encoder = build_image_encoder(main_image_encoder)
 
-    def forward(self, image, mask=None
+    def forward(self, image, mask=None):
         outputs = {
-            'main': self.main_image_encoder(image, mask=mask
+            'main': self.main_image_encoder(image, mask=mask),
        }
        return outputs
 
-    def unconditional_embedding(self, batch_size
+    def unconditional_embedding(self, batch_size):
        outputs = {
-            'main': self.main_image_encoder.unconditional_embedding(batch_size
+            'main': self.main_image_encoder.unconditional_embedding(batch_size),
        }
        return outputs
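With DinoImageEncoderMV and its sinusoidal view embedding removed, build_image_encoder only dispatches to the single-image CLIP and DINO encoders. A minimal sketch of that config-driven dispatch; the checkpoint name, image size, and other kwargs below are illustrative assumptions, not values taken from this diff:

from hy3dgen.shapegen.models.conditioner import build_image_encoder

# 'type' must match one of the branches kept in build_image_encoder above;
# the DINOv2 checkpoint and kwargs are assumptions for illustration.
encoder_config = {
    'type': 'DinoImageEncoder',
    'kwargs': {'version': 'facebook/dinov2-giant', 'use_cls_token': True, 'image_size': 518},
}
encoder = build_image_encoder(encoder_config)

# encoder(images) returns patch-token conditioning for the diffusion model;
# encoder.unconditional_embedding(batch_size) returns the zero embedding used
# for the classifier-free-guidance branch in encode_cond (pipelines.py).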
hy3dgen/shapegen/models/denoisers/hunyuan3ddit.py
CHANGED
@@ -60,15 +60,6 @@ def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 10
     return embedding
 
 
-class GELU(nn.Module):
-    def __init__(self, approximate='tanh'):
-        super().__init__()
-        self.approximate = approximate
-
-    def forward(self, x: Tensor) -> Tensor:
-        return nn.functional.gelu(x.contiguous(), approximate=self.approximate)
-
-
 class MLPEmbedder(nn.Module):
     def __init__(self, in_dim: int, hidden_dim: int):
         super().__init__()
@@ -171,7 +162,7 @@ class DoubleStreamBlock(nn.Module):
         self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
         self.img_mlp = nn.Sequential(
             nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
-            GELU(approximate="tanh"),
+            nn.GELU(approximate="tanh"),
             nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
         )
 
@@ -182,7 +173,7 @@ class DoubleStreamBlock(nn.Module):
         self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
         self.txt_mlp = nn.Sequential(
             nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
-            GELU(approximate="tanh"),
+            nn.GELU(approximate="tanh"),
             nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
         )
 
@@ -248,7 +239,7 @@ class SingleStreamBlock(nn.Module):
         self.hidden_size = hidden_size
         self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
 
-        self.mlp_act = GELU(approximate="tanh")
+        self.mlp_act = nn.GELU(approximate="tanh")
         self.modulation = Modulation(hidden_size, double=False)
 
     def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
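The removed custom GELU wrapper only forced .contiguous() before calling the functional op; PyTorch's built-in nn.GELU supports the same tanh approximation directly, so the swap above is behavior-preserving. A quick standalone check (plain PyTorch, no Hunyuan3D code required):

import torch
import torch.nn as nn

x = torch.randn(4, 8)
builtin = nn.GELU(approximate="tanh")(x)
functional = nn.functional.gelu(x.contiguous(), approximate="tanh")
assert torch.allclose(builtin, functional)  # identical activation, one less custom module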
hy3dgen/shapegen/pipelines.py
CHANGED
@@ -24,12 +24,11 @@ import trimesh
 import yaml
 from PIL import Image
 from diffusers.utils.torch_utils import randn_tensor
-from diffusers.utils.import_utils import is_accelerate_version, is_accelerate_available
 from tqdm import tqdm
 
 from .models.autoencoders import ShapeVAE
 from .models.autoencoders import SurfaceExtractors
-from .utils import logger, synchronize_timer
+from .utils import logger, synchronize_timer
 
 
 def retrieve_timesteps(
@@ -128,9 +127,6 @@ def instantiate_from_config(config, **kwargs):
 
 
 class Hunyuan3DDiTPipeline:
-    model_cpu_offload_seq = "conditioner->model->vae"
-    _exclude_from_cpu_offload = []
-
     @classmethod
     @synchronize_timer('Hunyuan3DDiTPipeline Model Loading')
     def from_single_file(
@@ -211,12 +207,34 @@ class Hunyuan3DDiTPipeline:
             dtype=dtype,
             device=device,
         )
-
-
-
-
-
-        )
+        original_model_path = model_path
+        # try local path
+        base_dir = os.environ.get('HY3DGEN_MODELS', '~/.cache/hy3dgen')
+        model_path = os.path.expanduser(os.path.join(base_dir, model_path, subfolder))
+        logger.info(f'Try to load model from local path: {model_path}')
+        if not os.path.exists(model_path):
+            logger.info('Model path not exists, try to download from huggingface')
+            try:
+                import huggingface_hub
+                # download from huggingface
+                path = huggingface_hub.snapshot_download(repo_id=original_model_path)
+                model_path = os.path.join(path, subfolder)
+            except ImportError:
+                logger.warning(
+                    "You need to install HuggingFace Hub to load models from the hub."
+                )
+                raise RuntimeError(f"Model path {model_path} not found")
+            except Exception as e:
+                raise e
+
+        if not os.path.exists(model_path):
+            raise FileNotFoundError(f"Model path {original_model_path} not found")
+
+        extension = 'ckpt' if not use_safetensors else 'safetensors'
+        variant = '' if variant is None else f'.{variant}'
+        ckpt_name = f'model{variant}.{extension}'
+        config_path = os.path.join(model_path, 'config.yaml')
+        ckpt_path = os.path.join(model_path, ckpt_name)
         return cls.from_single_file(
             ckpt_path,
             config_path,
@@ -261,18 +279,12 @@
         if enabled:
             model_path = self.kwargs['from_pretrained_kwargs']['model_path']
             turbo_vae_mapping = {
-                'Hunyuan3D-2':
-                'Hunyuan3D-
-                'Hunyuan3D-2mini': ('tencent/Hunyuan3D-2mini', 'hunyuan3d-vae-v2-mini-turbo'),
+                'Hunyuan3D-2': 'hunyuan3d-vae-v2-0-turbo',
+                'Hunyuan3D-2s': 'hunyuan3d-vae-v2-s-turbo'
             }
             model_name = model_path.split('/')[-1]
             if replace_vae and model_name in turbo_vae_mapping:
-                model_path, subfolder
-                self.vae = ShapeVAE.from_pretrained(
-                    model_path, subfolder=subfolder,
-                    use_safetensors=self.kwargs['from_pretrained_kwargs']['use_safetensors'],
-                    device=self.device,
-                )
+                self.vae = ShapeVAE.from_pretrained(model_path, subfolder=turbo_vae_mapping[model_name])
             self.vae.enable_flashvdm_decoder(
                 enabled=enabled,
                 adaptive_kv_selection=adaptive_kv_selection,
@@ -282,146 +294,33 @@
         else:
             model_path = self.kwargs['from_pretrained_kwargs']['model_path']
             vae_mapping = {
-                'Hunyuan3D-2':
-                'Hunyuan3D-
-                'Hunyuan3D-2mini': ('tencent/Hunyuan3D-2mini', 'hunyuan3d-vae-v2-mini'),
+                'Hunyuan3D-2': 'hunyuan3d-vae-v2-0',
+                'Hunyuan3D-2s': 'hunyuan3d-vae-v2-s'
             }
             model_name = model_path.split('/')[-1]
             if model_name in vae_mapping:
-                model_path, subfolder
-                self.vae = ShapeVAE.from_pretrained(model_path, subfolder=subfolder)
+                self.vae = ShapeVAE.from_pretrained(model_path, subfolder=vae_mapping[model_name])
             self.vae.enable_flashvdm_decoder(enabled=False)
 
     def to(self, device=None, dtype=None):
-        if dtype is not None:
-            self.dtype = dtype
-            self.vae.to(dtype=dtype)
-            self.model.to(dtype=dtype)
-            self.conditioner.to(dtype=dtype)
         if device is not None:
             self.device = torch.device(device)
             self.vae.to(device)
             self.model.to(device)
             self.conditioner.to(device)
-
-    @property
-    def _execution_device(self):
-        r"""
-        Returns the device on which the pipeline's models will be executed. After calling
-        [`~DiffusionPipeline.enable_sequential_cpu_offload`] the execution device can only be inferred from
-        Accelerate's module hooks.
-        """
-        for name, model in self.components.items():
-            if not isinstance(model, torch.nn.Module) or name in self._exclude_from_cpu_offload:
-                continue
-
-            if not hasattr(model, "_hf_hook"):
-                return self.device
-            for module in model.modules():
-                if (
-                    hasattr(module, "_hf_hook")
-                    and hasattr(module._hf_hook, "execution_device")
-                    and module._hf_hook.execution_device is not None
-                ):
-                    return torch.device(module._hf_hook.execution_device)
-        return self.device
-
-    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
-        r"""
-        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
-        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
-        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
-        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
-
-        Arguments:
-            gpu_id (`int`, *optional*):
-                The ID of the accelerator that shall be used in inference. If not specified, it will default to 0.
-            device (`torch.Device` or `str`, *optional*, defaults to "cuda"):
-                The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will
-                default to "cuda".
-        """
-        if self.model_cpu_offload_seq is None:
-            raise ValueError(
-                "Model CPU offload cannot be enabled because no `model_cpu_offload_seq` class attribute is set."
-            )
-
-        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
-            from accelerate import cpu_offload_with_hook
-        else:
-            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
-
-        torch_device = torch.device(device)
-        device_index = torch_device.index
-
-        if gpu_id is not None and device_index is not None:
-            raise ValueError(
-                f"You have passed both `gpu_id`={gpu_id} and an index as part of the passed device `device`={device}"
-                f"Cannot pass both. Please make sure to either not define `gpu_id` or not pass the index as part of the device: `device`={torch_device.type}"
-            )
-
-        # _offload_gpu_id should be set to passed gpu_id (or id in passed `device`) or default to previously set id or default to 0
-        self._offload_gpu_id = gpu_id or torch_device.index or getattr(self, "_offload_gpu_id", 0)
-
-        device_type = torch_device.type
-        device = torch.device(f"{device_type}:{self._offload_gpu_id}")
-
-        if self.device.type != "cpu":
-            self.to("cpu")
-            device_mod = getattr(torch, self.device.type, None)
-            if hasattr(device_mod, "empty_cache") and device_mod.is_available():
-                device_mod.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
-
-        all_model_components = {k: v for k, v in self.components.items() if isinstance(v, torch.nn.Module)}
-
-        self._all_hooks = []
-        hook = None
-        for model_str in self.model_cpu_offload_seq.split("->"):
-            model = all_model_components.pop(model_str, None)
-            if not isinstance(model, torch.nn.Module):
-                continue
-
-            _, hook = cpu_offload_with_hook(model, device, prev_module_hook=hook)
-            self._all_hooks.append(hook)
-
-        # CPU offload models that are not in the seq chain unless they are explicitly excluded
-        # these models will stay on CPU until maybe_free_model_hooks is called
-        # some models cannot be in the seq chain because they are iteratively called, such as controlnet
-        for name, model in all_model_components.items():
-            if not isinstance(model, torch.nn.Module):
-                continue
-
-            if name in self._exclude_from_cpu_offload:
-                model.to(device)
-            else:
-                _, hook = cpu_offload_with_hook(model, device)
-                self._all_hooks.append(hook)
-
-    def maybe_free_model_hooks(self):
-        r"""
-        Function that offloads all components, removes all model hooks that were added when using
-        `enable_model_cpu_offload` and then applies them again. In case the model has not been offloaded this function
-        is a no-op. Make sure to add this function to the end of the `__call__` function of your pipeline so that it
-        functions correctly when applying enable_model_cpu_offload.
-        """
-        if not hasattr(self, "_all_hooks") or len(self._all_hooks) == 0:
-            # `enable_model_cpu_offload` has not be called, so silently do nothing
-            return
-
-        for hook in self._all_hooks:
-            # offload model and remove hook from model
-            hook.offload()
-            hook.remove()
-
-        # make sure the model is in the same state as before calling it
-        self.enable_model_cpu_offload()
+        if dtype is not None:
+            self.dtype = dtype
+            self.vae.to(dtype=dtype)
+            self.model.to(dtype=dtype)
+            self.conditioner.to(dtype=dtype)
 
     @synchronize_timer('Encode cond')
-    def encode_cond(self, image,
+    def encode_cond(self, image, mask, do_classifier_free_guidance, dual_guidance):
         bsz = image.shape[0]
-        cond = self.conditioner(image=image,
+        cond = self.conditioner(image=image, mask=mask)
 
         if do_classifier_free_guidance:
-            un_cond = self.conditioner.unconditional_embedding(bsz
+            un_cond = self.conditioner.unconditional_embedding(bsz)
 
         if dual_guidance:
             un_cond_drop_main = copy.deepcopy(un_cond)
@@ -437,6 +336,8 @@ class Hunyuan3DDiTPipeline:
 
             cond = cat_recursive(cond, un_cond_drop_main, un_cond)
         else:
+            un_cond = self.conditioner.unconditional_embedding(bsz, **additional_cond_inputs)
+
             def cat_recursive(a, b):
                 if isinstance(a, torch.Tensor):
                     return torch.cat([a, b], dim=0).to(self.dtype)
@@ -482,27 +383,25 @@
         latents = latents * getattr(self.scheduler, 'init_noise_sigma', 1.0)
         return latents
 
-    def prepare_image(self, image)
+    def prepare_image(self, image):
         if isinstance(image, str) and not os.path.exists(image):
             raise FileNotFoundError(f"Couldn't find image at path {image}")
 
         if not isinstance(image, list):
             image = [image]
-
-
+        image_pts = []
+        mask_pts = []
         for img in image:
-
-
-
-        cond_input = {k: [] for k in outputs[0].keys()}
-        for output in outputs:
-            for key, value in output.items():
-                cond_input[key].append(value)
-        for key, value in cond_input.items():
-            if isinstance(value[0], torch.Tensor):
-                cond_input[key] = torch.cat(value, dim=0)
+            image_pt, mask_pt = self.image_processor(img, return_mask=True)
+            image_pts.append(image_pt)
+            mask_pts.append(mask_pt)
 
-
+        image_pts = torch.cat(image_pts, dim=0).to(self.device, dtype=self.dtype)
+        if mask_pts[0] is not None:
+            mask_pts = torch.cat(mask_pts, dim=0).to(self.device, dtype=self.dtype)
+        else:
+            mask_pts = None
+        return image_pts, mask_pts
 
     def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
         """
@@ -575,14 +474,10 @@
             getattr(self.model, 'guidance_cond_proj_dim', None) is None
         dual_guidance = dual_guidance_scale >= 0 and dual_guidance
 
-
-
-
-
-            additional_cond_inputs=cond_inputs,
-            do_classifier_free_guidance=do_classifier_free_guidance,
-            dual_guidance=False,
-        )
+        image, mask = self.prepare_image(image)
+        cond = self.encode_cond(image=image,
+                                do_classifier_free_guidance=do_classifier_free_guidance,
+                                dual_guidance=dual_guidance)
         batch_size = image.shape[0]
 
         t_dtype = torch.long
@@ -640,17 +535,7 @@
             box_v, mc_level, num_chunks, octree_resolution, mc_algo,
         )
 
-    def _export(
-        self,
-        latents,
-        output_type='trimesh',
-        box_v=1.01,
-        mc_level=0.0,
-        num_chunks=20000,
-        octree_resolution=256,
-        mc_algo='mc',
-        enable_pbar=True
-    ):
+    def _export(self, latents, output_type, box_v, mc_level, num_chunks, octree_resolution, mc_algo, enable_pbar=True):
         if not output_type == "latent":
             latents = 1. / self.vae.scale_factor * latents
             latents = self.vae(latents)
@@ -677,7 +562,7 @@ class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline):
     @torch.inference_mode()
     def __call__(
         self,
-        image: Union[str, List[str], Image.Image
+        image: Union[str, List[str], Image.Image] = None,
         num_inference_steps: int = 50,
         timesteps: List[int] = None,
         sigmas: List[float] = None,
@@ -705,11 +590,10 @@
             self.model.guidance_embed is True
         )
 
-
-        image = cond_inputs.pop('image')
+        image, mask = self.prepare_image(image)
         cond = self.encode_cond(
             image=image,
-
+            mask=mask,
             do_classifier_free_guidance=do_classifier_free_guidance,
             dual_guidance=False,
         )
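The loading path now mirrors what gradio_app.py does: from_pretrained first checks the directory pointed to by HY3DGEN_MODELS (default ~/.cache/hy3dgen) and falls back to huggingface_hub.snapshot_download. A minimal usage sketch; the turbo subfolder name, the example image path, and the list-of-meshes return value are assumptions inferred from the app code above, not guaranteed by this diff:

from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline

# Repo id, use_safetensors, and device match the from_pretrained call in gradio_app.py;
# the turbo subfolder name is an assumption.
pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
    'tencent/Hunyuan3D-2mv',
    subfolder='hunyuan3d-dit-v2-mv-turbo',
    use_safetensors=True,
    device='cuda',
)
pipeline.enable_flashvdm(mc_algo='mc')  # same call the app makes when FlashVDM is enabled

# prepare_image/encode_cond now run inside __call__, so a plain image path suffices;
# 5 steps is the Turbo default in the UI. Return format assumed to be a list of meshes.
mesh = pipeline(image='assets/example_images/004.png', num_inference_steps=5)[0]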
hy3dgen/shapegen/postprocessors.py
CHANGED
@@ -12,16 +12,13 @@
 # fine-tuning enabling code and other elements of the foregoing made publicly available
 # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
 
-import os
 import tempfile
 from typing import Union
 
-import numpy as np
 import pymeshlab
-import torch
 import trimesh
 
-from .models.
+from .models.vae import Latent2MeshOutput
 from .utils import synchronize_timer
 
 
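The import cleanup leaves the postprocessors themselves untouched, and they are still exported from hy3dgen.shapegen (see the __init__.py hunk above). A usage sketch based on the upstream Hunyuan3D-2 examples; treat the exact call signatures and the file names as assumptions:

import trimesh
from hy3dgen.shapegen import FaceReducer, FloaterRemover, DegenerateFaceRemover

mesh = trimesh.load('shape.glb')          # hypothetical input mesh
mesh = FloaterRemover()(mesh)             # drop small disconnected components
mesh = DegenerateFaceRemover()(mesh)      # remove degenerate faces
mesh = FaceReducer()(mesh)                # simplify to the default face budget
mesh.export('shape_clean.glb')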
hy3dgen/shapegen/preprocessors.py
CHANGED
@@ -87,7 +87,9 @@ class ImageProcessorV2:
         mask = mask.clip(0, 255).astype(np.uint8)
         return result, mask
 
-    def
+    def __call__(self, image, border_ratio=0.15, to_tensor=True, return_mask=False, **kwargs):
+        if self.border_ratio is not None:
+            border_ratio = self.border_ratio
         if isinstance(image, str):
             image = cv2.imread(image, cv2.IMREAD_UNCHANGED)
         image, mask = self.recenter(image, border_ratio=border_ratio)
@@ -104,64 +106,13 @@ class ImageProcessorV2:
         if to_tensor:
             image = array_to_tensor(image)
             mask = array_to_tensor(mask)
-
-
-
-        if self.border_ratio is not None:
-            border_ratio = self.border_ratio
-        image, mask = self.load_image(image, border_ratio=border_ratio, to_tensor=to_tensor)
-        outputs = {
-            'image': image,
-            'mask': mask
-        }
-        return outputs
-
-
-class MVImageProcessorV2(ImageProcessorV2):
-    """
-    view order: front, front clockwise 90, back, front clockwise 270
-    """
-    return_view_idx = True
-
-    def __init__(self, size=512, border_ratio=None):
-        super().__init__(size, border_ratio)
-        self.view2idx = {
-            'front': 0,
-            'left': 1,
-            'back': 2,
-            'right': 3
-        }
-
-    def __call__(self, image_dict, border_ratio=0.15, to_tensor=True, **kwargs):
-        if self.border_ratio is not None:
-            border_ratio = self.border_ratio
-
-        images = []
-        masks = []
-        view_idxs = []
-        for idx, (view_tag, image) in enumerate(image_dict.items()):
-            view_idxs.append(self.view2idx[view_tag])
-            image, mask = self.load_image(image, border_ratio=border_ratio, to_tensor=to_tensor)
-            images.append(image)
-            masks.append(mask)
-
-        zipped_lists = zip(view_idxs, images, masks)
-        sorted_zipped_lists = sorted(zipped_lists)
-        view_idxs, images, masks = zip(*sorted_zipped_lists)
-
-        image = torch.cat(images, 0).unsqueeze(0)
-        mask = torch.cat(masks, 0).unsqueeze(0)
-        outputs = {
-            'image': image,
-            'mask': mask,
-            'view_idxs': view_idxs
-        }
-        return outputs
+        if return_mask:
+            return image, mask
+        return image
 
 
 IMAGE_PROCESSORS = {
     "v2": ImageProcessorV2,
-    'mv_v2': MVImageProcessorV2,
 }
 
 DEFAULT_IMAGEPROCESSOR = 'v2'
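ImageProcessorV2.__call__ now returns the tensor directly (optionally with its mask) instead of a dict, which is exactly what the new prepare_image in pipelines.py consumes. A short sketch of the single-image path; the constructor arguments follow the (size, border_ratio) signature visible in the removed MVImageProcessorV2, and the example image path is hypothetical:

from hy3dgen.shapegen.preprocessors import ImageProcessorV2

processor = ImageProcessorV2(size=512, border_ratio=0.15)

# With return_mask=True the processor hands back (image, mask) tensors,
# matching the self.image_processor(img, return_mask=True) call in prepare_image.
image_pt, mask_pt = processor('assets/example_images/004.png', return_mask=True)

# The default call returns only the recentered, resized image tensor.
image_pt = processor('assets/example_images/004.png')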