ZeqiangLai committed
Commit b530233 · 2 Parent(s): 6f34049 c918288

Merge branch 'main' of https://huggingface.co/spaces/tencent/Hunyuan3D-2mv

gradio_app.py CHANGED
@@ -1,10 +1,23 @@
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
+ # except for the third-party components listed below.
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
+ # in the respective licenses of these third-party components.
+ # Users must comply with all terms and conditions of original licenses of these third-party
+ # components and must ensure that the usage of the third party components adheres to
+ # all relevant laws and regulations.
+
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
+ # their software and algorithms, including trained model weights, parameters (including
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
+
  import os
  import random
  import shutil
  import time
  from glob import glob
  from pathlib import Path
- import uuid

  import gradio as gr
  import torch
@@ -12,6 +25,7 @@ import trimesh
  import uvicorn
  from fastapi import FastAPI
  from fastapi.staticfiles import StaticFiles
+ import uuid

  from hy3dgen.shapegen.utils import logger

@@ -28,6 +42,7 @@ if True:
      print('install custom')
      subprocess.run(shlex.split("pip install custom_rasterizer-0.1-cp310-cp310-linux_x86_64.whl"), check=True)

+
  def get_example_img_list():
      print('Loading example img list ...')
      return sorted(glob('./assets/example_images/**/*.png', recursive=True))
@@ -47,7 +62,7 @@ def get_example_mv_list():
      root = './assets/example_mv_images'
      for mv_dir in os.listdir(root):
          view_list = []
-         for view in ['正视图', '背视图', '左视图', '右视图']:
+         for view in ['front', 'back', 'left', 'right']:
              path = os.path.join(root, mv_dir, f'{view}.png')
              if os.path.exists(path):
                  view_list.append(path)
@@ -57,18 +72,6 @@ def get_example_mv_list():
      return mv_list


- # def gen_save_folder(max_size=60):
- #     os.makedirs(SAVE_DIR, exist_ok=True)
- #     exists = set(int(_) for _ in os.listdir(SAVE_DIR) if _.isdigit())
- #     cur_id = min(set(range(max_size)) - exists) if len(exists) < max_size else -1
- #     if os.path.exists(f"{SAVE_DIR}/{(cur_id + 1) % max_size}"):
- #         shutil.rmtree(f"{SAVE_DIR}/{(cur_id + 1) % max_size}")
- #         print(f"remove {SAVE_DIR}/{(cur_id + 1) % max_size} success !!!")
- #     save_folder = f"{SAVE_DIR}/{max(0, cur_id)}"
- #     os.makedirs(save_folder, exist_ok=True)
- #     print(f"mkdir {save_folder} suceess !!!")
- #     return save_folder
-
  def gen_save_folder(max_size=200):
      os.makedirs(SAVE_DIR, exist_ok=True)
@@ -139,7 +142,7 @@ def build_model_viewer_html(save_folder, height=660, width=790, textured=False):
      </div>
      """

- @spaces.GPU(duration=60)
+
  def _gen_shape(
      caption=None,
      image=None,
@@ -246,7 +249,7 @@ def _gen_shape(
      main_image = image if not MV_MODE else image['front']
      return mesh, main_image, save_folder, stats, seed

- @spaces.GPU(duration=70)
+
  def generation_all(
      caption=None,
      image=None,
@@ -301,7 +304,8 @@ def generation_all(
      path_textured = export_mesh(textured_mesh, save_folder, textured=True)
      model_viewer_html_textured = build_model_viewer_html(save_folder, height=HTML_HEIGHT, width=HTML_WIDTH,
                                                           textured=True)
-
+     if args.low_vram_mode:
+         torch.cuda.empty_cache()
      return (
          gr.update(value=path),
          gr.update(value=path_textured),
@@ -310,7 +314,7 @@ def generation_all(
          seed,
      )

- @spaces.GPU(duration=60)
+
  def shape_generation(
      caption=None,
      image=None,
@@ -347,7 +351,8 @@ def shape_generation(

      path = export_mesh(mesh, save_folder, textured=False)
      model_viewer_html = build_model_viewer_html(save_folder, height=HTML_HEIGHT, width=HTML_WIDTH)
-
+     if args.low_vram_mode:
+         torch.cuda.empty_cache()
      return (
          gr.update(value=path),
          model_viewer_html,
@@ -362,6 +367,8 @@ def build_app():
      title = 'Hunyuan3D-2mv: Image to 3D Generation with 1-4 Views'
      if 'mini' in args.subfolder:
          title = 'Hunyuan3D-2mini: Strong 0.6B Image to Shape Generator'
+     if TURBO_MODE:
+         title = title.replace(':', '-Turbo: Fast ')

      title_html = f"""
      <div style="font-size: 2em; font-weight: bold; text-align: center; margin-bottom: 5px">
@@ -386,11 +393,11 @@ def build_app():
      .mv-image button .wrap {
          font-size: 10px;
      }
-
+
      .mv-image .icon-wrap {
          width: 20px;
      }
-
+
      """

      with gr.Blocks(theme=gr.themes.Base(), title='Hunyuan-3D-2.0', analytics_enabled=False, css=custom_css) as demo:
@@ -430,7 +437,15 @@ def build_app():
                  file_out = gr.File(label="File", visible=False)
                  file_out2 = gr.File(label="File", visible=False)

-                 with gr.Tabs(selected='tab_export'):
+                 with gr.Tabs(selected='tab_options' if TURBO_MODE else 'tab_export'):
+                     with gr.Tab("Options", id='tab_options', visible=TURBO_MODE):
+                         gen_mode = gr.Radio(label='Generation Mode',
+                                             info='Recommendation: Turbo for most cases, Fast for very complex cases, Standard seldom use.',
+                                             choices=['Turbo', 'Fast', 'Standard'], value='Turbo')
+                         decode_mode = gr.Radio(label='Decoding Mode',
+                                                info='The resolution for exporting mesh from generated vectset',
+                                                choices=['Low', 'Standard', 'High'],
+                                                value='Standard')
                      with gr.Tab('Advanced Options', id='tab_advanced_options'):
                          with gr.Row():
                              check_box_rembg = gr.Checkbox(value=True, label='Remove Background', min_width=100)
@@ -446,14 +461,13 @@ def build_app():
                          with gr.Row():
                              num_steps = gr.Slider(maximum=100,
                                                    minimum=1,
-                                                   value=30,
+                                                   value=5 if 'turbo' in args.subfolder else 30,
                                                    step=1, label='Inference Steps')
                              octree_resolution = gr.Slider(maximum=512, minimum=16, value=256, label='Octree Resolution')
                          with gr.Row():
                              cfg_scale = gr.Number(value=5.0, label='Guidance Scale', min_width=100)
-                             num_chunks = gr.Slider(maximum=5000000, minimum=1000, value=200000,
+                             num_chunks = gr.Slider(maximum=5000000, minimum=1000, value=8000,
                                                     label='Number of Chunks', min_width=100)
-
                      with gr.Tab("Export", id='tab_export'):
                          with gr.Row():
                              file_type = gr.Dropdown(label='File Type', choices=SUPPORTED_FORMATS,
@@ -573,6 +587,26 @@ def build_app():
              outputs=[tabs_output],
          )

+         def on_gen_mode_change(value):
+             if value == 'Turbo':
+                 return gr.update(value=5)
+             elif value == 'Fast':
+                 return gr.update(value=10)
+             else:
+                 return gr.update(value=30)
+
+         gen_mode.change(on_gen_mode_change, inputs=[gen_mode], outputs=[num_steps])
+
+         def on_decode_mode_change(value):
+             if value == 'Low':
+                 return gr.update(value=196)
+             elif value == 'Standard':
+                 return gr.update(value=256)
+             else:
+                 return gr.update(value=384)
+
+         decode_mode.change(on_decode_mode_change, inputs=[decode_mode], outputs=[octree_resolution])
+
          def on_export_click(file_out, file_out2, file_type, reduce_face, export_texture, target_face_num):
              if file_out is None:
                  raise gr.Error('Please generate a mesh first.')
@@ -628,18 +662,22 @@ if __name__ == '__main__':
      parser.add_argument('--port', type=int, default=7860)
      parser.add_argument('--host', type=str, default='0.0.0.0')
      parser.add_argument('--device', type=str, default='cuda')
-     parser.add_argument('--mc_algo', type=str, default='dmc')
+     parser.add_argument('--mc_algo', type=str, default='mc')
      parser.add_argument('--cache-path', type=str, default='gradio_cache')
      parser.add_argument('--enable_t23d', action='store_true')
      parser.add_argument('--disable_tex', action='store_true')
+     parser.add_argument('--enable_flashvdm', action='store_true')
      parser.add_argument('--compile', action='store_true')
+     parser.add_argument('--low_vram_mode', action='store_true')
      args = parser.parse_args()

+     args.enable_flashvdm = True
      SAVE_DIR = args.cache_path
      os.makedirs(SAVE_DIR, exist_ok=True)

      CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
      MV_MODE = 'mv' in args.model_path
+     TURBO_MODE = 'turbo' in args.subfolder

      HTML_HEIGHT = 690 if MV_MODE else 650
      HTML_WIDTH = 500
@@ -662,14 +700,15 @@ if __name__ == '__main__':
      example_mvs = get_example_mv_list()

      SUPPORTED_FORMATS = ['glb', 'obj', 'ply', 'stl']
-
-     args.disable_tex = True
+
      HAS_TEXTUREGEN = False
      if not args.disable_tex:
          try:
              from hy3dgen.texgen import Hunyuan3DPaintPipeline

              texgen_worker = Hunyuan3DPaintPipeline.from_pretrained(args.texgen_model_path)
+             if args.low_vram_mode:
+                 texgen_worker.enable_model_cpu_offload()
              # Not help much, ignore for now.
              # if args.compile:
              #     texgen_worker.models['delight_model'].pipeline.unet.compile()
@@ -699,9 +738,12 @@ if __name__ == '__main__':
          i23d_worker = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
              args.model_path,
              subfolder=args.subfolder,
-             use_safetensors=False,
+             use_safetensors=True,
              device=args.device,
          )
+         if args.enable_flashvdm:
+             mc_algo = 'mc' if args.device in ['cpu', 'mps'] else args.mc_algo
+             i23d_worker.enable_flashvdm(mc_algo=mc_algo)
          if args.compile:
              i23d_worker.compile()
@@ -718,6 +760,8 @@ if __name__ == '__main__':
      app.mount("/static", StaticFiles(directory=static_dir, html=True), name="static")
      shutil.copytree('./assets/env_maps', os.path.join(static_dir, 'env_maps'), dirs_exist_ok=True)

+     if args.low_vram_mode:
+         torch.cuda.empty_cache()
      demo = build_app()
      app = gr.mount_gradio_app(app, demo, path="/")
-     uvicorn.run(app, host=args.host, port=args.port)
+     uvicorn.run(app, host=args.host, port=args.port, workers=1)
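Taken together, the gradio_app.py side of this merge keys a Turbo preset off the checkpoint subfolder (TURBO_MODE), turns on FlashVDM decoding for the shape pipeline, and adds an opt-in low-VRAM mode. A minimal sketch of the same calls outside the app; the turbo subfolder name and the pipeline's list-of-meshes return value are assumptions taken from upstream conventions, not from this diff:

import torch
from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline

device = 'cuda'
pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
    'tencent/Hunyuan3D-2',
    subfolder='hunyuan3d-dit-v2-0-turbo',  # assumed turbo checkpoint; 'turbo' in the name flips TURBO_MODE in the app
    use_safetensors=True,                  # this merge switches the app from False to True
    device=device,
)
# New default is 'mc'; the app additionally forces 'mc' on cpu/mps, since the dmc kernels need CUDA.
pipeline.enable_flashvdm(mc_algo='mc')

mesh = pipeline(image='input.png', num_inference_steps=5)[0]  # 5 steps is the Turbo preset
mesh.export('output.glb')
torch.cuda.empty_cache()  # what --low_vram_mode does after each generation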
hy3dgen/shapegen/__init__.py CHANGED
@@ -13,5 +13,5 @@
  # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.

  from .pipelines import Hunyuan3DDiTPipeline, Hunyuan3DDiTFlowMatchingPipeline
- from .postprocessors import FaceReducer, FloaterRemover, DegenerateFaceRemover, MeshSimplifier
+ from .postprocessors import FaceReducer, FloaterRemover, DegenerateFaceRemover
  from .preprocessors import ImageProcessorV2, IMAGE_PROCESSORS, DEFAULT_IMAGEPROCESSOR
hy3dgen/shapegen/models/__init__.py CHANGED
@@ -25,4 +25,4 @@

  from .autoencoders import ShapeVAE
  from .conditioner import DualImageEncoder, SingleImageEncoder, DinoImageEncoder, CLIPImageEncoder
- from .denoisers import Hunyuan3DDiT
+ from .denoisers import HunYuanDiTPlain, Hunyuan3DDiT
hy3dgen/shapegen/models/conditioner.py CHANGED
@@ -22,7 +22,6 @@
  # fine-tuning enabling code and other elements of the foregoing made publicly available
  # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.

- import numpy as np
  import torch
  import torch.nn as nn
  from torchvision import transforms
@@ -34,26 +33,6 @@ from transformers import (
  )


- def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
-     """
-     embed_dim: output dimension for each position
-     pos: a list of positions to be encoded: size (M,)
-     out: (M, D)
-     """
-     assert embed_dim % 2 == 0
-     omega = np.arange(embed_dim // 2, dtype=np.float64)
-     omega /= embed_dim / 2.
-     omega = 1. / 10000 ** omega  # (D/2,)
-
-     pos = pos.reshape(-1)  # (M,)
-     out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product
-
-     emb_sin = np.sin(out)  # (M, D/2)
-     emb_cos = np.cos(out)  # (M, D/2)
-
-     return np.concatenate([emb_sin, emb_cos], axis=1)
-
-
  class ImageEncoder(nn.Module):
      def __init__(
          self,
@@ -88,7 +67,7 @@ class ImageEncoder(nn.Module):
          ]
      )

-     def forward(self, image, mask=None, value_range=(-1, 1), **kwargs):
+     def forward(self, image, mask=None, value_range=(-1, 1)):
          if value_range is not None:
              low, high = value_range
              image = (image - low) / (high - low)
@@ -103,7 +82,7 @@ class ImageEncoder(nn.Module):

          return last_hidden_state

-     def unconditional_embedding(self, batch_size, **kwargs):
+     def unconditional_embedding(self, batch_size):
          device = next(self.model.parameters()).device
          dtype = next(self.model.parameters()).dtype
          zero = torch.zeros(
@@ -131,82 +110,11 @@ class DinoImageEncoder(ImageEncoder):
      std = [0.229, 0.224, 0.225]


- class DinoImageEncoderMV(DinoImageEncoder):
-     def __init__(
-         self,
-         version=None,
-         config=None,
-         use_cls_token=True,
-         image_size=224,
-         view_num=4,
-         **kwargs,
-     ):
-         super().__init__(version, config, use_cls_token, image_size, **kwargs)
-         self.view_num = view_num
-         self.num_patches = self.num_patches
-         pos = np.arange(self.view_num, dtype=np.float32)
-         view_embedding = torch.from_numpy(
-             get_1d_sincos_pos_embed_from_grid(self.model.config.hidden_size, pos)).float()
-
-         view_embedding = view_embedding.unsqueeze(1).repeat(1, self.num_patches, 1)
-         self.view_embed = view_embedding.unsqueeze(0)
-
-     def forward(self, image, mask=None, value_range=(-1, 1), view_idxs=None):
-         if value_range is not None:
-             low, high = value_range
-             image = (image - low) / (high - low)
-
-         image = image.to(self.model.device, dtype=self.model.dtype)
-
-         bs, num_views, c, h, w = image.shape
-         image = image.view(bs * num_views, c, h, w)
-
-         inputs = self.transform(image)
-         outputs = self.model(inputs)
-
-         last_hidden_state = outputs.last_hidden_state
-         last_hidden_state = last_hidden_state.view(
-             bs, num_views, last_hidden_state.shape[-2],
-             last_hidden_state.shape[-1]
-         )
-
-         view_embedding = self.view_embed.to(last_hidden_state.dtype).to(last_hidden_state.device)
-         if view_idxs is not None:
-             assert len(view_idxs) == bs
-             view_embeddings = []
-             for i in range(bs):
-                 view_idx = view_idxs[i]
-                 assert num_views == len(view_idx)
-                 view_embeddings.append(self.view_embed[:, view_idx, ...])
-             view_embedding = torch.cat(view_embeddings, 0).to(last_hidden_state.dtype).to(last_hidden_state.device)
-
-         if num_views != self.view_num:
-             view_embedding = view_embedding[:, :num_views, ...]
-         last_hidden_state = last_hidden_state + view_embedding
-         last_hidden_state = last_hidden_state.view(bs, num_views * last_hidden_state.shape[-2],
-                                                    last_hidden_state.shape[-1])
-         return last_hidden_state
-
-     def unconditional_embedding(self, batch_size, view_idxs=None, **kwargs):
-         device = next(self.model.parameters()).device
-         dtype = next(self.model.parameters()).dtype
-         zero = torch.zeros(
-             batch_size,
-             self.num_patches * len(view_idxs[0]),
-             self.model.config.hidden_size,
-             device=device,
-             dtype=dtype,
-         )
-         return zero
-
-
  def build_image_encoder(config):
      if config['type'] == 'CLIPImageEncoder':
          return CLIPImageEncoder(**config['kwargs'])
      elif config['type'] == 'DinoImageEncoder':
          return DinoImageEncoder(**config['kwargs'])
-     elif config['type'] == 'DinoImageEncoderMV':
-         return DinoImageEncoderMV(**config['kwargs'])
      else:
          raise ValueError(f'Unknown image encoder type: {config["type"]}')
@@ -221,17 +129,17 @@ class DualImageEncoder(nn.Module):
          self.main_image_encoder = build_image_encoder(main_image_encoder)
          self.additional_image_encoder = build_image_encoder(additional_image_encoder)

-     def forward(self, image, mask=None, **kwargs):
+     def forward(self, image, mask=None):
          outputs = {
-             'main': self.main_image_encoder(image, mask=mask, **kwargs),
-             'additional': self.additional_image_encoder(image, mask=mask, **kwargs),
+             'main': self.main_image_encoder(image, mask=mask),
+             'additional': self.additional_image_encoder(image, mask=mask),
          }
          return outputs

-     def unconditional_embedding(self, batch_size, **kwargs):
+     def unconditional_embedding(self, batch_size):
          outputs = {
-             'main': self.main_image_encoder.unconditional_embedding(batch_size, **kwargs),
-             'additional': self.additional_image_encoder.unconditional_embedding(batch_size, **kwargs),
+             'main': self.main_image_encoder.unconditional_embedding(batch_size),
+             'additional': self.additional_image_encoder.unconditional_embedding(batch_size),
          }
          return outputs
@@ -244,14 +152,14 @@ class SingleImageEncoder(nn.Module):
          super().__init__()
          self.main_image_encoder = build_image_encoder(main_image_encoder)

-     def forward(self, image, mask=None, **kwargs):
+     def forward(self, image, mask=None):
          outputs = {
-             'main': self.main_image_encoder(image, mask=mask, **kwargs),
+             'main': self.main_image_encoder(image, mask=mask),
          }
          return outputs

-     def unconditional_embedding(self, batch_size, **kwargs):
+     def unconditional_embedding(self, batch_size):
          outputs = {
-             'main': self.main_image_encoder.unconditional_embedding(batch_size, **kwargs),
+             'main': self.main_image_encoder.unconditional_embedding(batch_size),
          }
          return outputs
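The net effect in conditioner.py is a narrower conditioning contract: forward takes only (image, mask), unconditional_embedding takes only a batch size, and the multi-view encoder with its sincos view embedding is gone. A toy sketch of assembling a classifier-free-guidance batch against that contract; the constructor arguments below, including the DINOv2 checkpoint name, are assumptions for illustration:

import torch
from hy3dgen.shapegen.models.conditioner import SingleImageEncoder

encoder = SingleImageEncoder(main_image_encoder={
    'type': 'DinoImageEncoder',
    'kwargs': {'version': 'facebook/dinov2-giant', 'image_size': 518},  # assumed values
})
image = torch.rand(2, 3, 518, 518) * 2 - 1            # default value_range is (-1, 1)
cond = encoder(image, mask=None)['main']              # (B, tokens, hidden)
un_cond = encoder.unconditional_embedding(2)['main']  # zeros of the same shape
cfg_batch = torch.cat([cond, un_cond], dim=0)         # what encode_cond concatenates for CFG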
hy3dgen/shapegen/models/denoisers/hunyuan3ddit.py CHANGED
@@ -60,15 +60,6 @@ def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
      return embedding


- class GELU(nn.Module):
-     def __init__(self, approximate='tanh'):
-         super().__init__()
-         self.approximate = approximate
-
-     def forward(self, x: Tensor) -> Tensor:
-         return nn.functional.gelu(x.contiguous(), approximate=self.approximate)
-
-
  class MLPEmbedder(nn.Module):
      def __init__(self, in_dim: int, hidden_dim: int):
          super().__init__()
@@ -171,7 +162,7 @@ class DoubleStreamBlock(nn.Module):
          self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
          self.img_mlp = nn.Sequential(
              nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
-             GELU(approximate="tanh"),
+             nn.GELU(approximate="tanh"),
              nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
          )
@@ -182,7 +173,7 @@ class DoubleStreamBlock(nn.Module):
          self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
          self.txt_mlp = nn.Sequential(
              nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
-             GELU(approximate="tanh"),
+             nn.GELU(approximate="tanh"),
              nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
          )
@@ -248,7 +239,7 @@ class SingleStreamBlock(nn.Module):
          self.hidden_size = hidden_size
          self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)

-         self.mlp_act = GELU(approximate="tanh")
+         self.mlp_act = nn.GELU(approximate="tanh")
          self.modulation = Modulation(hidden_size, double=False)

      def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
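The hunyuan3ddit.py edit is a pure simplification: the removed wrapper only forced a .contiguous() call before dispatching to PyTorch's tanh-approximate GELU, so nn.GELU(approximate="tanh") is a drop-in replacement. A quick equivalence check:

import torch
import torch.nn as nn

x = torch.randn(4, 8)
old = nn.functional.gelu(x.contiguous(), approximate='tanh')  # body of the removed wrapper
new = nn.GELU(approximate='tanh')(x)                          # module used after this commit
assert torch.allclose(old, new)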
hy3dgen/shapegen/pipelines.py CHANGED
@@ -24,12 +24,11 @@ import trimesh
  import yaml
  from PIL import Image
  from diffusers.utils.torch_utils import randn_tensor
- from diffusers.utils.import_utils import is_accelerate_version, is_accelerate_available
  from tqdm import tqdm

  from .models.autoencoders import ShapeVAE
  from .models.autoencoders import SurfaceExtractors
- from .utils import logger, synchronize_timer, smart_load_model
+ from .utils import logger, synchronize_timer


  def retrieve_timesteps(
@@ -128,9 +127,6 @@ def instantiate_from_config(config, **kwargs):


  class Hunyuan3DDiTPipeline:
-     model_cpu_offload_seq = "conditioner->model->vae"
-     _exclude_from_cpu_offload = []
-
      @classmethod
      @synchronize_timer('Hunyuan3DDiTPipeline Model Loading')
      def from_single_file(
@@ -211,12 +207,34 @@ class Hunyuan3DDiTPipeline:
              dtype=dtype,
              device=device,
          )
-         config_path, ckpt_path = smart_load_model(
-             model_path,
-             subfolder=subfolder,
-             use_safetensors=use_safetensors,
-             variant=variant
-         )
+         original_model_path = model_path
+         # try local path
+         base_dir = os.environ.get('HY3DGEN_MODELS', '~/.cache/hy3dgen')
+         model_path = os.path.expanduser(os.path.join(base_dir, model_path, subfolder))
+         logger.info(f'Try to load model from local path: {model_path}')
+         if not os.path.exists(model_path):
+             logger.info('Model path not exists, try to download from huggingface')
+             try:
+                 import huggingface_hub
+                 # download from huggingface
+                 path = huggingface_hub.snapshot_download(repo_id=original_model_path)
+                 model_path = os.path.join(path, subfolder)
+             except ImportError:
+                 logger.warning(
+                     "You need to install HuggingFace Hub to load models from the hub."
+                 )
+                 raise RuntimeError(f"Model path {model_path} not found")
+             except Exception as e:
+                 raise e
+
+         if not os.path.exists(model_path):
+             raise FileNotFoundError(f"Model path {original_model_path} not found")
+
+         extension = 'ckpt' if not use_safetensors else 'safetensors'
+         variant = '' if variant is None else f'.{variant}'
+         ckpt_name = f'model{variant}.{extension}'
+         config_path = os.path.join(model_path, 'config.yaml')
+         ckpt_path = os.path.join(model_path, ckpt_name)
          return cls.from_single_file(
              ckpt_path,
              config_path,
@@ -261,18 +279,12 @@ class Hunyuan3DDiTPipeline:
          if enabled:
              model_path = self.kwargs['from_pretrained_kwargs']['model_path']
              turbo_vae_mapping = {
-                 'Hunyuan3D-2': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0-turbo'),
-                 'Hunyuan3D-2mv': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0-turbo'),
-                 'Hunyuan3D-2mini': ('tencent/Hunyuan3D-2mini', 'hunyuan3d-vae-v2-mini-turbo'),
+                 'Hunyuan3D-2': 'hunyuan3d-vae-v2-0-turbo',
+                 'Hunyuan3D-2s': 'hunyuan3d-vae-v2-s-turbo'
              }
              model_name = model_path.split('/')[-1]
              if replace_vae and model_name in turbo_vae_mapping:
-                 model_path, subfolder = turbo_vae_mapping[model_name]
-                 self.vae = ShapeVAE.from_pretrained(
-                     model_path, subfolder=subfolder,
-                     use_safetensors=self.kwargs['from_pretrained_kwargs']['use_safetensors'],
-                     device=self.device,
-                 )
+                 self.vae = ShapeVAE.from_pretrained(model_path, subfolder=turbo_vae_mapping[model_name])
              self.vae.enable_flashvdm_decoder(
                  enabled=enabled,
                  adaptive_kv_selection=adaptive_kv_selection,
@@ -282,146 +294,33 @@ class Hunyuan3DDiTPipeline:
          else:
              model_path = self.kwargs['from_pretrained_kwargs']['model_path']
              vae_mapping = {
-                 'Hunyuan3D-2': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0'),
-                 'Hunyuan3D-2mv': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0'),
-                 'Hunyuan3D-2mini': ('tencent/Hunyuan3D-2mini', 'hunyuan3d-vae-v2-mini'),
+                 'Hunyuan3D-2': 'hunyuan3d-vae-v2-0',
+                 'Hunyuan3D-2s': 'hunyuan3d-vae-v2-s'
              }
              model_name = model_path.split('/')[-1]
              if model_name in vae_mapping:
-                 model_path, subfolder = vae_mapping[model_name]
-                 self.vae = ShapeVAE.from_pretrained(model_path, subfolder=subfolder)
+                 self.vae = ShapeVAE.from_pretrained(model_path, subfolder=vae_mapping[model_name])
              self.vae.enable_flashvdm_decoder(enabled=False)

      def to(self, device=None, dtype=None):
-         if dtype is not None:
-             self.dtype = dtype
-             self.vae.to(dtype=dtype)
-             self.model.to(dtype=dtype)
-             self.conditioner.to(dtype=dtype)
          if device is not None:
              self.device = torch.device(device)
              self.vae.to(device)
              self.model.to(device)
              self.conditioner.to(device)
-
-     @property
-     def _execution_device(self):
-         r"""
-         Returns the device on which the pipeline's models will be executed. After calling
-         [`~DiffusionPipeline.enable_sequential_cpu_offload`] the execution device can only be inferred from
-         Accelerate's module hooks.
-         """
-         for name, model in self.components.items():
-             if not isinstance(model, torch.nn.Module) or name in self._exclude_from_cpu_offload:
-                 continue
-
-             if not hasattr(model, "_hf_hook"):
-                 return self.device
-             for module in model.modules():
-                 if (
-                         hasattr(module, "_hf_hook")
-                         and hasattr(module._hf_hook, "execution_device")
-                         and module._hf_hook.execution_device is not None
-                 ):
-                     return torch.device(module._hf_hook.execution_device)
-         return self.device
-
-     def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
-         r"""
-         Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
-         to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
-         method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
-         `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
-
-         Arguments:
-             gpu_id (`int`, *optional*):
-                 The ID of the accelerator that shall be used in inference. If not specified, it will default to 0.
-             device (`torch.Device` or `str`, *optional*, defaults to "cuda"):
-                 The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will
-                 default to "cuda".
-         """
-         if self.model_cpu_offload_seq is None:
-             raise ValueError(
-                 "Model CPU offload cannot be enabled because no `model_cpu_offload_seq` class attribute is set."
-             )
-
-         if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
-             from accelerate import cpu_offload_with_hook
-         else:
-             raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
-
-         torch_device = torch.device(device)
-         device_index = torch_device.index
-
-         if gpu_id is not None and device_index is not None:
-             raise ValueError(
-                 f"You have passed both `gpu_id`={gpu_id} and an index as part of the passed device `device`={device}"
-                 f"Cannot pass both. Please make sure to either not define `gpu_id` or not pass the index as part of the device: `device`={torch_device.type}"
-             )
-
-         # _offload_gpu_id should be set to passed gpu_id (or id in passed `device`) or default to previously set id or default to 0
-         self._offload_gpu_id = gpu_id or torch_device.index or getattr(self, "_offload_gpu_id", 0)
-
-         device_type = torch_device.type
-         device = torch.device(f"{device_type}:{self._offload_gpu_id}")
-
-         if self.device.type != "cpu":
-             self.to("cpu")
-             device_mod = getattr(torch, self.device.type, None)
-             if hasattr(device_mod, "empty_cache") and device_mod.is_available():
-                 device_mod.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
-
-         all_model_components = {k: v for k, v in self.components.items() if isinstance(v, torch.nn.Module)}
-
-         self._all_hooks = []
-         hook = None
-         for model_str in self.model_cpu_offload_seq.split("->"):
-             model = all_model_components.pop(model_str, None)
-             if not isinstance(model, torch.nn.Module):
-                 continue
-
-             _, hook = cpu_offload_with_hook(model, device, prev_module_hook=hook)
-             self._all_hooks.append(hook)
-
-         # CPU offload models that are not in the seq chain unless they are explicitly excluded
-         # these models will stay on CPU until maybe_free_model_hooks is called
-         # some models cannot be in the seq chain because they are iteratively called, such as controlnet
-         for name, model in all_model_components.items():
-             if not isinstance(model, torch.nn.Module):
-                 continue
-
-             if name in self._exclude_from_cpu_offload:
-                 model.to(device)
-             else:
-                 _, hook = cpu_offload_with_hook(model, device)
-                 self._all_hooks.append(hook)
-
-     def maybe_free_model_hooks(self):
-         r"""
-         Function that offloads all components, removes all model hooks that were added when using
-         `enable_model_cpu_offload` and then applies them again. In case the model has not been offloaded this function
-         is a no-op. Make sure to add this function to the end of the `__call__` function of your pipeline so that it
-         functions correctly when applying enable_model_cpu_offload.
-         """
-         if not hasattr(self, "_all_hooks") or len(self._all_hooks) == 0:
-             # `enable_model_cpu_offload` has not be called, so silently do nothing
-             return
-
-         for hook in self._all_hooks:
-             # offload model and remove hook from model
-             hook.offload()
-             hook.remove()
-
-         # make sure the model is in the same state as before calling it
-         self.enable_model_cpu_offload()
+         if dtype is not None:
+             self.dtype = dtype
+             self.vae.to(dtype=dtype)
+             self.model.to(dtype=dtype)
+             self.conditioner.to(dtype=dtype)

      @synchronize_timer('Encode cond')
-     def encode_cond(self, image, additional_cond_inputs, do_classifier_free_guidance, dual_guidance):
+     def encode_cond(self, image, mask, do_classifier_free_guidance, dual_guidance):
          bsz = image.shape[0]
-         cond = self.conditioner(image=image, **additional_cond_inputs)
+         cond = self.conditioner(image=image, mask=mask)

          if do_classifier_free_guidance:
-             un_cond = self.conditioner.unconditional_embedding(bsz, **additional_cond_inputs)
+             un_cond = self.conditioner.unconditional_embedding(bsz)

              if dual_guidance:
                  un_cond_drop_main = copy.deepcopy(un_cond)
@@ -437,6 +336,8 @@ class Hunyuan3DDiTPipeline:

              cond = cat_recursive(cond, un_cond_drop_main, un_cond)
          else:
+             un_cond = self.conditioner.unconditional_embedding(bsz, **additional_cond_inputs)
+
              def cat_recursive(a, b):
                  if isinstance(a, torch.Tensor):
                      return torch.cat([a, b], dim=0).to(self.dtype)
@@ -482,27 +383,25 @@ class Hunyuan3DDiTPipeline:
          latents = latents * getattr(self.scheduler, 'init_noise_sigma', 1.0)
          return latents

-     def prepare_image(self, image) -> dict:
+     def prepare_image(self, image):
          if isinstance(image, str) and not os.path.exists(image):
              raise FileNotFoundError(f"Couldn't find image at path {image}")

          if not isinstance(image, list):
              image = [image]
-
-         outputs = []
+         image_pts = []
+         mask_pts = []
          for img in image:
-             output = self.image_processor(img)
-             outputs.append(output)
-
-         cond_input = {k: [] for k in outputs[0].keys()}
-         for output in outputs:
-             for key, value in output.items():
-                 cond_input[key].append(value)
-         for key, value in cond_input.items():
-             if isinstance(value[0], torch.Tensor):
-                 cond_input[key] = torch.cat(value, dim=0)
+             image_pt, mask_pt = self.image_processor(img, return_mask=True)
+             image_pts.append(image_pt)
+             mask_pts.append(mask_pt)

-         return cond_input
+         image_pts = torch.cat(image_pts, dim=0).to(self.device, dtype=self.dtype)
+         if mask_pts[0] is not None:
+             mask_pts = torch.cat(mask_pts, dim=0).to(self.device, dtype=self.dtype)
+         else:
+             mask_pts = None
+         return image_pts, mask_pts

      def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
          """
@@ -575,14 +474,10 @@ class Hunyuan3DDiTPipeline:
                  getattr(self.model, 'guidance_cond_proj_dim', None) is None
          dual_guidance = dual_guidance_scale >= 0 and dual_guidance

-         cond_inputs = self.prepare_image(image)
-         image = cond_inputs.pop('image')
-         cond = self.encode_cond(
-             image=image,
-             additional_cond_inputs=cond_inputs,
-             do_classifier_free_guidance=do_classifier_free_guidance,
-             dual_guidance=False,
-         )
+         image, mask = self.prepare_image(image)
+         cond = self.encode_cond(image=image,
+                                 do_classifier_free_guidance=do_classifier_free_guidance,
+                                 dual_guidance=dual_guidance)
          batch_size = image.shape[0]

          t_dtype = torch.long
@@ -640,17 +535,7 @@ class Hunyuan3DDiTPipeline:
              box_v, mc_level, num_chunks, octree_resolution, mc_algo,
          )

-     def _export(
-         self,
-         latents,
-         output_type='trimesh',
-         box_v=1.01,
-         mc_level=0.0,
-         num_chunks=20000,
-         octree_resolution=256,
-         mc_algo='mc',
-         enable_pbar=True
-     ):
+     def _export(self, latents, output_type, box_v, mc_level, num_chunks, octree_resolution, mc_algo, enable_pbar=True):
          if not output_type == "latent":
              latents = 1. / self.vae.scale_factor * latents
              latents = self.vae(latents)
@@ -677,7 +562,7 @@ class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline):
      @torch.inference_mode()
      def __call__(
          self,
-         image: Union[str, List[str], Image.Image, dict, List[dict]] = None,
+         image: Union[str, List[str], Image.Image] = None,
          num_inference_steps: int = 50,
          timesteps: List[int] = None,
          sigmas: List[float] = None,
@@ -705,11 +590,10 @@ class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline):
              self.model.guidance_embed is True
          )

-         cond_inputs = self.prepare_image(image)
-         image = cond_inputs.pop('image')
+         image, mask = self.prepare_image(image)
          cond = self.encode_cond(
              image=image,
-             additional_cond_inputs=cond_inputs,
+             mask=mask,
              do_classifier_free_guidance=do_classifier_free_guidance,
              dual_guidance=False,
          )
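The largest pipelines.py change inlines what the removed smart_load_model helper used to do: resolve a checkpoint from the local cache rooted at $HY3DGEN_MODELS (default ~/.cache/hy3dgen), fall back to a huggingface_hub snapshot, then assemble the config.yaml and model[.variant].{safetensors|ckpt} paths. The same order, condensed into a standalone sketch (the helper name is ours, not the repo's):

import os

def resolve_checkpoint(model_path, subfolder, use_safetensors=True, variant=None):
    # 1) local cache: $HY3DGEN_MODELS/<repo-id>/<subfolder>
    base_dir = os.environ.get('HY3DGEN_MODELS', '~/.cache/hy3dgen')
    local_dir = os.path.expanduser(os.path.join(base_dir, model_path, subfolder))
    if not os.path.exists(local_dir):
        # 2) fall back to a Hub snapshot (huggingface_hub is an optional import in the diff)
        import huggingface_hub
        local_dir = os.path.join(huggingface_hub.snapshot_download(repo_id=model_path), subfolder)
    # 3) derive config/weight paths; a variant becomes an infix, e.g. model.fp16.safetensors
    extension = 'safetensors' if use_safetensors else 'ckpt'
    suffix = '' if variant is None else f'.{variant}'
    return os.path.join(local_dir, 'config.yaml'), os.path.join(local_dir, f'model{suffix}.{extension}')

config_path, ckpt_path = resolve_checkpoint('tencent/Hunyuan3D-2', 'hunyuan3d-dit-v2-0')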
hy3dgen/shapegen/postprocessors.py CHANGED
@@ -12,16 +12,13 @@
  # fine-tuning enabling code and other elements of the foregoing made publicly available
  # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.

- import os
  import tempfile
  from typing import Union

- import numpy as np
  import pymeshlab
- import torch
  import trimesh

- from .models.autoencoders import Latent2MeshOutput
+ from .models.vae import Latent2MeshOutput
  from .utils import synchronize_timer

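Here only the import source moves (.models.vae instead of .models.autoencoders), and the companion __init__.py change drops MeshSimplifier from the package exports; the remaining post-processors keep their callable interface. A typical cleanup chain, with the max_facenum keyword assumed from the upstream code:

import trimesh
from hy3dgen.shapegen import FaceReducer, FloaterRemover, DegenerateFaceRemover

mesh = trimesh.load('generated.glb', force='mesh')  # any generated mesh
mesh = FloaterRemover()(mesh)                       # drop disconnected floating pieces
mesh = DegenerateFaceRemover()(mesh)                # remove zero-area / duplicate faces
mesh = FaceReducer()(mesh, max_facenum=40000)       # decimate for export (kwarg assumed)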
 
hy3dgen/shapegen/preprocessors.py CHANGED
@@ -87,7 +87,9 @@ class ImageProcessorV2:
          mask = mask.clip(0, 255).astype(np.uint8)
          return result, mask

-     def load_image(self, image, border_ratio=0.15, to_tensor=True):
+     def __call__(self, image, border_ratio=0.15, to_tensor=True, return_mask=False, **kwargs):
+         if self.border_ratio is not None:
+             border_ratio = self.border_ratio
          if isinstance(image, str):
              image = cv2.imread(image, cv2.IMREAD_UNCHANGED)
          image, mask = self.recenter(image, border_ratio=border_ratio)
@@ -104,64 +106,13 @@ class ImageProcessorV2:
          if to_tensor:
              image = array_to_tensor(image)
              mask = array_to_tensor(mask)
-         return image, mask
-
-     def __call__(self, image, border_ratio=0.15, to_tensor=True, **kwargs):
-         if self.border_ratio is not None:
-             border_ratio = self.border_ratio
-         image, mask = self.load_image(image, border_ratio=border_ratio, to_tensor=to_tensor)
-         outputs = {
-             'image': image,
-             'mask': mask
-         }
-         return outputs
-
-
- class MVImageProcessorV2(ImageProcessorV2):
-     """
-     view order: front, front clockwise 90, back, front clockwise 270
-     """
-     return_view_idx = True
-
-     def __init__(self, size=512, border_ratio=None):
-         super().__init__(size, border_ratio)
-         self.view2idx = {
-             'front': 0,
-             'left': 1,
-             'back': 2,
-             'right': 3
-         }
-
-     def __call__(self, image_dict, border_ratio=0.15, to_tensor=True, **kwargs):
-         if self.border_ratio is not None:
-             border_ratio = self.border_ratio
-
-         images = []
-         masks = []
-         view_idxs = []
-         for idx, (view_tag, image) in enumerate(image_dict.items()):
-             view_idxs.append(self.view2idx[view_tag])
-             image, mask = self.load_image(image, border_ratio=border_ratio, to_tensor=to_tensor)
-             images.append(image)
-             masks.append(mask)
-
-         zipped_lists = zip(view_idxs, images, masks)
-         sorted_zipped_lists = sorted(zipped_lists)
-         view_idxs, images, masks = zip(*sorted_zipped_lists)
-
-         image = torch.cat(images, 0).unsqueeze(0)
-         mask = torch.cat(masks, 0).unsqueeze(0)
-         outputs = {
-             'image': image,
-             'mask': mask,
-             'view_idxs': view_idxs
-         }
-         return outputs
+         if return_mask:
+             return image, mask
+         return image


  IMAGE_PROCESSORS = {
      "v2": ImageProcessorV2,
-     'mv_v2': MVImageProcessorV2,
  }

  DEFAULT_IMAGEPROCESSOR = 'v2'
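With load_image folded into __call__ and the dict-returning interface removed, the processor now returns the image tensor alone by default, or an (image, mask) pair when return_mask=True, which is exactly how the new prepare_image in pipelines.py calls it. A usage sketch (constructor defaults assumed):

from hy3dgen.shapegen.preprocessors import ImageProcessorV2

processor = ImageProcessorV2(size=512, border_ratio=None)  # assumed defaults
image = processor('input.png')                             # tensor only
image, mask = processor('input.png', return_mask=True)     # as prepare_image uses it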