Merge branch 'main' of https://huggingface.co/spaces/tencent/Hunyuan3D-2mv
Files changed:
- gradio_app.py +74 -30
- hy3dgen/shapegen/__init__.py +1 -1
- hy3dgen/shapegen/models/__init__.py +1 -1
- hy3dgen/shapegen/models/conditioner.py +12 -104
- hy3dgen/shapegen/models/denoisers/hunyuan3ddit.py +3 -12
- hy3dgen/shapegen/pipelines.py +65 -181
- hy3dgen/shapegen/postprocessors.py +1 -4
- hy3dgen/shapegen/preprocessors.py +6 -55
gradio_app.py
CHANGED
@@ -1,10 +1,23 @@
+# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
+# except for the third-party components listed below.
+# Hunyuan 3D does not impose any additional limitations beyond what is outlined
+# in the repsective licenses of these third-party components.
+# Users must comply with all terms and conditions of original licenses of these third-party
+# components and must ensure that the usage of the third party components adheres to
+# all relevant laws and regulations.
+
+# For avoidance of doubts, Hunyuan 3D means the large language models and
+# their software and algorithms, including trained model weights, parameters (including
+# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
+# fine-tuning enabling code and other elements of the foregoing made publicly available
+# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
+
 import os
 import random
 import shutil
 import time
 from glob import glob
 from pathlib import Path
-import uuid
 
 import gradio as gr
 import torch
@@ -12,6 +25,7 @@ import trimesh
 import uvicorn
 from fastapi import FastAPI
 from fastapi.staticfiles import StaticFiles
+import uuid
 
 from hy3dgen.shapegen.utils import logger
 
@@ -28,6 +42,7 @@ if True:
     print('install custom')
     subprocess.run(shlex.split("pip install custom_rasterizer-0.1-cp310-cp310-linux_x86_64.whl"), check=True)
 
+
 def get_example_img_list():
     print('Loading example img list ...')
     return sorted(glob('./assets/example_images/**/*.png', recursive=True))
@@ -47,7 +62,7 @@ def get_example_mv_list():
     root = './assets/example_mv_images'
     for mv_dir in os.listdir(root):
         view_list = []
-        for view in ['
+        for view in ['front', 'back', 'left', 'right']:
             path = os.path.join(root, mv_dir, f'{view}.png')
             if os.path.exists(path):
                 view_list.append(path)
@@ -57,18 +72,6 @@ def get_example_mv_list():
     return mv_list
 
 
-# def gen_save_folder(max_size=60):
-#     os.makedirs(SAVE_DIR, exist_ok=True)
-#     exists = set(int(_) for _ in os.listdir(SAVE_DIR) if _.isdigit())
-#     cur_id = min(set(range(max_size)) - exists) if len(exists) < max_size else -1
-#     if os.path.exists(f"{SAVE_DIR}/{(cur_id + 1) % max_size}"):
-#         shutil.rmtree(f"{SAVE_DIR}/{(cur_id + 1) % max_size}")
-#         print(f"remove {SAVE_DIR}/{(cur_id + 1) % max_size} success !!!")
-#     save_folder = f"{SAVE_DIR}/{max(0, cur_id)}"
-#     os.makedirs(save_folder, exist_ok=True)
-#     print(f"mkdir {save_folder} suceess !!!")
-#     return save_folder
-
 def gen_save_folder(max_size=200):
     os.makedirs(SAVE_DIR, exist_ok=True)
 
@@ -139,7 +142,7 @@ def build_model_viewer_html(save_folder, height=660, width=790, textured=False):
     </div>
     """
 
-
+
 def _gen_shape(
     caption=None,
     image=None,
@@ -246,7 +249,7 @@ def _gen_shape(
     main_image = image if not MV_MODE else image['front']
     return mesh, main_image, save_folder, stats, seed
 
-
+
 def generation_all(
     caption=None,
     image=None,
@@ -301,7 +304,8 @@ def generation_all(
     path_textured = export_mesh(textured_mesh, save_folder, textured=True)
     model_viewer_html_textured = build_model_viewer_html(save_folder, height=HTML_HEIGHT, width=HTML_WIDTH,
                                                          textured=True)
-
+    if args.low_vram_mode:
+        torch.cuda.empty_cache()
     return (
         gr.update(value=path),
        gr.update(value=path_textured),
@@ -310,7 +314,7 @@
         seed,
     )
 
-
+
 def shape_generation(
     caption=None,
     image=None,
@@ -347,7 +351,8 @@ def shape_generation(
 
     path = export_mesh(mesh, save_folder, textured=False)
     model_viewer_html = build_model_viewer_html(save_folder, height=HTML_HEIGHT, width=HTML_WIDTH)
-
+    if args.low_vram_mode:
+        torch.cuda.empty_cache()
     return (
         gr.update(value=path),
         model_viewer_html,
@@ -362,6 +367,8 @@ def build_app():
     title = 'Hunyuan3D-2mv: Image to 3D Generation with 1-4 Views'
     if 'mini' in args.subfolder:
         title = 'Hunyuan3D-2mini: Strong 0.6B Image to Shape Generator'
+    if TURBO_MODE:
+        title = title.replace(':', '-Turbo: Fast ')
 
     title_html = f"""
    <div style="font-size: 2em; font-weight: bold; text-align: center; margin-bottom: 5px">
@@ -386,11 +393,11 @@ def build_app():
     .mv-image button .wrap {
         font-size: 10px;
     }
-
+
     .mv-image .icon-wrap {
         width: 20px;
     }
-
+
     """
 
     with gr.Blocks(theme=gr.themes.Base(), title='Hunyuan-3D-2.0', analytics_enabled=False, css=custom_css) as demo:
@@ -430,7 +437,15 @@ def build_app():
                 file_out = gr.File(label="File", visible=False)
                 file_out2 = gr.File(label="File", visible=False)
 
-                with gr.Tabs(selected='tab_export'):
+                with gr.Tabs(selected='tab_options' if TURBO_MODE else 'tab_export'):
+                    with gr.Tab("Options", id='tab_options', visible=TURBO_MODE):
+                        gen_mode = gr.Radio(label='Generation Mode',
+                                            info='Recommendation: Turbo for most cases, Fast for very complex cases, Standard seldom use.',
+                                            choices=['Turbo', 'Fast', 'Standard'], value='Turbo')
+                        decode_mode = gr.Radio(label='Decoding Mode',
+                                               info='The resolution for exporting mesh from generated vectset',
+                                               choices=['Low', 'Standard', 'High'],
+                                               value='Standard')
                     with gr.Tab('Advanced Options', id='tab_advanced_options'):
                         with gr.Row():
                             check_box_rembg = gr.Checkbox(value=True, label='Remove Background', min_width=100)
@@ -446,14 +461,13 @@ def build_app():
                        with gr.Row():
                            num_steps = gr.Slider(maximum=100,
                                                  minimum=1,
-                                                  value=30,
+                                                  value=5 if 'turbo' in args.subfolder else 30,
                                                  step=1, label='Inference Steps')
                            octree_resolution = gr.Slider(maximum=512, minimum=16, value=256, label='Octree Resolution')
                        with gr.Row():
                            cfg_scale = gr.Number(value=5.0, label='Guidance Scale', min_width=100)
-                            num_chunks = gr.Slider(maximum=5000000, minimum=1000, value=
+                            num_chunks = gr.Slider(maximum=5000000, minimum=1000, value=8000,
                                                   label='Number of Chunks', min_width=100)
-
                    with gr.Tab("Export", id='tab_export'):
                        with gr.Row():
                            file_type = gr.Dropdown(label='File Type', choices=SUPPORTED_FORMATS,
@@ -573,6 +587,26 @@ def build_app():
            outputs=[tabs_output],
        )
 
+        def on_gen_mode_change(value):
+            if value == 'Turbo':
+                return gr.update(value=5)
+            elif value == 'Fast':
+                return gr.update(value=10)
+            else:
+                return gr.update(value=30)
+
+        gen_mode.change(on_gen_mode_change, inputs=[gen_mode], outputs=[num_steps])
+
+        def on_decode_mode_change(value):
+            if value == 'Low':
+                return gr.update(value=196)
+            elif value == 'Standard':
+                return gr.update(value=256)
+            else:
+                return gr.update(value=384)
+
+        decode_mode.change(on_decode_mode_change, inputs=[decode_mode], outputs=[octree_resolution])
+
        def on_export_click(file_out, file_out2, file_type, reduce_face, export_texture, target_face_num):
            if file_out is None:
                raise gr.Error('Please generate a mesh first.')
@@ -628,18 +662,22 @@ if __name__ == '__main__':
    parser.add_argument('--port', type=int, default=7860)
    parser.add_argument('--host', type=str, default='0.0.0.0')
    parser.add_argument('--device', type=str, default='cuda')
-    parser.add_argument('--mc_algo', type=str, default='
+    parser.add_argument('--mc_algo', type=str, default='mc')
    parser.add_argument('--cache-path', type=str, default='gradio_cache')
    parser.add_argument('--enable_t23d', action='store_true')
    parser.add_argument('--disable_tex', action='store_true')
+    parser.add_argument('--enable_flashvdm', action='store_true')
    parser.add_argument('--compile', action='store_true')
+    parser.add_argument('--low_vram_mode', action='store_true')
    args = parser.parse_args()
 
+    args.enable_flashvdm = True
    SAVE_DIR = args.cache_path
    os.makedirs(SAVE_DIR, exist_ok=True)
 
    CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
    MV_MODE = 'mv' in args.model_path
+    TURBO_MODE = 'turbo' in args.subfolder
 
    HTML_HEIGHT = 690 if MV_MODE else 650
    HTML_WIDTH = 500
@@ -662,14 +700,15 @@ if __name__ == '__main__':
    example_mvs = get_example_mv_list()
 
    SUPPORTED_FORMATS = ['glb', 'obj', 'ply', 'stl']
-
-    args.disable_tex = True
+
    HAS_TEXTUREGEN = False
    if not args.disable_tex:
        try:
            from hy3dgen.texgen import Hunyuan3DPaintPipeline
 
            texgen_worker = Hunyuan3DPaintPipeline.from_pretrained(args.texgen_model_path)
+            if args.low_vram_mode:
+                texgen_worker.enable_model_cpu_offload()
            # Not help much, ignore for now.
            # if args.compile:
            #     texgen_worker.models['delight_model'].pipeline.unet.compile()
@@ -699,9 +738,12 @@ if __name__ == '__main__':
    i23d_worker = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
        args.model_path,
        subfolder=args.subfolder,
-        use_safetensors=
+        use_safetensors=True,
        device=args.device,
    )
+    if args.enable_flashvdm:
+        mc_algo = 'mc' if args.device in ['cpu', 'mps'] else args.mc_algo
+        i23d_worker.enable_flashvdm(mc_algo=mc_algo)
    if args.compile:
        i23d_worker.compile()
 
@@ -718,6 +760,8 @@ if __name__ == '__main__':
    app.mount("/static", StaticFiles(directory=static_dir, html=True), name="static")
    shutil.copytree('./assets/env_maps', os.path.join(static_dir, 'env_maps'), dirs_exist_ok=True)
 
+    if args.low_vram_mode:
+        torch.cuda.empty_cache()
    demo = build_app()
    app = gr.mount_gradio_app(app, demo, path="/")
-    uvicorn.run(app, host=args.host, port=args.port)
+    uvicorn.run(app, host=args.host, port=args.port, workers=1)
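Net effect of the gradio_app.py changes: a 'turbo' model subfolder flips TURBO_MODE, which retitles the app, shows an Options tab, and drops the default step count to 5; --low_vram_mode empties the CUDA cache after each generation; --enable_flashvdm (forced on here) switches the shape pipeline to the FlashVDM decoder. A minimal sketch of the mode-to-parameter mapping wired up by on_gen_mode_change and on_decode_mode_change, using only values visible in the diff:

# Sketch of the UI defaults set by the two radio callbacks above.
GEN_MODE_STEPS = {'Turbo': 5, 'Fast': 10, 'Standard': 30}
DECODE_MODE_OCTREE_RESOLUTION = {'Low': 196, 'Standard': 256, 'High': 384}

def ui_defaults(gen_mode: str, decode_mode: str) -> tuple:
    # Returns (inference steps, octree resolution) for a pair of radio choices.
    return GEN_MODE_STEPS[gen_mode], DECODE_MODE_OCTREE_RESOLUTION[decode_mode]

assert ui_defaults('Turbo', 'Standard') == (5, 256)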
hy3dgen/shapegen/__init__.py
CHANGED
@@ -13,5 +13,5 @@
 # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
 
 from .pipelines import Hunyuan3DDiTPipeline, Hunyuan3DDiTFlowMatchingPipeline
-from .postprocessors import FaceReducer, FloaterRemover, DegenerateFaceRemover
+from .postprocessors import FaceReducer, FloaterRemover, DegenerateFaceRemover
 from .preprocessors import ImageProcessorV2, IMAGE_PROCESSORS, DEFAULT_IMAGEPROCESSOR
hy3dgen/shapegen/models/__init__.py
CHANGED
@@ -25,4 +25,4 @@
 
 from .autoencoders import ShapeVAE
 from .conditioner import DualImageEncoder, SingleImageEncoder, DinoImageEncoder, CLIPImageEncoder
-from .denoisers import Hunyuan3DDiT
+from .denoisers import HunYuanDiTPlain, Hunyuan3DDiT
hy3dgen/shapegen/models/conditioner.py
CHANGED
@@ -22,7 +22,6 @@
 # fine-tuning enabling code and other elements of the foregoing made publicly available
 # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
 
-import numpy as np
 import torch
 import torch.nn as nn
 from torchvision import transforms
@@ -34,26 +33,6 @@ from transformers import (
 )
 
 
-def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
-    """
-    embed_dim: output dimension for each position
-    pos: a list of positions to be encoded: size (M,)
-    out: (M, D)
-    """
-    assert embed_dim % 2 == 0
-    omega = np.arange(embed_dim // 2, dtype=np.float64)
-    omega /= embed_dim / 2.
-    omega = 1. / 10000 ** omega  # (D/2,)
-
-    pos = pos.reshape(-1)  # (M,)
-    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product
-
-    emb_sin = np.sin(out)  # (M, D/2)
-    emb_cos = np.cos(out)  # (M, D/2)
-
-    return np.concatenate([emb_sin, emb_cos], axis=1)
-
-
 class ImageEncoder(nn.Module):
     def __init__(
         self,
@@ -88,7 +67,7 @@ class ImageEncoder(nn.Module):
         ]
         )
 
-    def forward(self, image, mask=None, value_range=(-1, 1)
+    def forward(self, image, mask=None, value_range=(-1, 1)):
         if value_range is not None:
             low, high = value_range
             image = (image - low) / (high - low)
@@ -103,7 +82,7 @@ class ImageEncoder(nn.Module):
 
         return last_hidden_state
 
-    def unconditional_embedding(self, batch_size
+    def unconditional_embedding(self, batch_size):
         device = next(self.model.parameters()).device
         dtype = next(self.model.parameters()).dtype
         zero = torch.zeros(
@@ -131,82 +110,11 @@ class DinoImageEncoder(ImageEncoder):
     std = [0.229, 0.224, 0.225]
 
 
-class DinoImageEncoderMV(DinoImageEncoder):
-    def __init__(
-        self,
-        version=None,
-        config=None,
-        use_cls_token=True,
-        image_size=224,
-        view_num=4,
-        **kwargs,
-    ):
-        super().__init__(version, config, use_cls_token, image_size, **kwargs)
-        self.view_num = view_num
-        self.num_patches = self.num_patches
-        pos = np.arange(self.view_num, dtype=np.float32)
-        view_embedding = torch.from_numpy(
-            get_1d_sincos_pos_embed_from_grid(self.model.config.hidden_size, pos)).float()
-
-        view_embedding = view_embedding.unsqueeze(1).repeat(1, self.num_patches, 1)
-        self.view_embed = view_embedding.unsqueeze(0)
-
-    def forward(self, image, mask=None, value_range=(-1, 1), view_idxs=None):
-        if value_range is not None:
-            low, high = value_range
-            image = (image - low) / (high - low)
-
-        image = image.to(self.model.device, dtype=self.model.dtype)
-
-        bs, num_views, c, h, w = image.shape
-        image = image.view(bs * num_views, c, h, w)
-
-        inputs = self.transform(image)
-        outputs = self.model(inputs)
-
-        last_hidden_state = outputs.last_hidden_state
-        last_hidden_state = last_hidden_state.view(
-            bs, num_views, last_hidden_state.shape[-2],
-            last_hidden_state.shape[-1]
-        )
-
-        view_embedding = self.view_embed.to(last_hidden_state.dtype).to(last_hidden_state.device)
-        if view_idxs is not None:
-            assert len(view_idxs) == bs
-            view_embeddings = []
-            for i in range(bs):
-                view_idx = view_idxs[i]
-                assert num_views == len(view_idx)
-                view_embeddings.append(self.view_embed[:, view_idx, ...])
-            view_embedding = torch.cat(view_embeddings, 0).to(last_hidden_state.dtype).to(last_hidden_state.device)
-
-        if num_views != self.view_num:
-            view_embedding = view_embedding[:, :num_views, ...]
-        last_hidden_state = last_hidden_state + view_embedding
-        last_hidden_state = last_hidden_state.view(bs, num_views * last_hidden_state.shape[-2],
-                                                   last_hidden_state.shape[-1])
-        return last_hidden_state
-
-    def unconditional_embedding(self, batch_size, view_idxs=None, **kwargs):
-        device = next(self.model.parameters()).device
-        dtype = next(self.model.parameters()).dtype
-        zero = torch.zeros(
-            batch_size,
-            self.num_patches * len(view_idxs[0]),
-            self.model.config.hidden_size,
-            device=device,
-            dtype=dtype,
-        )
-        return zero
-
-
 def build_image_encoder(config):
     if config['type'] == 'CLIPImageEncoder':
         return CLIPImageEncoder(**config['kwargs'])
     elif config['type'] == 'DinoImageEncoder':
         return DinoImageEncoder(**config['kwargs'])
-    elif config['type'] == 'DinoImageEncoderMV':
-        return DinoImageEncoderMV(**config['kwargs'])
     else:
         raise ValueError(f'Unknown image encoder type: {config["type"]}')
 
@@ -221,17 +129,17 @@ class DualImageEncoder(nn.Module):
         self.main_image_encoder = build_image_encoder(main_image_encoder)
         self.additional_image_encoder = build_image_encoder(additional_image_encoder)
 
-    def forward(self, image, mask=None
+    def forward(self, image, mask=None):
         outputs = {
-            'main': self.main_image_encoder(image, mask=mask
-            'additional': self.additional_image_encoder(image, mask=mask
+            'main': self.main_image_encoder(image, mask=mask),
+            'additional': self.additional_image_encoder(image, mask=mask),
         }
         return outputs
 
-    def unconditional_embedding(self, batch_size
+    def unconditional_embedding(self, batch_size):
         outputs = {
-            'main': self.main_image_encoder.unconditional_embedding(batch_size
-            'additional': self.additional_image_encoder.unconditional_embedding(batch_size
+            'main': self.main_image_encoder.unconditional_embedding(batch_size),
+            'additional': self.additional_image_encoder.unconditional_embedding(batch_size),
         }
         return outputs
 
@@ -244,14 +152,14 @@ class SingleImageEncoder(nn.Module):
         super().__init__()
         self.main_image_encoder = build_image_encoder(main_image_encoder)
 
-    def forward(self, image, mask=None
+    def forward(self, image, mask=None):
         outputs = {
-            'main': self.main_image_encoder(image, mask=mask
+            'main': self.main_image_encoder(image, mask=mask),
        }
        return outputs
 
-    def unconditional_embedding(self, batch_size
+    def unconditional_embedding(self, batch_size):
        outputs = {
-            'main': self.main_image_encoder.unconditional_embedding(batch_size
+            'main': self.main_image_encoder.unconditional_embedding(batch_size),
        }
        return outputs
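With DinoImageEncoderMV and its sinusoidal view embedding removed, build_image_encoder only dispatches to the single-image CLIP and DINO encoders. A minimal sketch of that config-driven dispatch; the checkpoint name, image size, and other kwargs below are illustrative assumptions, not values taken from this diff:

from hy3dgen.shapegen.models.conditioner import build_image_encoder

# 'type' must match one of the branches kept in build_image_encoder above;
# the DINOv2 checkpoint and kwargs are assumptions for illustration.
encoder_config = {
    'type': 'DinoImageEncoder',
    'kwargs': {'version': 'facebook/dinov2-giant', 'use_cls_token': True, 'image_size': 518},
}
encoder = build_image_encoder(encoder_config)

# encoder(images) returns patch-token conditioning for the diffusion model;
# encoder.unconditional_embedding(batch_size) returns the zero embedding used
# for the classifier-free-guidance branch in encode_cond (pipelines.py).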
hy3dgen/shapegen/models/denoisers/hunyuan3ddit.py
CHANGED
@@ -60,15 +60,6 @@ def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 10
     return embedding
 
 
-class GELU(nn.Module):
-    def __init__(self, approximate='tanh'):
-        super().__init__()
-        self.approximate = approximate
-
-    def forward(self, x: Tensor) -> Tensor:
-        return nn.functional.gelu(x.contiguous(), approximate=self.approximate)
-
-
 class MLPEmbedder(nn.Module):
     def __init__(self, in_dim: int, hidden_dim: int):
         super().__init__()
@@ -171,7 +162,7 @@ class DoubleStreamBlock(nn.Module):
         self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
         self.img_mlp = nn.Sequential(
             nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
-            GELU(approximate="tanh"),
+            nn.GELU(approximate="tanh"),
             nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
         )
 
@@ -182,7 +173,7 @@ class DoubleStreamBlock(nn.Module):
         self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
         self.txt_mlp = nn.Sequential(
             nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
-            GELU(approximate="tanh"),
+            nn.GELU(approximate="tanh"),
             nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
         )
 
@@ -248,7 +239,7 @@ class SingleStreamBlock(nn.Module):
         self.hidden_size = hidden_size
         self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
 
-        self.mlp_act = GELU(approximate="tanh")
+        self.mlp_act = nn.GELU(approximate="tanh")
         self.modulation = Modulation(hidden_size, double=False)
 
     def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
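The removed custom GELU wrapper only forced .contiguous() before calling the functional op; PyTorch's built-in nn.GELU supports the same tanh approximation directly, so the swap above is behavior-preserving. A quick standalone check (plain PyTorch, no Hunyuan3D code required):

import torch
import torch.nn as nn

x = torch.randn(4, 8)
builtin = nn.GELU(approximate="tanh")(x)
functional = nn.functional.gelu(x.contiguous(), approximate="tanh")
assert torch.allclose(builtin, functional)  # identical activation, one less custom module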
hy3dgen/shapegen/pipelines.py
CHANGED
@@ -24,12 +24,11 @@ import trimesh
 import yaml
 from PIL import Image
 from diffusers.utils.torch_utils import randn_tensor
-from diffusers.utils.import_utils import is_accelerate_version, is_accelerate_available
 from tqdm import tqdm
 
 from .models.autoencoders import ShapeVAE
 from .models.autoencoders import SurfaceExtractors
-from .utils import logger, synchronize_timer
+from .utils import logger, synchronize_timer
 
 
 def retrieve_timesteps(
@@ -128,9 +127,6 @@ def instantiate_from_config(config, **kwargs):
 
 
 class Hunyuan3DDiTPipeline:
-    model_cpu_offload_seq = "conditioner->model->vae"
-    _exclude_from_cpu_offload = []
-
     @classmethod
     @synchronize_timer('Hunyuan3DDiTPipeline Model Loading')
     def from_single_file(
@@ -211,12 +207,34 @@ class Hunyuan3DDiTPipeline:
             dtype=dtype,
             device=device,
         )
-
-
-
-
-
-        )
+        original_model_path = model_path
+        # try local path
+        base_dir = os.environ.get('HY3DGEN_MODELS', '~/.cache/hy3dgen')
+        model_path = os.path.expanduser(os.path.join(base_dir, model_path, subfolder))
+        logger.info(f'Try to load model from local path: {model_path}')
+        if not os.path.exists(model_path):
+            logger.info('Model path not exists, try to download from huggingface')
+            try:
+                import huggingface_hub
+                # download from huggingface
+                path = huggingface_hub.snapshot_download(repo_id=original_model_path)
+                model_path = os.path.join(path, subfolder)
+            except ImportError:
+                logger.warning(
+                    "You need to install HuggingFace Hub to load models from the hub."
+                )
+                raise RuntimeError(f"Model path {model_path} not found")
+            except Exception as e:
+                raise e
+
+        if not os.path.exists(model_path):
+            raise FileNotFoundError(f"Model path {original_model_path} not found")
+
+        extension = 'ckpt' if not use_safetensors else 'safetensors'
+        variant = '' if variant is None else f'.{variant}'
+        ckpt_name = f'model{variant}.{extension}'
+        config_path = os.path.join(model_path, 'config.yaml')
+        ckpt_path = os.path.join(model_path, ckpt_name)
         return cls.from_single_file(
             ckpt_path,
             config_path,
@@ -261,18 +279,12 @@
         if enabled:
             model_path = self.kwargs['from_pretrained_kwargs']['model_path']
             turbo_vae_mapping = {
-                'Hunyuan3D-2':
-                'Hunyuan3D-
-                'Hunyuan3D-2mini': ('tencent/Hunyuan3D-2mini', 'hunyuan3d-vae-v2-mini-turbo'),
+                'Hunyuan3D-2': 'hunyuan3d-vae-v2-0-turbo',
+                'Hunyuan3D-2s': 'hunyuan3d-vae-v2-s-turbo'
             }
             model_name = model_path.split('/')[-1]
             if replace_vae and model_name in turbo_vae_mapping:
-                model_path, subfolder
-                self.vae = ShapeVAE.from_pretrained(
-                    model_path, subfolder=subfolder,
-                    use_safetensors=self.kwargs['from_pretrained_kwargs']['use_safetensors'],
-                    device=self.device,
-                )
+                self.vae = ShapeVAE.from_pretrained(model_path, subfolder=turbo_vae_mapping[model_name])
             self.vae.enable_flashvdm_decoder(
                 enabled=enabled,
                 adaptive_kv_selection=adaptive_kv_selection,
@@ -282,146 +294,33 @@
         else:
             model_path = self.kwargs['from_pretrained_kwargs']['model_path']
             vae_mapping = {
-                'Hunyuan3D-2':
-                'Hunyuan3D-
-                'Hunyuan3D-2mini': ('tencent/Hunyuan3D-2mini', 'hunyuan3d-vae-v2-mini'),
+                'Hunyuan3D-2': 'hunyuan3d-vae-v2-0',
+                'Hunyuan3D-2s': 'hunyuan3d-vae-v2-s'
             }
             model_name = model_path.split('/')[-1]
             if model_name in vae_mapping:
-                model_path, subfolder
-                self.vae = ShapeVAE.from_pretrained(model_path, subfolder=subfolder)
+                self.vae = ShapeVAE.from_pretrained(model_path, subfolder=vae_mapping[model_name])
             self.vae.enable_flashvdm_decoder(enabled=False)
 
     def to(self, device=None, dtype=None):
-        if dtype is not None:
-            self.dtype = dtype
-            self.vae.to(dtype=dtype)
-            self.model.to(dtype=dtype)
-            self.conditioner.to(dtype=dtype)
         if device is not None:
             self.device = torch.device(device)
             self.vae.to(device)
             self.model.to(device)
             self.conditioner.to(device)
-
-    @property
-    def _execution_device(self):
-        r"""
-        Returns the device on which the pipeline's models will be executed. After calling
-        [`~DiffusionPipeline.enable_sequential_cpu_offload`] the execution device can only be inferred from
-        Accelerate's module hooks.
-        """
-        for name, model in self.components.items():
-            if not isinstance(model, torch.nn.Module) or name in self._exclude_from_cpu_offload:
-                continue
-
-            if not hasattr(model, "_hf_hook"):
-                return self.device
-            for module in model.modules():
-                if (
-                    hasattr(module, "_hf_hook")
-                    and hasattr(module._hf_hook, "execution_device")
-                    and module._hf_hook.execution_device is not None
-                ):
-                    return torch.device(module._hf_hook.execution_device)
-        return self.device
-
-    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
-        r"""
-        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
-        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
-        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
-        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
-
-        Arguments:
-            gpu_id (`int`, *optional*):
-                The ID of the accelerator that shall be used in inference. If not specified, it will default to 0.
-            device (`torch.Device` or `str`, *optional*, defaults to "cuda"):
-                The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will
-                default to "cuda".
-        """
-        if self.model_cpu_offload_seq is None:
-            raise ValueError(
-                "Model CPU offload cannot be enabled because no `model_cpu_offload_seq` class attribute is set."
-            )
-
-        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
-            from accelerate import cpu_offload_with_hook
-        else:
-            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
-
-        torch_device = torch.device(device)
-        device_index = torch_device.index
-
-        if gpu_id is not None and device_index is not None:
-            raise ValueError(
-                f"You have passed both `gpu_id`={gpu_id} and an index as part of the passed device `device`={device}"
-                f"Cannot pass both. Please make sure to either not define `gpu_id` or not pass the index as part of the device: `device`={torch_device.type}"
-            )
-
-        # _offload_gpu_id should be set to passed gpu_id (or id in passed `device`) or default to previously set id or default to 0
-        self._offload_gpu_id = gpu_id or torch_device.index or getattr(self, "_offload_gpu_id", 0)
-
-        device_type = torch_device.type
-        device = torch.device(f"{device_type}:{self._offload_gpu_id}")
-
-        if self.device.type != "cpu":
-            self.to("cpu")
-            device_mod = getattr(torch, self.device.type, None)
-            if hasattr(device_mod, "empty_cache") and device_mod.is_available():
-                device_mod.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
-
-        all_model_components = {k: v for k, v in self.components.items() if isinstance(v, torch.nn.Module)}
-
-        self._all_hooks = []
-        hook = None
-        for model_str in self.model_cpu_offload_seq.split("->"):
-            model = all_model_components.pop(model_str, None)
-            if not isinstance(model, torch.nn.Module):
-                continue
-
-            _, hook = cpu_offload_with_hook(model, device, prev_module_hook=hook)
-            self._all_hooks.append(hook)
-
-        # CPU offload models that are not in the seq chain unless they are explicitly excluded
-        # these models will stay on CPU until maybe_free_model_hooks is called
-        # some models cannot be in the seq chain because they are iteratively called, such as controlnet
-        for name, model in all_model_components.items():
-            if not isinstance(model, torch.nn.Module):
-                continue
-
-            if name in self._exclude_from_cpu_offload:
-                model.to(device)
-            else:
-                _, hook = cpu_offload_with_hook(model, device)
-                self._all_hooks.append(hook)
-
-    def maybe_free_model_hooks(self):
-        r"""
-        Function that offloads all components, removes all model hooks that were added when using
-        `enable_model_cpu_offload` and then applies them again. In case the model has not been offloaded this function
-        is a no-op. Make sure to add this function to the end of the `__call__` function of your pipeline so that it
-        functions correctly when applying enable_model_cpu_offload.
-        """
-        if not hasattr(self, "_all_hooks") or len(self._all_hooks) == 0:
-            # `enable_model_cpu_offload` has not be called, so silently do nothing
-            return
-
-        for hook in self._all_hooks:
-            # offload model and remove hook from model
-            hook.offload()
-            hook.remove()
-
-        # make sure the model is in the same state as before calling it
-        self.enable_model_cpu_offload()
+        if dtype is not None:
+            self.dtype = dtype
+            self.vae.to(dtype=dtype)
+            self.model.to(dtype=dtype)
+            self.conditioner.to(dtype=dtype)
 
     @synchronize_timer('Encode cond')
-    def encode_cond(self, image,
+    def encode_cond(self, image, mask, do_classifier_free_guidance, dual_guidance):
         bsz = image.shape[0]
-        cond = self.conditioner(image=image,
+        cond = self.conditioner(image=image, mask=mask)
 
         if do_classifier_free_guidance:
-            un_cond = self.conditioner.unconditional_embedding(bsz
+            un_cond = self.conditioner.unconditional_embedding(bsz)
 
         if dual_guidance:
             un_cond_drop_main = copy.deepcopy(un_cond)
@@ -437,6 +336,8 @@ class Hunyuan3DDiTPipeline:
 
             cond = cat_recursive(cond, un_cond_drop_main, un_cond)
         else:
+            un_cond = self.conditioner.unconditional_embedding(bsz, **additional_cond_inputs)
+
             def cat_recursive(a, b):
                 if isinstance(a, torch.Tensor):
                     return torch.cat([a, b], dim=0).to(self.dtype)
@@ -482,27 +383,25 @@
         latents = latents * getattr(self.scheduler, 'init_noise_sigma', 1.0)
         return latents
 
-    def prepare_image(self, image)
+    def prepare_image(self, image):
         if isinstance(image, str) and not os.path.exists(image):
             raise FileNotFoundError(f"Couldn't find image at path {image}")
 
         if not isinstance(image, list):
             image = [image]
-
-
+        image_pts = []
+        mask_pts = []
         for img in image:
-
-
-
-        cond_input = {k: [] for k in outputs[0].keys()}
-        for output in outputs:
-            for key, value in output.items():
-                cond_input[key].append(value)
-        for key, value in cond_input.items():
-            if isinstance(value[0], torch.Tensor):
-                cond_input[key] = torch.cat(value, dim=0)
+            image_pt, mask_pt = self.image_processor(img, return_mask=True)
+            image_pts.append(image_pt)
+            mask_pts.append(mask_pt)
 
-
+        image_pts = torch.cat(image_pts, dim=0).to(self.device, dtype=self.dtype)
+        if mask_pts[0] is not None:
+            mask_pts = torch.cat(mask_pts, dim=0).to(self.device, dtype=self.dtype)
+        else:
+            mask_pts = None
+        return image_pts, mask_pts
 
     def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
         """
@@ -575,14 +474,10 @@
             getattr(self.model, 'guidance_cond_proj_dim', None) is None
         dual_guidance = dual_guidance_scale >= 0 and dual_guidance
 
-
-
-
-
-            additional_cond_inputs=cond_inputs,
-            do_classifier_free_guidance=do_classifier_free_guidance,
-            dual_guidance=False,
-        )
+        image, mask = self.prepare_image(image)
+        cond = self.encode_cond(image=image,
+                                do_classifier_free_guidance=do_classifier_free_guidance,
+                                dual_guidance=dual_guidance)
         batch_size = image.shape[0]
 
         t_dtype = torch.long
@@ -640,17 +535,7 @@
             box_v, mc_level, num_chunks, octree_resolution, mc_algo,
         )
 
-    def _export(
-        self,
-        latents,
-        output_type='trimesh',
-        box_v=1.01,
-        mc_level=0.0,
-        num_chunks=20000,
-        octree_resolution=256,
-        mc_algo='mc',
-        enable_pbar=True
-    ):
+    def _export(self, latents, output_type, box_v, mc_level, num_chunks, octree_resolution, mc_algo, enable_pbar=True):
         if not output_type == "latent":
             latents = 1. / self.vae.scale_factor * latents
             latents = self.vae(latents)
@@ -677,7 +562,7 @@ class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline):
     @torch.inference_mode()
     def __call__(
         self,
-        image: Union[str, List[str], Image.Image
+        image: Union[str, List[str], Image.Image] = None,
         num_inference_steps: int = 50,
         timesteps: List[int] = None,
         sigmas: List[float] = None,
@@ -705,11 +590,10 @@
             self.model.guidance_embed is True
         )
 
-
-        image = cond_inputs.pop('image')
+        image, mask = self.prepare_image(image)
         cond = self.encode_cond(
             image=image,
-
+            mask=mask,
             do_classifier_free_guidance=do_classifier_free_guidance,
             dual_guidance=False,
         )
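The loading path now mirrors what gradio_app.py does: from_pretrained first checks the directory pointed to by HY3DGEN_MODELS (default ~/.cache/hy3dgen) and falls back to huggingface_hub.snapshot_download. A minimal usage sketch; the turbo subfolder name, the example image path, and the list-of-meshes return value are assumptions inferred from the app code above, not guaranteed by this diff:

from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline

# Repo id, use_safetensors, and device match the from_pretrained call in gradio_app.py;
# the turbo subfolder name is an assumption.
pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
    'tencent/Hunyuan3D-2mv',
    subfolder='hunyuan3d-dit-v2-mv-turbo',
    use_safetensors=True,
    device='cuda',
)
pipeline.enable_flashvdm(mc_algo='mc')  # same call the app makes when FlashVDM is enabled

# prepare_image/encode_cond now run inside __call__, so a plain image path suffices;
# 5 steps is the Turbo default in the UI. Return format assumed to be a list of meshes.
mesh = pipeline(image='assets/example_images/004.png', num_inference_steps=5)[0]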
hy3dgen/shapegen/postprocessors.py
CHANGED
@@ -12,16 +12,13 @@
 # fine-tuning enabling code and other elements of the foregoing made publicly available
 # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
 
-import os
 import tempfile
 from typing import Union
 
-import numpy as np
 import pymeshlab
-import torch
 import trimesh
 
-from .models.
+from .models.vae import Latent2MeshOutput
 from .utils import synchronize_timer
 
 
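The import cleanup leaves the postprocessors themselves untouched, and they are still exported from hy3dgen.shapegen (see the __init__.py hunk above). A usage sketch based on the upstream Hunyuan3D-2 examples; treat the exact call signatures and the file names as assumptions:

import trimesh
from hy3dgen.shapegen import FaceReducer, FloaterRemover, DegenerateFaceRemover

mesh = trimesh.load('shape.glb')          # hypothetical input mesh
mesh = FloaterRemover()(mesh)             # drop small disconnected components
mesh = DegenerateFaceRemover()(mesh)      # remove degenerate faces
mesh = FaceReducer()(mesh)                # simplify to the default face budget
mesh.export('shape_clean.glb')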
hy3dgen/shapegen/preprocessors.py
CHANGED
@@ -87,7 +87,9 @@ class ImageProcessorV2:
         mask = mask.clip(0, 255).astype(np.uint8)
         return result, mask
 
-    def
+    def __call__(self, image, border_ratio=0.15, to_tensor=True, return_mask=False, **kwargs):
+        if self.border_ratio is not None:
+            border_ratio = self.border_ratio
         if isinstance(image, str):
             image = cv2.imread(image, cv2.IMREAD_UNCHANGED)
         image, mask = self.recenter(image, border_ratio=border_ratio)
@@ -104,64 +106,13 @@ class ImageProcessorV2:
         if to_tensor:
             image = array_to_tensor(image)
             mask = array_to_tensor(mask)
-
-
-
-        if self.border_ratio is not None:
-            border_ratio = self.border_ratio
-        image, mask = self.load_image(image, border_ratio=border_ratio, to_tensor=to_tensor)
-        outputs = {
-            'image': image,
-            'mask': mask
-        }
-        return outputs
-
-
-class MVImageProcessorV2(ImageProcessorV2):
-    """
-    view order: front, front clockwise 90, back, front clockwise 270
-    """
-    return_view_idx = True
-
-    def __init__(self, size=512, border_ratio=None):
-        super().__init__(size, border_ratio)
-        self.view2idx = {
-            'front': 0,
-            'left': 1,
-            'back': 2,
-            'right': 3
-        }
-
-    def __call__(self, image_dict, border_ratio=0.15, to_tensor=True, **kwargs):
-        if self.border_ratio is not None:
-            border_ratio = self.border_ratio
-
-        images = []
-        masks = []
-        view_idxs = []
-        for idx, (view_tag, image) in enumerate(image_dict.items()):
-            view_idxs.append(self.view2idx[view_tag])
-            image, mask = self.load_image(image, border_ratio=border_ratio, to_tensor=to_tensor)
-            images.append(image)
-            masks.append(mask)
-
-        zipped_lists = zip(view_idxs, images, masks)
-        sorted_zipped_lists = sorted(zipped_lists)
-        view_idxs, images, masks = zip(*sorted_zipped_lists)
-
-        image = torch.cat(images, 0).unsqueeze(0)
-        mask = torch.cat(masks, 0).unsqueeze(0)
-        outputs = {
-            'image': image,
-            'mask': mask,
-            'view_idxs': view_idxs
-        }
-        return outputs
+        if return_mask:
+            return image, mask
+        return image
 
 
 IMAGE_PROCESSORS = {
     "v2": ImageProcessorV2,
-    'mv_v2': MVImageProcessorV2,
 }
 
 DEFAULT_IMAGEPROCESSOR = 'v2'
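ImageProcessorV2.__call__ now returns the tensor directly (optionally with its mask) instead of a dict, which is exactly what the new prepare_image in pipelines.py consumes. A short sketch of the single-image path; the constructor arguments follow the (size, border_ratio) signature visible in the removed MVImageProcessorV2, and the example image path is hypothetical:

from hy3dgen.shapegen.preprocessors import ImageProcessorV2

processor = ImageProcessorV2(size=512, border_ratio=0.15)

# With return_mask=True the processor hands back (image, mask) tensors,
# matching the self.image_processor(img, return_mask=True) call in prepare_image.
image_pt, mask_pt = processor('assets/example_images/004.png', return_mask=True)

# The default call returns only the recentered, resized image tensor.
image_pt = processor('assets/example_images/004.png')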