# Denoiser section: `target` names the class path, `params` holds its settings.
# NOTE(review): nesting reconstructed from a flattened source — confirm against the loader.
model:
  target: hy3dshape.models.denoisers.hunyuandit.HunYuanDiTPlain
  params:
    # The `&num_latents` anchor lets other sections reuse this value via `*num_latents`.
    input_size: &num_latents 4096
    in_channels: 64
    hidden_size: 2048
    context_dim: 1024
    depth: 21
    num_heads: 16
    qk_norm: true
    text_len: 1370
    with_decoupled_ca: false
    use_attention_pooling: false
    qk_norm_type: 'rms'
    qkv_bias: false
    use_pos_emb: false
    num_moe_layers: 6
    num_experts: 8
    moe_top_k: 2
# Shape autoencoder section: `target` names the class path, `params` holds its settings.
# NOTE(review): nesting reconstructed from a flattened source — confirm against the loader.
vae:
  target: hy3dshape.models.autoencoders.ShapeVAE
  params:
    # Alias to the `&num_latents` anchor defined on model.params.input_size, so
    # both values stay in sync. The source had this value empty (null) while
    # the anchor was otherwise unused.
    num_latents: *num_latents
    embed_dim: 64
    num_freqs: 8
    include_pi: false
    heads: 16
    width: 1024
    num_encoder_layers: 8
    num_decoder_layers: 16
    qkv_bias: false
    qk_norm: true
    scale_factor: 1.0039506158752403
    geo_decoder_mlp_expand_ratio: 4
    geo_decoder_downsample_ratio: 1
    geo_decoder_ln_post: true
    point_feats: 4
    pc_size: 81920
    pc_sharpedge_size: 0
# Image conditioner section: `target` names the class path, `params` holds its settings.
# NOTE(review): nesting reconstructed from a flattened source — confirm against the loader.
conditioner:
  target: hy3dshape.models.conditioner.SingleImageEncoder
  params:
    main_image_encoder:
      type: DinoImageEncoder # dino large
      kwargs:
        # Encoder model configuration (model_type: dinov2).
        config:
          attention_probs_dropout_prob: 0.0
          drop_path_rate: 0.0
          hidden_act: gelu
          hidden_dropout_prob: 0.0
          hidden_size: 1024
          image_size: 518
          initializer_range: 0.02
          layer_norm_eps: 1.e-6
          layerscale_value: 1.0
          mlp_ratio: 4
          model_type: dinov2
          num_attention_heads: 16
          num_channels: 3
          num_hidden_layers: 24
          patch_size: 14
          qkv_bias: true
          torch_dtype: float32
          use_swiglu_ffn: false
        # `image_size` also appears inside `config`, so these two keys must sit
        # at a different level. NOTE(review): placed beside `config` under
        # `kwargs` — confirm against the encoder's constructor.
        image_size: 518
        use_cls_token: true
# Scheduler section: `target` names the class path, `params` holds its settings.
# NOTE(review): nesting reconstructed from a flattened source — confirm against the loader.
scheduler:
  target: hy3dshape.schedulers.FlowMatchEulerDiscreteScheduler
  params:
    num_train_timesteps: 1000
# Image preprocessor section: `target` names the class path, `params` holds its settings.
# NOTE(review): nesting reconstructed from a flattened source — confirm against the loader.
image_processor:
  target: hy3dshape.preprocessors.ImageProcessorV2
  params:
    size: 512
    border_ratio: 0.15
# Pipeline section: only a `target` class path is given, with no `params`.
pipeline:
  target: hy3dshape.pipelines.Hunyuan3DDiTFlowMatchingPipeline