# configs/unet/second_stage.yaml: second-stage UNet training config for LatentSync
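
# The data block points at the SyncNet config used for the sync loss, the training
# filelist, and precomputed audio caches; the cache paths reflect that audio
# embeddings are extracted with Whisper. Training samples are 16-frame clips at
# 256x256 resolution and 25 fps. The /mnt/bn/... paths are machine-specific and
# need to be adapted to your own setup.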
data:
  syncnet_config_path: configs/syncnet/syncnet_16_pixel.yaml
  train_output_dir: debug/unet
  train_fileslist: /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/all_data_v6.txt
  train_data_dir: ""
  audio_embeds_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/whisper_new
  audio_mel_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/mel_new
  val_video_path: assets/demo1_video.mp4
  val_audio_path: assets/demo1_audio.wav
  batch_size: 2 # 8
  num_workers: 11 # 11
  num_frames: 16
  resolution: 256
  mask: fix_mask
  audio_sample_rate: 16000
  video_fps: 25
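
# Training resumes from an existing UNet checkpoint (here the released
# latentsync_unet.pt); a new checkpoint is written every save_ckpt_steps steps.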
ckpt:
  resume_ckpt_path: checkpoints/latentsync_unet.pt
  save_ckpt_steps: 5000
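
# run sets the second-stage training behavior. The loss is a weighted mix of
# reconstruction (1), perceptual (0.1), SyncNet lip-sync (0.05), and TREPA
# temporal-representation-alignment (10) terms; with pixel_space_supervise: true,
# predicted latents are decoded through the VAE so these losses are computed in
# pixel space rather than latent space. use_mixed_noise shares a common noise
# component across the frames of a clip (mixed_noise_alpha sets the mix), which
# is meant to improve temporal consistency.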
run:
  pixel_space_supervise: true
  use_syncnet: true
  sync_loss_weight: 0.05 # 1/283
  perceptual_loss_weight: 0.1 # 0.1
  recon_loss_weight: 1 # 1
  guidance_scale: 1.0 # 1.5 or 1.0
  trepa_loss_weight: 10
  inference_steps: 20
  seed: 1247
  use_mixed_noise: true
  mixed_noise_alpha: 1 # 1
  mixed_precision_training: true
  enable_gradient_checkpointing: false
  enable_xformers_memory_efficient_attention: true
  max_train_steps: 10000000
  max_train_epochs: -1
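
# Constant 1e-5 learning rate with no warmup; scale_lr (disabled here) would
# scale the base rate by the effective batch size, as in diffusers-style
# trainers. Gradients are clipped to norm 1.0.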
optimizer:
  lr: 1e-5
  scale_lr: false
  max_grad_norm: 1.0
  lr_scheduler: constant
  lr_warmup_steps: 0
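
# model builds the 3D UNet (AnimateDiff-style CrossAttn/Down/Up 3D blocks).
# cross_attention_dim: 384 matches the 384-dim features of Whisper tiny,
# consistent with the Whisper audio cache configured above.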
model:
  act_fn: silu
  add_audio_layer: true
  custom_audio_layer: false
  audio_condition_method: cross_attn # Choose between [cross_attn, group_norm]
  attention_head_dim: 8
  block_out_channels: [320, 640, 1280, 1280]
  center_input_sample: false
  cross_attention_dim: 384
  down_block_types:
    [
      "CrossAttnDownBlock3D",
      "CrossAttnDownBlock3D",
      "CrossAttnDownBlock3D",
      "DownBlock3D",
    ]
  mid_block_type: UNetMidBlock3DCrossAttn
  up_block_types:
    [
      "UpBlock3D",
      "CrossAttnUpBlock3D",
      "CrossAttnUpBlock3D",
      "CrossAttnUpBlock3D",
    ]
  downsample_padding: 1
  flip_sin_to_cos: true
  freq_shift: 0
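  # in_channels: 13 is presumably 4 noisy-latent + 4 masked-frame + 4 reference
  # channels plus 1 mask channel in the SD VAE's 4-channel latent space; the
  # commented 49 / 16 pair would be the same layout for a 16-channel VAE
  # (3 * 16 + 1 = 49).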
  in_channels: 13 # 49
  layers_per_block: 2
  mid_block_scale_factor: 1
  norm_eps: 1e-5
  norm_num_groups: 32
  out_channels: 4 # 16
  sample_size: 64
  resnet_time_scale_shift: default # Choose between [default, scale_shift]
  unet_use_cross_frame_attention: false
  unet_use_temporal_attention: false

  # We don't actually use the motion module in the final version of LatentSync.
  # When we started the project we built on the AnimateDiff codebase and tried the
  # motion module, but the results were poor. We decided to leave the code here
  # for possible future use.
  use_motion_module: false
  motion_module_resolutions: [1, 2, 4, 8]
  motion_module_mid_block: false
  motion_module_decoder_only: false
  motion_module_type: Vanilla
  motion_module_kwargs:
    num_attention_heads: 8
    num_transformer_block: 1
    attention_block_types:
      - Temporal_Self
      - Temporal_Self
    temporal_position_encoding: true
    temporal_position_encoding_max_len: 16
    temporal_attention_dim_div: 1
    zero_initialize: true
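    # With use_motion_module: false, the kwargs above are inert. Note that
    # temporal_position_encoding_max_len: 16 matches data.num_frames, so
    # re-enabling the module would assume clips of at most 16 frames.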