---
# Dataset / dataloader settings.
data:
  train_bs: 4
  val_bs: 1
  train_width: 512
  train_height: 512
  fps: 25
  sample_rate: 16000
  n_motion_frames: 2
  n_sample_frames: 16
  audio_margin: 2
  train_meta_paths:
    - "./data/inference.json"

# NOTE(review): nesting reconstructed from key grouping — wav2vec_config,
# audio_separator and face_expand_ratio read as three top-level entries,
# each model_path under its own section; confirm against the consuming loader.
wav2vec_config:
  audio_type: "vocals" # audio vocals
  model_scale: "base" # base large
  features: "all" # last avg all
  model_path: "./pretrained_models/chinese-wav2vec2-base"
audio_separator:
  model_path: "./pretrained_models/audio_separator/Kim_Vocal_2.onnx"
face_expand_ratio: 1.2

# Training loop / optimizer settings.
solver:
  gradient_accumulation_steps: 1
  # quoted on purpose: a bare `no` is boolean false under YAML 1.1 loaders
  mixed_precision: "no"
  enable_xformers_memory_efficient_attention: true
  gradient_checkpointing: true
  max_train_steps: 30000
  max_grad_norm: 1.0

  # lr
  # dotted exponent form: strict YAML 1.1 loaders resolve bare `1e-5` as a
  # string, not a float; matches adam_weight_decay/adam_epsilon below
  learning_rate: 1.0e-5
  scale_lr: false
  lr_warmup_steps: 1
  lr_scheduler: "constant"

  # optimizer
  use_8bit_adam: true
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_weight_decay: 1.0e-2
  adam_epsilon: 1.0e-8

val:
  validation_steps: 1000

noise_scheduler_kwargs:
  num_train_timesteps: 1000
  beta_start: 0.00085
  beta_end: 0.012
  beta_schedule: "linear"
  steps_offset: 1
  clip_sample: false

unet_additional_kwargs:
  use_inflated_groupnorm: true
  unet_use_cross_frame_attention: false
  unet_use_temporal_attention: false
  use_motion_module: true
  use_audio_module: true
  motion_module_resolutions:
    - 1
    - 2
    - 4
    - 8
  motion_module_mid_block: true
  motion_module_decoder_only: false
  motion_module_type: Vanilla
  motion_module_kwargs:
    num_attention_heads: 8
    num_transformer_block: 1
    attention_block_types:
      - Temporal_Self
      - Temporal_Self
    temporal_position_encoding: true
    temporal_position_encoding_max_len: 32
    temporal_attention_dim_div: 1
  audio_attention_dim: 768
  stack_enable_blocks_name:
    - "up"
    - "down"
    - "mid"
  stack_enable_blocks_depth: [0, 1, 2, 3]

trainable_para:
  - audio_modules
  - motion_modules

# Pretrained model checkpoints.
base_model_path: "./pretrained_models/stable-diffusion-v1-5"
vae_model_path: "./pretrained_models/sd-vae-ft-mse"
face_analysis_model_path: "./pretrained_models/face_analysis"
mm_path: "./pretrained_models/motion_module/mm_sd_v15_v2.ckpt"

weight_dtype: "fp16" # [fp16, fp32]
uncond_img_ratio: 0.05
uncond_audio_ratio: 0.05
uncond_ia_ratio: 0.05
start_ratio: 0.05
noise_offset: 0.05
snr_gamma: 5.0
enable_zero_snr: true
stage1_ckpt_dir: "./exp_output/stage1/"

single_inference_times: 10
inference_steps: 40
cfg_scale: 3.5

seed: 42
resume_from_checkpoint: "latest"
checkpointing_steps: 500

exp_name: "joyhallo"
output_dir: "./opts"

audio_ckpt_dir: "./pretrained_models/joyhallo/net.pth"

ref_img_path:
  - "examples/reference_images/1.jpg"
  - "examples/reference_images/2.jpg"
  - "examples/reference_images/3.jpg"
  - "examples/reference_images/4.jpg"
  - "examples/reference_images/5.jpg"
  - "examples/reference_images/6.jpg"
  - "examples/reference_images/7.jpg"

audio_path:
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"