# Training-side configuration: dataset, audio feature extractor, diffusion
# solver, noise scheduler, and UNet motion/audio module settings.
data:
  train_bs: 4                 # per-device training batch size
  val_bs: 1
  train_width: 512            # training frame resolution (W x H)
  train_height: 512
  fps: 25
  sample_rate: 16000          # audio sample rate expected by wav2vec
  n_motion_frames: 2          # preceding frames fed as motion context
  n_sample_frames: 16         # frames sampled per training clip
  audio_margin: 2             # extra audio frames padded on each side
  train_meta_paths:
    - "./data/inference.json"

wav2vec_config:
  audio_type: "vocals"        # audio | vocals
  model_scale: "base"         # base | large
  features: "all"             # last | avg | all
  model_path: "./pretrained_models/chinese-wav2vec2-base"
audio_separator:
  model_path: "./pretrained_models/audio_separator/Kim_Vocal_2.onnx"
face_expand_ratio: 1.2        # face crop expansion around detected bbox

solver:
  gradient_accumulation_steps: 1
  mixed_precision: "no"
  enable_xformers_memory_efficient_attention: true
  gradient_checkpointing: true
  max_train_steps: 30000
  max_grad_norm: 1.0
  # lr
  # NOTE: written as 1.0e-5 (not 1e-5) so YAML-1.1 loaders such as PyYAML
  # resolve it as a float rather than a string; matches 1.0e-2/1.0e-8 below.
  learning_rate: 1.0e-5
  scale_lr: false
  lr_warmup_steps: 1
  lr_scheduler: "constant"
  # optimizer
  use_8bit_adam: true
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_weight_decay: 1.0e-2
  adam_epsilon: 1.0e-8

val:
  validation_steps: 1000      # run validation every N optimizer steps

noise_scheduler_kwargs:
  num_train_timesteps: 1000
  beta_start: 0.00085
  beta_end: 0.012
  beta_schedule: "linear"
  steps_offset: 1
  clip_sample: false

unet_additional_kwargs:
  use_inflated_groupnorm: true
  unet_use_cross_frame_attention: false
  unet_use_temporal_attention: false
  use_motion_module: true
  use_audio_module: true
  motion_module_resolutions:
    - 1
    - 2
    - 4
    - 8
  motion_module_mid_block: true
  motion_module_decoder_only: false
  motion_module_type: Vanilla
  motion_module_kwargs:
    num_attention_heads: 8
    num_transformer_block: 1
    attention_block_types:
      - Temporal_Self
      - Temporal_Self
    temporal_position_encoding: true
    temporal_position_encoding_max_len: 32
    temporal_attention_dim_div: 1
  audio_attention_dim: 768
  stack_enable_blocks_name:
    - "up"
    - "down"
    - "mid"
  stack_enable_blocks_depth: [0, 1, 2, 3]

# Parameter-name substrings selecting which module weights are trainable.
trainable_para:
  - audio_modules
  - motion_modules

base_model_path: "./pretrained_models/stable-diffusion-v1-5"
vae_model_path: "./pretrained_models/sd-vae-ft-mse"
face_analysis_model_path: "./pretrained_models/face_analysis"
mm_path: "./pretrained_models/motion_module/mm_sd_v15_v2.ckpt"

weight_dtype: "fp16"          # [fp16, fp32]
# Inference / guidance configuration: classifier-free-guidance dropout
# ratios, checkpointing, and per-sample reference image + driving audio lists.
uncond_img_ratio: 0.05        # CFG dropout prob. for the reference image
uncond_audio_ratio: 0.05      # CFG dropout prob. for the audio condition
uncond_ia_ratio: 0.05         # CFG dropout prob. for image+audio jointly
start_ratio: 0.05
noise_offset: 0.05
snr_gamma: 5.0                # min-SNR loss weighting gamma
enable_zero_snr: true
stage1_ckpt_dir: "./exp_output/stage1/"
single_inference_times: 10
inference_steps: 40           # diffusion denoising steps at inference
cfg_scale: 3.5                # classifier-free guidance scale
seed: 42
resume_from_checkpoint: "latest"
checkpointing_steps: 500
exp_name: "joyhallo"
output_dir: "./opts"
audio_ckpt_dir: "./pretrained_models/joyhallo/net.pth"

# ref_img_path and audio_path are paired by index: entry i of each list
# forms one inference job.
ref_img_path:
  - "examples/reference_images/1.jpg"
  - "examples/reference_images/2.jpg"
  - "examples/reference_images/3.jpg"
  - "examples/reference_images/4.jpg"
  - "examples/reference_images/5.jpg"
  - "examples/reference_images/6.jpg"
  - "examples/reference_images/7.jpg"
audio_path:
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"