---
# Training configuration. The output directory name (debug/syncnet) suggests
# this drives a SyncNet-style audio/visual model -- confirm against the
# training script that loads this file.
#
# NOTE(review): the nesting below is reconstructed from the key grouping
# (valueless parent keys followed by their children); verify it against the
# schema the consumer expects.

model:
  audio_encoder:
    in_channels: 1
    # One entry per encoder stage; same length as downsample_factors (6).
    block_out_channels: [64, 128, 256, 256, 512, 1024]
    downsample_factors: [2, 2, 2, 2, 2, 2]
    dropout: 0.0
  visual_encoder:
    # NOTE(review): 75 = 3 x data.num_frames (25); presumably the frames are
    # stacked along the channel axis -- confirm.
    in_channels: 75
    # One entry per encoder stage; same length as downsample_factors (8).
    block_out_channels: [128, 128, 256, 256, 512, 512, 1024, 1024]
    # The first entry is a pair while the rest are scalars; presumably a
    # per-axis factor for the first stage -- confirm with the model code.
    downsample_factors: [[1, 2], 2, 2, 2, 2, 2, 2, 2]
    dropout: 0.0

ckpt:
  # Both checkpoint paths are intentionally empty strings (not null).
  resume_ckpt_path: ""
  inference_ckpt_path: ""
  save_ckpt_steps: 2500

data:
  train_output_dir: debug/syncnet
  num_val_samples: 2048
  batch_size: 64
  num_workers: 11
  latent_space: false
  num_frames: 25
  resolution: 256
  # Absolute, machine-specific paths; adjust for the local environment.
  train_fileslist: /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/hdtf_vox_avatars_ads_affine.txt
  train_data_dir: ""
  val_fileslist: /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/vox_affine_val.txt
  val_data_dir: ""
  audio_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/mel
  lower_half: true
  pretrained_audio_model_path: facebook/wav2vec2-large-xlsr-53
  audio_sample_rate: 16000
  video_fps: 25

optimizer:
  # NOTE(review): YAML 1.1 loaders (e.g. PyYAML) read `1e-5` as a string
  # because it has no decimal point; value left unchanged here -- confirm the
  # consuming loader yields a float, or write it as 1.0e-5.
  lr: 1e-5
  max_grad_norm: 1.0

run:
  max_train_steps: 10000000
  mixed_precision_training: true
  seed: 42