---
# SyncNet configuration: encoder architectures, checkpoint handling, dataset
# locations, optimizer settings and run control.
#
# NOTE(review): the nesting below (audio_encoder / visual_encoder under
# `model`, scalar settings under their section key) is inferred from which
# keys carry no value of their own — confirm against the code that reads
# this file.

model:
  audio_encoder:
    in_channels: 1
    # The three lists below each have 7 entries — presumably one per encoder
    # block, so keep their lengths equal when editing (TODO confirm).
    block_out_channels: [32, 64, 128, 256, 512, 1024, 2048]
    # An entry is either a single factor or a two-element [a, b] pair.
    downsample_factors: [[2, 1], 2, 2, 1, 2, 2, [2, 3]]
    attn_blocks: [0, 0, 0, 0, 0, 0, 0]
    dropout: 0.0
  visual_encoder:
    # NOTE(review): 48 equals data.num_frames (16) x 3 — presumably stacked
    # RGB frames; verify before changing either value.
    in_channels: 48
    # The three lists below each have 8 entries — presumably one per encoder
    # block, so keep their lengths equal when editing (TODO confirm).
    block_out_channels: [64, 128, 256, 256, 512, 1024, 2048, 2048]
    downsample_factors: [[1, 2], 2, 2, 2, 2, 2, 2, 2]
    attn_blocks: [0, 0, 0, 0, 0, 0, 0, 0]
    dropout: 0.0

ckpt:
  # Empty string, not null: no checkpoint path is given here.
  resume_ckpt_path: ""
  inference_ckpt_path: checkpoints/latentsync_syncnet.pt
  save_ckpt_steps: 2500

data:
  train_output_dir: debug/syncnet
  num_val_samples: 2048
  batch_size: 128
  num_workers: 11
  latent_space: false
  num_frames: 16
  resolution: 256
  # NOTE(review): the /mnt/bn/... entries below are absolute, machine-specific
  # paths; they need to be overridden on any other host.
  train_fileslist: /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/all_data_v6.txt
  train_data_dir: ""
  val_fileslist: ""
  val_data_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/VoxCeleb2/high_visual_quality/val
  audio_mel_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/mel_new
  lower_half: true
  audio_sample_rate: 16000
  video_fps: 25

optimizer:
  # Written with an explicit mantissa dot: YAML 1.1 loaders (e.g. PyYAML)
  # only resolve exponent notation as a float when a "." is present, and
  # would otherwise load `1e-5` as the string "1e-5".
  lr: 1.0e-5
  max_grad_norm: 1.0

run:
  max_train_steps: 10000000
  validation_steps: 2500
  mixed_precision_training: true
  seed: 42