# Configuration with five top-level sections: model, ckpt, data, optimizer, run.
#
# NOTE(review): the nesting below was reconstructed from a flattened copy in
# which every key sat at column 0 and stray `|` lines replaced the original
# spacing. audio_encoder and visual_encoder are placed under `model` because
# they repeat the same key set; confirm against the code that loads this file.

model:
  # The three list-valued settings of each encoder have six entries each;
  # presumably one entry per encoder block -- confirm against the model code.
  audio_encoder:
    in_channels: 1
    block_out_channels: [32, 64, 128, 256, 512, 1024]
    # First and last entries are pairs, the rest are single integers.
    downsample_factors: [[2, 1], 2, 2, 2, 2, [2, 3]]
    attn_blocks: [0, 0, 0, 0, 0, 0]
    dropout: 0.0
  visual_encoder:
    in_channels: 64
    block_out_channels: [64, 128, 256, 256, 512, 1024]
    downsample_factors: [2, 2, 2, 1, 2, 2]
    attn_blocks: [0, 0, 0, 0, 0, 0]
    dropout: 0.0

ckpt:
  # Both paths are empty strings here; presumably "not set" -- confirm how the
  # consumer treats an empty path.
  resume_ckpt_path: ""
  inference_ckpt_path: ""
  save_ckpt_steps: 2500

data:
  train_output_dir: output/syncnet
  num_val_samples: 1200
  batch_size: 120
  num_workers: 11
  latent_space: true
  num_frames: 16
  resolution: 256
  train_fileslist: ""
  # NOTE(review): the three directories below are absolute paths from one
  # specific environment; they will need adjusting on any other machine.
  train_data_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/VoxCeleb2/high_visual_quality/train
  val_fileslist: ""
  val_data_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/VoxCeleb2/high_visual_quality/val
  audio_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/mel_new
  lower_half: false
  pretrained_audio_model_path: facebook/wav2vec2-large-xlsr-53
  audio_sample_rate: 16000
  video_fps: 25

optimizer:
  # Written with an explicit mantissa dot: a bare `1e-5` is resolved as a
  # string (not a float) by YAML 1.1 loaders such as PyYAML. Same value.
  lr: 1.0e-5
  max_grad_norm: 1.0

run:
  max_train_steps: 10000000
  # Same interval as ckpt.save_ckpt_steps (2500).
  validation_steps: 2500
  mixed_precision_training: true
  seed: 42