ndkhanh95
/

LatentSync

LatentSync / configs /syncnet /syncnet_25_pixel.yaml

Upload 170 files

aa0c2cb verified about 1 month ago

1.35 kB

	model:
	audio_encoder: # input (1, 80, 80)
	in_channels: 1
	block_out_channels: [64, 128, 256, 256, 512, 1024]
	downsample_factors: [2, 2, 2, 2, 2, 2]
	dropout: 0.0
	visual_encoder: # input (75, 128, 256)
	in_channels: 75
	block_out_channels: [128, 128, 256, 256, 512, 512, 1024, 1024]
	downsample_factors: [[1, 2], 2, 2, 2, 2, 2, 2, 2]
	dropout: 0.0

	ckpt:
	resume_ckpt_path: ""
	inference_ckpt_path: ""
	save_ckpt_steps: 2500

	data:
	train_output_dir: debug/syncnet
	num_val_samples: 2048
	batch_size: 64 # 64
	num_workers: 11 # 11
	latent_space: false
	num_frames: 25
	resolution: 256
	train_fileslist: /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/hdtf_vox_avatars_ads_affine.txt
	# /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/hdtf_voxceleb_avatars_affine.txt
	train_data_dir: ""
	val_fileslist: /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/vox_affine_val.txt
	# /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/voxceleb_val.txt
	val_data_dir: ""
	audio_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/mel
	lower_half: true
	pretrained_audio_model_path: facebook/wav2vec2-large-xlsr-53
	audio_sample_rate: 16000
	video_fps: 25

	optimizer:
	lr: 1e-5
	max_grad_norm: 1.0

	run:
	max_train_steps: 10000000
	mixed_precision_training: true
	seed: 42