Spaces:

svjack
/

LatentSync_ver_1_6

Running

LatentSync_ver_1_6 / configs / syncnet / syncnet_16_latent.yaml

Upload folder using huggingface_hub

bcb05c1 verified 6 months ago

1.24 kB

	model:
	audio_encoder: # input (1, 80, 52)
	in_channels: 1
	block_out_channels: [32, 64, 128, 256, 512, 1024]
	downsample_factors: [[2, 1], 2, 2, 2, 2, [2, 3]]
	attn_blocks: [0, 0, 0, 0, 0, 0]
	dropout: 0.0
	visual_encoder: # input (64, 32, 32)
	in_channels: 64
	block_out_channels: [64, 128, 256, 256, 512, 1024]
	downsample_factors: [2, 2, 2, 1, 2, 2]
	attn_blocks: [0, 0, 0, 0, 0, 0]
	dropout: 0.0

	ckpt:
	resume_ckpt_path: ""
	inference_ckpt_path: ""
	save_ckpt_steps: 2500

	data:
	train_output_dir: output/syncnet
	num_val_samples: 1200
	batch_size: 120 # 40
	num_workers: 11 # 11
	latent_space: true
	num_frames: 16
	resolution: 256
	train_fileslist: ""
	train_data_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/VoxCeleb2/high_visual_quality/train
	val_fileslist: ""
	val_data_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/VoxCeleb2/high_visual_quality/val
	audio_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/mel_new
	lower_half: false
	pretrained_audio_model_path: facebook/wav2vec2-large-xlsr-53
	audio_sample_rate: 16000
	video_fps: 25

	optimizer:
	lr: 1e-5
	max_grad_norm: 1.0

	run:
	max_train_steps: 10000000
	validation_steps: 2500
	mixed_precision_training: true
	seed: 42