# Scraped from the Hugging Face Space "pillaryao/demo" (status: Running).
# File: demo/configs/syncnet/syncnet_16_pixel.yaml (1.25 kB)
# Commit: acecd1b (verified, 15 days ago)
# Commit message: "Upload folder using huggingface_hub"

	model:
	audio_encoder: # input (1, 80, 52)
	in_channels: 1
	block_out_channels: [32, 64, 128, 256, 512, 1024, 2048]
	downsample_factors: [[2, 1], 2, 2, 1, 2, 2, [2, 3]]
	attn_blocks: [0, 0, 0, 0, 0, 0, 0]
	dropout: 0.0
	visual_encoder: # input (48, 128, 256)
	in_channels: 48
	block_out_channels: [64, 128, 256, 256, 512, 1024, 2048, 2048]
	downsample_factors: [[1, 2], 2, 2, 2, 2, 2, 2, 2]
	attn_blocks: [0, 0, 0, 0, 0, 0, 0, 0]
	dropout: 0.0

	ckpt:
	resume_ckpt_path: ""
	inference_ckpt_path: checkpoints/latentsync_syncnet.pt
	save_ckpt_steps: 2500

	data:
	train_output_dir: debug/syncnet
	num_val_samples: 2048
	batch_size: 128 # 128
	num_workers: 11 # 11
	latent_space: false
	num_frames: 16
	resolution: 256
	train_fileslist: /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/all_data_v6.txt
	train_data_dir: ""
	val_fileslist: ""
	val_data_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/VoxCeleb2/high_visual_quality/val
	audio_mel_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/mel_new
	lower_half: true
	audio_sample_rate: 16000
	video_fps: 25

	optimizer:
	lr: 1e-5
	max_grad_norm: 1.0

	run:
	max_train_steps: 10000000
	validation_steps: 2500
	mixed_precision_training: true
	seed: 42