# Source: Hugging Face repository ndkhanh95/LatentSync
# File: LatentSync/configs/unet/second_stage.yaml (3.03 kB)
# Commit: aa0c2cb (verified, about 1 month ago) - "Upload 170 files"

	data:
	syncnet_config_path: configs/syncnet/syncnet_16_pixel.yaml
	train_output_dir: debug/unet
	train_fileslist: /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/all_data_v6.txt
	train_data_dir: ""
	audio_embeds_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/whisper_new
	audio_mel_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/mel_new

	val_video_path: assets/demo1_video.mp4
	val_audio_path: assets/demo1_audio.wav
	batch_size: 2 # 8
	num_workers: 11 # 11
	num_frames: 16
	resolution: 256
	mask: fix_mask
	audio_sample_rate: 16000
	video_fps: 25

	ckpt:
	resume_ckpt_path: checkpoints/latentsync_unet.pt
	save_ckpt_steps: 5000

	run:
	pixel_space_supervise: true
	use_syncnet: true
	sync_loss_weight: 0.05 # 1/283
	perceptual_loss_weight: 0.1 # 0.1
	recon_loss_weight: 1 # 1
	guidance_scale: 1.0 # 1.5 or 1.0
	trepa_loss_weight: 10
	inference_steps: 20
	seed: 1247
	use_mixed_noise: true
	mixed_noise_alpha: 1 # 1
	mixed_precision_training: true
	enable_gradient_checkpointing: false
	enable_xformers_memory_efficient_attention: true
	max_train_steps: 10000000
	max_train_epochs: -1

	optimizer:
	lr: 1e-5
	scale_lr: false
	max_grad_norm: 1.0
	lr_scheduler: constant
	lr_warmup_steps: 0

	model:
	act_fn: silu
	add_audio_layer: true
	custom_audio_layer: false
	audio_condition_method: cross_attn # Choose between [cross_attn, group_norm]
	attention_head_dim: 8
	block_out_channels: [320, 640, 1280, 1280]
	center_input_sample: false
	cross_attention_dim: 384
	down_block_types:
	[
	"CrossAttnDownBlock3D",
	"CrossAttnDownBlock3D",
	"CrossAttnDownBlock3D",
	"DownBlock3D",
	]
	mid_block_type: UNetMidBlock3DCrossAttn
	up_block_types:
	[
	"UpBlock3D",
	"CrossAttnUpBlock3D",
	"CrossAttnUpBlock3D",
	"CrossAttnUpBlock3D",
	]
	downsample_padding: 1
	flip_sin_to_cos: true
	freq_shift: 0
	in_channels: 13 # 49
	layers_per_block: 2
	mid_block_scale_factor: 1
	norm_eps: 1e-5
	norm_num_groups: 32
	out_channels: 4 # 16
	sample_size: 64
	resnet_time_scale_shift: default # Choose between [default, scale_shift]
	unet_use_cross_frame_attention: false
	unet_use_temporal_attention: false

	# Actually we don't use the motion module in the final version of LatentSync
	# When we started the project, we used the codebase of AnimateDiff and tried motion module, the results are poor
	# We decied to leave the code here for possible future usage
	use_motion_module: false
	motion_module_resolutions: [1, 2, 4, 8]
	motion_module_mid_block: false
	motion_module_decoder_only: false
	motion_module_type: Vanilla
	motion_module_kwargs:
	num_attention_heads: 8
	num_transformer_block: 1
	attention_block_types:
	- Temporal_Self
	- Temporal_Self
	temporal_position_encoding: true
	temporal_position_encoding_max_len: 16
	temporal_attention_dim_div: 1
	zero_initialize: true