metadata_root: "./data/metadata/dataset_root.json"
log_directory: "./log/latent_diffusion"
project: "audioldm"
precision: "high"
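
# Values anchored here with &name are reused throughout this file via YAML
# aliases (*name), so shared settings such as the sampling rate stay in sync.
# The latent sizes are consistent with the preprocessing below: 10.24 s of
# 16 kHz audio at hop 160 gives 1024 mel frames, and the VAE compresses the
# 1024 x 64 spectrogram by 4x in each dimension to 256 x 16.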
variables:
  sampling_rate: &sampling_rate 16000
  mel_bins: &mel_bins 64
  latent_embed_dim: &latent_embed_dim 8
  latent_t_size: &latent_t_size 256
  latent_f_size: &latent_f_size 16
  in_channels: &unet_in_channels 8
  optimize_ddpm_parameter: &optimize_ddpm_parameter true
  optimize_gpt: &optimize_gpt true
  warmup_steps: &warmup_steps 2000
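
# Train/val/test splits all come from AudioCaps; class_label_indices names the
# class-label index file (an AudioSet evaluation subset).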
data:
  train: ["audiocaps"]
  val: "audiocaps"
  test: "audiocaps"
  class_label_indices: "audioset_eval_subset"
  dataloader_add_ons: []
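
# Trainer schedule: validate every 5 epochs, checkpoint every 40k steps, stop
# at 800k steps; save_top_k keeps only the best checkpoints by the monitored
# metric.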
step:
  validation_every_n_epochs: 5
  save_checkpoint_every_n_steps: 40000
  max_steps: 800000
  save_top_k: 2
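
# Feature extraction: 10.24 s clips at 16 kHz. A 1024-point STFT with hop 160
# yields 100 frames per second, i.e. 1024 mel frames per clip.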
preprocessing:
  audio:
    sampling_rate: *sampling_rate
    max_wav_value: 32768.0
    duration: 10.24
  stft:
    filter_length: 1024
    hop_length: 160
    win_length: 1024
  mel:
    n_mel_channels: *mel_bins
    mel_fmin: 0
    mel_fmax: 8000
  num_workers: 2
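
# Mixup augmentation is disabled (rate 0.0).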
augmentation:
  mixup: 0.0

addition_params:
  gradient_accumulation_steps: 1
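
# Latent diffusion model: a KL-regularized VAE (first_stage_config) compresses
# mel spectrograms into latents, and a UNet-based DDPM is trained on those
# latents with CLAP text conditioning (cond_stage_config).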
model:
  target: audioldm_train.modules.latent_diffusion.ddpm.LatentDiffusion
  params:
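    # First stage: a pretrained mel-spectrogram VAE reloaded from a checkpoint.
    # In latent diffusion setups the first stage is typically kept frozen while
    # the diffusion model trains.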
    first_stage_config:
      base_learning_rate: 8.0e-06
      target: audioldm_train.modules.latent_encoder.autoencoder.AutoencoderKL
      params:
        reload_from_ckpt: "ckpts/vae_mel_16k_64bins.ckpt"
        sampling_rate: *sampling_rate
        batchsize: 3
        monitor: val/rec_loss
        image_key: fbank
        subband: 1
        embed_dim: *latent_embed_dim
        time_shuffle: 1
        lossconfig:
          target: audioldm_train.losses.LPIPSWithDiscriminator
          params:
            disc_start: 50001
            kl_weight: 1000.0
            disc_weight: 0.5
            disc_in_channels: 1
        ddconfig:
          double_z: true
          mel_bins: *mel_bins
          z_channels: 8
          resolution: 256
          downsample_time: false
          in_channels: 1
          out_ch: 1
          ch: 128
          ch_mult:
            - 1
            - 2
            - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
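
    # Diffusion settings: 1000 timesteps on a linear beta schedule from
    # linear_start to linear_end, epsilon-prediction parameterization, and a
    # 10% chance of dropping the condition during training to enable
    # classifier-free guidance.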
    base_learning_rate: 1.0e-4
    warmup_steps: *warmup_steps
    optimize_ddpm_parameter: *optimize_ddpm_parameter
    sampling_rate: *sampling_rate
    batchsize: 1
    linear_start: 0.0015
    linear_end: 0.0195
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    unconditional_prob_cfg: 0.1
    parameterization: eps
    first_stage_key: fbank
    latent_t_size: *latent_t_size
    latent_f_size: *latent_f_size
    channels: *latent_embed_dim
    monitor: val/loss_simple_ema
    scale_by_std: true
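
    # UNet backbone. Conditioning enters through FiLM layers;
    # extra_film_condition_dim must match the conditioner's embedding size
    # (512 for the CLAP model below).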
    unet_config:
      target: audioldm_train.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 64
        extra_film_condition_dim: 512
        in_channels: *unet_in_channels
        out_channels: *latent_embed_dim
        model_channels: 128
        attention_resolutions:
          - 8
          - 4
          - 2
        num_res_blocks: 2
        channel_mult:
          - 1
          - 2
          - 3
          - 5
        num_head_channels: 32
        use_spatial_transformer: true
        transformer_depth: 1
        extra_sa_layer: false
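
    # Conditioning stage: CLAP embeds the caption text. Because CLAP aligns
    # text and audio in one embedding space, embed_mode can switch between
    # encoding captions ("text") and waveforms ("audio").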
    cond_stage_config:
      film_clap_cond1:
        cond_stage_key: text
        conditioning_key: film
        target: audioldm_train.conditional_models.CLAPAudioEmbeddingClassifierFreev2
        params:
          pretrained_path: ckpts/clap_htsat_tiny.pt
          sampling_rate: 16000
          embed_mode: text
          amodel: HTSAT-tiny
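
    # Sampling at evaluation time: DDIM with 200 steps, classifier-free
    # guidance scale 3.5, and 3 candidate generations per sample.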
    evaluation_params:
      unconditional_guidance_scale: 3.5
      ddim_sampling_steps: 200
      n_candidates_per_samples: 3