liuhuadai
/

AudioLCM

+# Model section: each `target` is a dotted class path and the sibling `params`
+# mapping holds the settings for it.
+# NOTE(review): presumably consumed by an instantiate-from-config style
+# loader — confirm against the training entry point.
+model:
+  base_learning_rate: 3.0e-06
+  target: ldm.models.diffusion.lcm_audio.LCM_audio
+  params:
+    linear_start: 0.00085
+    linear_end: 0.012
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: image
+    cond_stage_key: caption
+    # mel_dim has the same value (20) as unet_config.in_channels and as
+    # first_stage_config embed_dim / z_channels below.
+    # NOTE(review): these look coupled — confirm before changing one alone.
+    mel_dim: 20
+    mel_length: 312
+    channels: 0
+    # Booleans in this file are spelled in canonical lowercase (true/false).
+    cond_stage_trainable: false
+    conditioning_key: crossattn
+    # NOTE(review): the monitored metric name ends in `_ema` while use_ema is
+    # false below — confirm this metric is still logged in that setup.
+    monitor: val/loss_simple_ema
+    scale_by_std: true
+    use_lcm: true
+    num_ddim_timesteps: 50
+    w_min: 4
+    w_max: 12
+    ckpt_path: ./useful_ckpt/LCM_audio/maa2.ckpt
+    use_ema: false
+    # Learning-rate scheduler settings. Every list below has exactly one
+    # entry; f_max and f_min are both 1.0.
+    scheduler_config:
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps:
+        - 10000
+        cycle_lengths:
+        - 10000000000000
+        f_start:
+        - 1.0e-06
+        f_max:
+        - 1.0
+        f_min:
+        - 1.0
+    # Denoising network settings.
+    unet_config:
+      target: ldm.modules.diffusionmodules.concatDiT.ConcatDiT2MLP
+      params:
+        in_channels: 20
+        context_dim: 1024
+        hidden_size: 576
+        num_heads: 8
+        depth: 4
+        max_len: 1000
+    # First-stage autoencoder settings; it has its own ckpt_path.
+    first_stage_config:
+      target: ldm.models.autoencoder1d.AutoencoderKL
+      params:
+        embed_dim: 20
+        monitor: val/rec_loss
+        ckpt_path: ./useful_ckpt/AutoencoderKL/epoch=000032.ckpt
+        ddconfig:
+          double_z: true
+          # in_channels / out_ch are both 80, the same value as data.mel_num.
+          in_channels: 80
+          out_ch: 80
+          z_channels: 20
+          kernel_size: 5
+          ch: 384
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          num_res_blocks: 2
+          attn_layers:
+          - 3
+          down_layers:
+          - 0
+          dropout: 0.0
+        # The loss target is torch.nn.Identity.
+        # NOTE(review): presumably because the autoencoder is not trained
+        # from this config — confirm.
+        lossconfig:
+          target: torch.nn.Identity
+    # Conditioning (text) encoder settings.
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLAPFLANEmbedder
+      params:
+        weights_path: ./useful_ckpt/FrozenCLAPFLANEmbedder/CLAP_weights_2022.pth
+# Lightning section: logging callback, trainer flags and checkpoint policy.
+lightning:
+  callbacks:
+    # Logger callback; its params include a nested vocoder config with its
+    # own checkpoint path.
+    image_logger:
+      target: main.AudioLogger
+      params:
+        sample_rate: 16000
+        for_specs: true
+        increase_log_steps: false
+        batch_frequency: 5000
+        max_images: 8
+        melvmin: -5
+        melvmax: 1.5
+        vocoder_cfg:
+          target: vocoder.bigvgan.models.VocoderBigVGAN
+          params:
+            ckpt_vocoder: ./useful_ckpt/vocoder/logs/bigvnat16k93.5w
+  trainer:
+    # Booleans are spelled in canonical lowercase, matching the rest of this
+    # section.
+    benchmark: true
+    gradient_clip_val: 1.0
+    replace_sampler_ddp: false
+    max_epochs: 100
+  modelcheckpoint:
+    params:
+      # The monitored quantity is `epoch` with mode `max`, with up to 100
+      # checkpoints kept and a save interval of 3 epochs.
+      monitor: epoch
+      mode: max
+      # Step-based saving is disabled; the epoch interval below is used.
+      # every_n_train_steps: 2000
+      save_top_k: 100
+      every_n_epochs: 3
+# Data section: data-module settings plus train/validation dataset targets.
+data:
+  target: main.SpectrogramDataModuleFromConfig
+  params:
+    batch_size: 8
+    num_workers: 32
+    spec_dir_path: 'ldm/data/tsv_dirs/full_data/caps_struct'
+    mel_num: 80
+    train:
+      target: ldm.data.joinaudiodataset_struct_anylen.JoinSpecsTrain
+      params:
+        # Explicit null: a bare `specs_dataset_cfg:` parses to null as well,
+        # so the loaded value is unchanged; this only makes the empty value
+        # visibly intentional.
+        # NOTE(review): presumably populated by the data module at runtime —
+        # confirm.
+        specs_dataset_cfg: null
+    validation:
+      target: ldm.data.joinaudiodataset_struct_anylen.JoinSpecsValidation
+      params:
+        # Explicit null, same reasoning as the train split above.
+        specs_dataset_cfg: null
+# Test dataset: dataset class path plus the TSV it reads and the crop length.
+test_dataset:
+  target: ldm.data.tsvdataset.TSVDatasetStruct
+  params:
+    # Relative path, quoted like the other path string in this file.
+    tsv_path: 'audiocaps_test_16000_struct.tsv'
+    # 624 is twice model.params.mel_length (312).
+    # NOTE(review): looks related to the autoencoder downsampling — confirm.
+    spec_crop_len: 624