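# Text-to-audio training config (Make-An-Audio-3-style stack): a CFM
# (conditional flow-matching) latent model over mel-spectrogram latents,
# with a Large-DiT backbone, a pretrained 1-D mel VAE first stage, and a
# frozen FLAN-T5 text encoder.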
model:
  base_learning_rate: 3.0e-06
  target: ldm.models.diffusion.cfm1_audio.CFM
  params:
    linear_start: 0.00085
    linear_end: 0.012
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: image
    cond_stage_key: caption
    mel_dim: 20
    mel_length: 256
    channels: 0
    cond_stage_trainable: true
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_by_std: true
    use_ema: false
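    # LR schedule (reading of ldm's LambdaLinearScheduler): the multiplier ramps
    # linearly from f_start to f_max over the first 10k warm-up steps; with
    # f_max == f_min == 1.0 it then stays constant for the (effectively
    # infinite) cycle length.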
    scheduler_config:
      target: ldm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps:
          - 10000
        cycle_lengths:
          - 10000000000000
        f_start:
          - 1.0e-06
        f_max:
          - 1.0
        f_min:
          - 1.0
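    # Backbone: a text-conditioned Large-DiT (the `unet_config` key name is
    # inherited from the LDM codebase). 16 blocks, 32 heads, hidden size 768,
    # cross-attending to 1024-dim text features (matching the encoder below).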
    unet_config:
      target: ldm.modules.diffusionmodules.flag_large_dit.TxtFlagLargeImprovedDiTV2
      params:
        in_channels: 20
        context_dim: 1024
        hidden_size: 768
        num_heads: 32
        depth: 16
        max_len: 1000
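    # First stage: pretrained 1-D KL autoencoder that maps 80-bin mel
    # spectrograms to 20-channel latents. lossconfig is torch.nn.Identity
    # because the autoencoder is loaded from ckpt_path and not trained further.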
    first_stage_config:
      target: ldm.models.autoencoder1d.AutoencoderKL
      params:
        embed_dim: 20
        monitor: val/rec_loss
        ckpt_path: useful_ckpts/maa2/maa2.ckpt
        ddconfig:
          double_z: true
          in_channels: 80
          out_ch: 80
          z_channels: 20
          kernel_size: 5
          ch: 384
          ch_mult:
            - 1
            - 2
            - 4
          num_res_blocks: 2
          attn_layers:
            - 3
          down_layers:
            - 0
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
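    # Text encoder: frozen FLAN-T5 (the commented-out path points at
    # flan-t5-large, whose 1024-dim hidden states match context_dim above).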
    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenFLANEmbedder
      # params:
      #   version: /root/autodl-tmp/liuhuadai/audio_util/Make-An-Audio-3/useful_ckpts/flan-t5-large/pytorch_model.bin
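# Logging: AudioLogger periodically renders generated mel spectrograms and,
# via the vocoder below, the corresponding 16 kHz audio.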
lightning:
  callbacks:
    image_logger:
      target: main.AudioLogger
      params:
        sample_rate: 16000
        for_specs: true
        increase_log_steps: false
        batch_frequency: 5000
        max_images: 8
        melvmin: -5
        melvmax: 1.5
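        # The vocoder is used only by the logger, to turn logged spectrograms
        # back into waveforms; it is not trained here.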
        vocoder_cfg:
          target: vocoder.bigvgan.models.VocoderBigVGAN
          params:
            ckpt_vocoder: useful_ckpts/bigvnat
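  # Trainer: the validation loop is disabled (limit_val_batches: 0.0).
  # Checkpoints are keyed on epoch number with mode: max, so
  # every_n_epochs/save_top_k effectively keep the 10 most recent
  # every-5-epoch snapshots.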
  trainer:
    benchmark: true
    gradient_clip_val: 1.0
    limit_val_batches: 0.0
  modelcheckpoint:
    params:
      monitor: epoch
      mode: max
      save_top_k: 10
      every_n_epochs: 5
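# Data: AudioSet-style TSV manifests of (audio, caption) pairs; spectrograms
# are cropped to 624 frames. drop is presumably the probability of dropping
# the caption (for classifier-free guidance): 0.2 in training, 0.0 in validation.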
data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 16
    num_workers: 20
    wrap: true
    train:
      target: ldm.data.txt_spec_dataset.spec_join_Dataset_audioset_Train
      params:
        dataset_cfg:
          dataset_name: data/audioset_new_intern.tsv
          spec_crop_len: 624
          drop: 0.2
    validation:
      target: ldm.data.txt_spec_dataset.spec_join_Dataset_audioset_Valid
      params:
        dataset_cfg:
          dataset_name: data/audioset_new_intern.tsv
          spec_crop_len: 624
          drop: 0.0
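    # Held-out test set: MusicCaps captions at 16 kHz.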
    test_dataset:
      target: ldm.data.joinaudiodataset_struct_sample_anylen.TestManifest
      params:
        manifest: data/musiccaps_test_16000_struct.tsv
        spec_crop_len: 624