radtts-uk-bigvgan / configs /radtts-pp-dap-model.json
Yehor's picture
Init
ea6a7ed
{
"train_config": {
"output_directory": "outdir_pp_model",
"epochs": 10000000,
"optim_algo": "RAdam",
"learning_rate": 0.001,
"weight_decay": 1e-06,
"sigma": 1.0,
"iters_per_checkpoint": 1000,
"batch_size": 16,
"seed": null,
"checkpoint_path": "",
"ignore_layers": [],
"ignore_layers_warmstart": [],
"finetune_layers": [],
"include_layers": [],
"vocoder_config_path": "models/hifigan_22khz_config.json",
"vocoder_checkpoint_path": "models/hifigan_ljs_generator_v1.pt",
"log_attribute_samples": true,
"log_decoder_samples": true,
"warmstart_checkpoint_path": "outdir_pp/model_100000",
"use_amp": true,
"grad_clip_val": 1.0,
"loss_weights": {
"blank_logprob": -1,
"ctc_loss_weight": 0.1,
"binarization_loss_weight": 1.0,
"dur_loss_weight": 1.0,
"f0_loss_weight": 1.0,
"energy_loss_weight": 1.0,
"vpred_loss_weight": 1.0
},
"binarization_start_iter": 0,
"kl_loss_start_iter": 0,
"unfreeze_modules": "all"
},
"data_config": {
"training_files": {
"LJS": {
"basedir": "filelists/",
"audiodir": "wavs",
"filelist": "3speakers_ukrainian_train_filelist_dc.txt",
"lmdbpath": ""
}
},
"validation_files": {
"LJS": {
"basedir": "filelists/",
"audiodir": "wavs",
"filelist": "3speakers_ukrainian_val_filelist_dc.txt",
"lmdbpath": ""
}
},
"dur_min": 0.1,
"dur_max": 10.2,
"sampling_rate": 22050,
"filter_length": 1024,
"hop_length": 256,
"win_length": 1024,
"n_mel_channels": 80,
"mel_fmin": 0.0,
"mel_fmax": 8000.0,
"f0_min": 80.0,
"f0_max": 640.0,
"max_wav_value": 32768.0,
"use_f0": true,
"use_log_f0": 0,
"use_energy_avg": true,
"use_scaled_energy": true,
"symbol_set": "ukrainian",
"cleaner_names": [
"ukrainian_cleaners"
],
"heteronyms_path": "tts_text_processing/heteronyms",
"phoneme_dict_path": "tts_text_processing/cmudict-0.7b",
"p_phoneme": 0.0,
"handle_phoneme": "word",
"handle_phoneme_ambiguous": "ignore",
"include_speakers": null,
"n_frames": -1,
"betabinom_cache_path": "/home/dmytro_chaplinsky/RAD-TTS/radtts-code/cache",
"lmdb_cache_path": "",
"use_attn_prior_masking": true,
"prepend_space_to_text": true,
"append_space_to_text": true,
"add_bos_eos_to_text": false,
"betabinom_scaling_factor": 1.0,
"distance_tx_unvoiced": false,
"mel_noise_scale": 0.0
},
"dist_config": {
"dist_backend": "nccl",
"dist_url": "tcp://localhost:54321"
},
"model_config": {
"n_speakers": 3,
"n_speaker_dim": 16,
"n_text": 185,
"n_text_dim": 512,
"n_flows": 8,
"n_conv_layers_per_step": 4,
"n_mel_channels": 80,
"n_hidden": 1024,
"mel_encoder_n_hidden": 512,
"dummy_speaker_embedding": false,
"n_early_size": 2,
"n_early_every": 2,
"n_group_size": 2,
"affine_model": "wavenet",
"include_modules": "decatndpmvpredapm",
"scaling_fn": "tanh",
"matrix_decomposition": "LUS",
"learn_alignments": true,
"use_speaker_emb_for_alignment": false,
"attn_straight_through_estimator": true,
"use_context_lstm": true,
"context_lstm_norm": "spectral",
"context_lstm_w_f0_and_energy": true,
"text_encoder_lstm_norm": "spectral",
"n_f0_dims": 1,
"n_energy_avg_dims": 1,
"use_first_order_features": false,
"unvoiced_bias_activation": "relu",
"decoder_use_partial_padding": true,
"decoder_use_unvoiced_bias": true,
"ap_pred_log_f0": true,
"ap_use_unvoiced_bias": false,
"ap_use_voiced_embeddings": true,
"dur_model_config": {
"name": "dap",
"hparams": {
"n_speaker_dim": 16,
"bottleneck_hparams": {
"in_dim": 512,
"reduction_factor": 16,
"norm": "weightnorm",
"non_linearity": "relu"
},
"take_log_of_input": true,
"arch_hparams": {
"out_dim": 1,
"n_layers": 2,
"n_channels": 256,
"kernel_size": 3,
"p_dropout": 0.25,
"in_dim": 48
}
}
},
"f0_model_config": {
"name": "dap",
"hparams": {
"n_speaker_dim": 16,
"bottleneck_hparams": {
"in_dim": 512,
"reduction_factor": 16,
"norm": "weightnorm",
"non_linearity": "relu"
},
"take_log_of_input": false,
"use_transformer": false,
"arch_hparams": {
"out_dim": 1,
"n_layers": 2,
"n_channels": 256,
"kernel_size": 11,
"p_dropout": 0.5,
"in_dim": 48
}
}
},
"energy_model_config": {
"name": "dap",
"hparams": {
"n_speaker_dim": 16,
"bottleneck_hparams": {
"in_dim": 512,
"reduction_factor": 16,
"norm": "weightnorm",
"non_linearity": "relu"
},
"take_log_of_input": false,
"use_transformer": false,
"arch_hparams": {
"out_dim": 1,
"n_layers": 2,
"n_channels": 256,
"kernel_size": 3,
"p_dropout": 0.25,
"in_dim": 48
}
}
},
"v_model_config": {
"name": "dap",
"hparams": {
"n_speaker_dim": 16,
"take_log_of_input": false,
"bottleneck_hparams": {
"in_dim": 512,
"reduction_factor": 16,
"norm": "weightnorm",
"non_linearity": "relu"
},
"arch_hparams": {
"out_dim": 1,
"n_layers": 2,
"n_channels": 256,
"kernel_size": 3,
"p_dropout": 0.5,
"lstm_type": "",
"use_linear": 1,
"in_dim": 48
}
}
}
}
}