{
  "train_config": {
    "output_directory": "outdir_pp_model",
    "epochs": 10000000,
    "optim_algo": "RAdam",
    "learning_rate": 0.001,
    "weight_decay": 1e-06,
    "sigma": 1.0,
    "iters_per_checkpoint": 1000,
    "batch_size": 16,
    "seed": null,
    "checkpoint_path": "",
    "ignore_layers": [],
    "ignore_layers_warmstart": [],
    "finetune_layers": [],
    "include_layers": [],
    "vocoder_config_path": "models/hifigan_22khz_config.json",
    "vocoder_checkpoint_path": "models/hifigan_ljs_generator_v1.pt",
    "log_attribute_samples": true,
    "log_decoder_samples": true,
    "warmstart_checkpoint_path": "outdir_pp/model_100000",
    "use_amp": true,
    "grad_clip_val": 1.0,
    "loss_weights": {
      "blank_logprob": -1,
      "ctc_loss_weight": 0.1,
      "binarization_loss_weight": 1.0,
      "dur_loss_weight": 1.0,
      "f0_loss_weight": 1.0,
      "energy_loss_weight": 1.0,
      "vpred_loss_weight": 1.0
    },
    "binarization_start_iter": 0,
    "kl_loss_start_iter": 0,
    "unfreeze_modules": "all"
  },
  "data_config": {
    "training_files": {
      "LJS": {
        "basedir": "filelists/",
        "audiodir": "wavs",
        "filelist": "3speakers_ukrainian_train_filelist_dc.txt",
        "lmdbpath": ""
      }
    },
    "validation_files": {
      "LJS": {
        "basedir": "filelists/",
        "audiodir": "wavs",
        "filelist": "3speakers_ukrainian_val_filelist_dc.txt",
        "lmdbpath": ""
      }
    },
    "dur_min": 0.1,
    "dur_max": 10.2,
    "sampling_rate": 22050,
    "filter_length": 1024,
    "hop_length": 256,
    "win_length": 1024,
    "n_mel_channels": 80,
    "mel_fmin": 0.0,
    "mel_fmax": 8000.0,
    "f0_min": 80.0,
    "f0_max": 640.0,
    "max_wav_value": 32768.0,
    "use_f0": true,
    "use_log_f0": 0,
    "use_energy_avg": true,
    "use_scaled_energy": true,
    "symbol_set": "ukrainian",
    "cleaner_names": [
      "ukrainian_cleaners"
    ],
    "heteronyms_path": "tts_text_processing/heteronyms",
    "phoneme_dict_path": "tts_text_processing/cmudict-0.7b",
    "p_phoneme": 0.0,
    "handle_phoneme": "word",
    "handle_phoneme_ambiguous": "ignore",
    "include_speakers": null,
    "n_frames": -1,
    "betabinom_cache_path": "/home/dmytro_chaplinsky/RAD-TTS/radtts-code/cache",
    "lmdb_cache_path": "",
    "use_attn_prior_masking": true,
    "prepend_space_to_text": true,
    "append_space_to_text": true,
    "add_bos_eos_to_text": false,
    "betabinom_scaling_factor": 1.0,
    "distance_tx_unvoiced": false,
    "mel_noise_scale": 0.0
  },
  "dist_config": {
    "dist_backend": "nccl",
    "dist_url": "tcp://localhost:54321"
  },
  "model_config": {
    "n_speakers": 3,
    "n_speaker_dim": 16,
    "n_text": 185,
    "n_text_dim": 512,
    "n_flows": 8,
    "n_conv_layers_per_step": 4,
    "n_mel_channels": 80,
    "n_hidden": 1024,
    "mel_encoder_n_hidden": 512,
    "dummy_speaker_embedding": false,
    "n_early_size": 2,
    "n_early_every": 2,
    "n_group_size": 2,
    "affine_model": "wavenet",
    "include_modules": "decatndpmvpredapm",
    "scaling_fn": "tanh",
    "matrix_decomposition": "LUS",
    "learn_alignments": true,
    "use_speaker_emb_for_alignment": false,
    "attn_straight_through_estimator": true,
    "use_context_lstm": true,
    "context_lstm_norm": "spectral",
    "context_lstm_w_f0_and_energy": true,
    "text_encoder_lstm_norm": "spectral",
    "n_f0_dims": 1,
    "n_energy_avg_dims": 1,
    "use_first_order_features": false,
    "unvoiced_bias_activation": "relu",
    "decoder_use_partial_padding": true,
    "decoder_use_unvoiced_bias": true,
    "ap_pred_log_f0": true,
    "ap_use_unvoiced_bias": false,
    "ap_use_voiced_embeddings": true,
    "dur_model_config": {
      "name": "dap",
      "hparams": {
        "n_speaker_dim": 16,
        "bottleneck_hparams": {
          "in_dim": 512,
          "reduction_factor": 16,
          "norm": "weightnorm",
          "non_linearity": "relu"
        },
        "take_log_of_input": true,
        "arch_hparams": {
          "out_dim": 1,
          "n_layers": 2,
          "n_channels": 256,
          "kernel_size": 3,
          "p_dropout": 0.25,
          "in_dim": 48
        }
      }
    },
    "f0_model_config": {
      "name": "dap",
      "hparams": {
        "n_speaker_dim": 16,
        "bottleneck_hparams": {
          "in_dim": 512,
          "reduction_factor": 16,
          "norm": "weightnorm",
          "non_linearity": "relu"
        },
        "take_log_of_input": false,
        "use_transformer": false,
        "arch_hparams": {
          "out_dim": 1,
          "n_layers": 2,
          "n_channels": 256,
          "kernel_size": 11,
          "p_dropout": 0.5,
          "in_dim": 48
        }
      }
    },
    "energy_model_config": {
      "name": "dap",
      "hparams": {
        "n_speaker_dim": 16,
        "bottleneck_hparams": {
          "in_dim": 512,
          "reduction_factor": 16,
          "norm": "weightnorm",
          "non_linearity": "relu"
        },
        "take_log_of_input": false,
        "use_transformer": false,
        "arch_hparams": {
          "out_dim": 1,
          "n_layers": 2,
          "n_channels": 256,
          "kernel_size": 3,
          "p_dropout": 0.25,
          "in_dim": 48
        }
      }
    },
    "v_model_config": {
      "name": "dap",
      "hparams": {
        "n_speaker_dim": 16,
        "take_log_of_input": false,
        "bottleneck_hparams": {
          "in_dim": 512,
          "reduction_factor": 16,
          "norm": "weightnorm",
          "non_linearity": "relu"
        },
        "arch_hparams": {
          "out_dim": 1,
          "n_layers": 2,
          "n_channels": 256,
          "kernel_size": 3,
          "p_dropout": 0.5,
          "lstm_type": "",
          "use_linear": 1,
          "in_dim": 48
        }
      }
    }
  }
}