shethjenil commited on
Commit
9058ca7
·
verified ·
1 Parent(s): b1e4512

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. as_fastpitch_best_model.pth +3 -0
  2. as_fastpitch_config.json +215 -0
  3. as_fastpitch_speakers.pth +3 -0
  4. as_hifigan_best_model.pth +3 -0
  5. as_hifigan_config.json +189 -0
  6. bn_fastpitch_best_model.pth +3 -0
  7. bn_fastpitch_config.json +215 -0
  8. bn_fastpitch_speakers.pth +3 -0
  9. bn_hifigan_best_model.pth +3 -0
  10. bn_hifigan_config.json +189 -0
  11. brx_fastpitch_best_model.pth +3 -0
  12. brx_fastpitch_config.json +215 -0
  13. brx_fastpitch_speakers.pth +3 -0
  14. brx_hifigan_best_model.pth +3 -0
  15. brx_hifigan_config.json +189 -0
  16. en+hi_fastpitch_best_model.pth +3 -0
  17. en+hi_fastpitch_config.json +212 -0
  18. en+hi_fastpitch_speakers.pth +3 -0
  19. en+hi_hifigan_best_model.pth +3 -0
  20. en+hi_hifigan_config.json +189 -0
  21. en_fastpitch_best_model.pth +3 -0
  22. en_fastpitch_config.json +210 -0
  23. en_fastpitch_speakers.pth +3 -0
  24. en_hifigan_best_model.pth +3 -0
  25. en_hifigan_config.json +189 -0
  26. gu_fastpitch_best_model.pth +3 -0
  27. gu_fastpitch_config.json +215 -0
  28. gu_fastpitch_speakers.pth +3 -0
  29. gu_hifigan_best_model.pth +3 -0
  30. gu_hifigan_config.json +189 -0
  31. hi_fastpitch_best_model.pth +3 -0
  32. hi_fastpitch_config.json +215 -0
  33. hi_fastpitch_speakers.pth +3 -0
  34. hi_hifigan_best_model.pth +3 -0
  35. hi_hifigan_config.json +189 -0
  36. kn_fastpitch_best_model.pth +3 -0
  37. kn_fastpitch_config.json +210 -0
  38. kn_fastpitch_speakers.pth +3 -0
  39. kn_hifigan_best_model.pth +3 -0
  40. kn_hifigan_config.json +189 -0
  41. ml_fastpitch_best_model.pth +3 -0
  42. ml_fastpitch_config.json +210 -0
  43. ml_fastpitch_speakers.pth +3 -0
  44. ml_hifigan_best_model.pth +3 -0
  45. ml_hifigan_config.json +189 -0
  46. mni_fastpitch_best_model.pth +3 -0
  47. mni_fastpitch_config.json +215 -0
  48. mni_fastpitch_speakers.pth +3 -0
  49. mni_hifigan_best_model.pth +3 -0
  50. mni_hifigan_config.json +189 -0
as_fastpitch_best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a69934f58bc7c5671f48b62f8d92108af66c008abadbfa7a1bf0d1962c252f7c
3
+ size 637368985
as_fastpitch_config.json ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "output_indic_fastpitch/as",
3
+ "logger_uri": null,
4
+ "run_name": "as_fastpitch_indictts_all_align_off",
5
+ "project_name": "indic-fastpitch-stage2",
6
+ "run_description": "align_off",
7
+ "print_step": 100,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": "indic-asr",
11
+ "dashboard_logger": "wandb",
12
+ "log_model_step": 10000,
13
+ "save_step": 10000,
14
+ "save_n_checkpoints": 1,
15
+ "save_checkpoints": true,
16
+ "save_all_best": false,
17
+ "save_best_after": 10000,
18
+ "target_loss": null,
19
+ "print_eval": false,
20
+ "test_delay_epochs": 0,
21
+ "run_eval": true,
22
+ "distributed_backend": "nccl",
23
+ "distributed_url": "tcp://localhost:54321",
24
+ "mixed_precision": true,
25
+ "epochs": 1000,
26
+ "batch_size": 32,
27
+ "eval_batch_size": 32,
28
+ "grad_clip": 5.0,
29
+ "scheduler_after_epoch": true,
30
+ "lr": 0.0001,
31
+ "optimizer": "Adam",
32
+ "optimizer_params": {
33
+ "betas": [
34
+ 0.9,
35
+ 0.998
36
+ ],
37
+ "weight_decay": 1e-06
38
+ },
39
+ "lr_scheduler": "NoamLR",
40
+ "lr_scheduler_params": {
41
+ "warmup_steps": 4000
42
+ },
43
+ "lr_scheduler_aligner": "NoamLR",
44
+ "lr_scheduler_aligner_params": {
45
+ "warmup_steps": 4000
46
+ },
47
+ "use_grad_scaler": false,
48
+ "cudnn_enable": true,
49
+ "cudnn_deterministic": false,
50
+ "cudnn_benchmark": false,
51
+ "training_seed": 54321,
52
+ "model": "fast_pitch",
53
+ "num_loader_workers": 0,
54
+ "num_eval_loader_workers": 0,
55
+ "use_noise_augment": false,
56
+ "audio": {
57
+ "fft_size": 1024,
58
+ "win_length": 1024,
59
+ "hop_length": 256,
60
+ "frame_shift_ms": null,
61
+ "frame_length_ms": null,
62
+ "stft_pad_mode": "reflect",
63
+ "sample_rate": 22050,
64
+ "resample": false,
65
+ "preemphasis": 0.0,
66
+ "ref_level_db": 20,
67
+ "do_sound_norm": false,
68
+ "log_func": "np.log",
69
+ "do_trim_silence": true,
70
+ "trim_db": 60.0,
71
+ "do_rms_norm": false,
72
+ "db_level": null,
73
+ "power": 1.5,
74
+ "griffin_lim_iters": 60,
75
+ "num_mels": 80,
76
+ "mel_fmin": 0.0,
77
+ "mel_fmax": 8000,
78
+ "spec_gain": 1.0,
79
+ "do_amp_to_db_linear": true,
80
+ "do_amp_to_db_mel": true,
81
+ "pitch_fmax": 640.0,
82
+ "pitch_fmin": 0.0,
83
+ "signal_norm": false,
84
+ "min_level_db": -100,
85
+ "symmetric_norm": true,
86
+ "max_norm": 4.0,
87
+ "clip_norm": true,
88
+ "stats_path": null
89
+ },
90
+ "use_phonemes": false,
91
+ "phonemizer": null,
92
+ "phoneme_language": "en-us",
93
+ "compute_input_seq_cache": false,
94
+ "text_cleaner": "multilingual_cleaners",
95
+ "enable_eos_bos_chars": false,
96
+ "test_sentences_file": "",
97
+ "phoneme_cache_path": "output_indic_fastpitch/as/phoneme_cache",
98
+ "characters": {
99
+ "characters_class": "TTS.tts.models.vits.VitsCharacters",
100
+ "vocab_dict": null,
101
+ "pad": "<PAD>",
102
+ "eos": "<EOS>",
103
+ "bos": "<BOS>",
104
+ "blank": "<BLNK>",
105
+ "characters": " ',:;\u02bc\u0981\u0982\u0983\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9\u09bc\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd\u09ce\u09dc\u09dd\u09df\u09f0\u09f1\u200c\u200d",
106
+ "punctuations": "!\u00a1'(),-.:;\u00bf? ",
107
+ "phonemes": null,
108
+ "is_unique": true,
109
+ "is_sorted": true
110
+ },
111
+ "add_blank": false,
112
+ "batch_group_size": 0,
113
+ "loss_masking": null,
114
+ "sort_by_audio_len": true,
115
+ "min_audio_len": 1,
116
+ "max_audio_len": 441000,
117
+ "min_text_len": 1,
118
+ "max_text_len": 400,
119
+ "compute_f0": true,
120
+ "compute_linear_spec": false,
121
+ "precompute_num_workers": 0,
122
+ "start_by_longest": false,
123
+ "datasets": [
124
+ {
125
+ "name": "indictts",
126
+ "path": "/home/ttsteam/datasets/indictts/as",
127
+ "meta_file_train": "metadata_train.csv",
128
+ "ignored_speakers": null,
129
+ "language": "as",
130
+ "meta_file_val": "metadata_test.csv",
131
+ "meta_file_attn_mask": ""
132
+ }
133
+ ],
134
+ "test_sentences": [
135
+ "\u09a6\u09c7\u0989\u09a4\u09be\u0987 \u0989\u0987\u09b2\u09a4 \u09b8\u09cd\u09aa\u09b7\u09cd\u099f\u0995\u09c8 \u09b8\u09c7\u0987\u0996\u09bf\u09a8\u09bf \u09ae\u09cb\u09f0 \u09a8\u09be\u09ae\u09a4 \u09b2\u09bf\u0996\u09bf \u09a6\u09bf \u0997\u09c8\u099b\u09c7",
136
+ "\u0997\u09a4\u09bf\u0995\u09c7 \u09b6\u09bf\u0995\u09cd\u09b7\u09be\u09f0 \u09ac\u09be\u09ac\u09c7\u0993 \u098f\u09a8\u09c7 \u098f\u0995 \u09aa\u09c2\u09f0\u09cd\u09ac \u09aa\u09cd\u09f0\u09b8\u09cd\u09a4\u09c1\u09a4 \u09aa\u09f0\u09bf\u200c\u09f1\u09c7\u09b6 \u098f\u099f\u09be\u09a4"
137
+ ],
138
+ "eval_split_max_size": null,
139
+ "eval_split_size": 0.01,
140
+ "use_speaker_weighted_sampler": false,
141
+ "speaker_weighted_sampler_alpha": 1.0,
142
+ "use_language_weighted_sampler": false,
143
+ "language_weighted_sampler_alpha": 1.0,
144
+ "use_length_weighted_sampler": false,
145
+ "length_weighted_sampler_alpha": 1.0,
146
+ "base_model": "forward_tts",
147
+ "model_args": {
148
+ "num_chars": 87,
149
+ "out_channels": 80,
150
+ "hidden_channels": 512,
151
+ "use_aligner": true,
152
+ "use_pitch": true,
153
+ "pitch_predictor_hidden_channels": 256,
154
+ "pitch_predictor_kernel_size": 3,
155
+ "pitch_predictor_dropout_p": 0.1,
156
+ "pitch_embedding_kernel_size": 3,
157
+ "duration_predictor_hidden_channels": 256,
158
+ "duration_predictor_kernel_size": 3,
159
+ "duration_predictor_dropout_p": 0.1,
160
+ "positional_encoding": true,
161
+ "poisitonal_encoding_use_scale": true,
162
+ "length_scale": 1,
163
+ "encoder_type": "fftransformer",
164
+ "encoder_params": {
165
+ "hidden_channels_ffn": 1024,
166
+ "num_heads": 1,
167
+ "num_layers": 6,
168
+ "dropout_p": 0.1
169
+ },
170
+ "decoder_type": "fftransformer",
171
+ "decoder_params": {
172
+ "hidden_channels_ffn": 1024,
173
+ "num_heads": 1,
174
+ "num_layers": 6,
175
+ "dropout_p": 0.1
176
+ },
177
+ "detach_duration_predictor": false,
178
+ "max_duration": 75,
179
+ "num_speakers": 2,
180
+ "use_speaker_embedding": true,
181
+ "speakers_file": "models/v1/as/fastpitch/speakers.pth",
182
+ "use_d_vector_file": false,
183
+ "d_vector_dim": 512,
184
+ "d_vector_file": null,
185
+ "use_speaker_encoder_as_loss": false,
186
+ "speaker_encoder_config_path": "",
187
+ "speaker_encoder_model_path": "",
188
+ "vocoder_path": null,
189
+ "vocoder_config_path": null,
190
+ "use_separate_optimizers": false
191
+ },
192
+ "return_wav": false,
193
+ "num_speakers": 2,
194
+ "speakers_file": "models/v1/as/fastpitch/speakers.pth",
195
+ "use_speaker_embedding": true,
196
+ "use_d_vector_file": false,
197
+ "d_vector_file": "",
198
+ "d_vector_dim": 512,
199
+ "spec_loss_type": "mse",
200
+ "duration_loss_type": "mse",
201
+ "use_ssim_loss": false,
202
+ "ssim_loss_alpha": 1.0,
203
+ "spec_loss_alpha": 1.0,
204
+ "aligner_loss_alpha": 1.0,
205
+ "pitch_loss_alpha": 0.1,
206
+ "dur_loss_alpha": 0.1,
207
+ "binary_align_loss_alpha": 0.1,
208
+ "spk_encoder_loss_alpha": 0.1,
209
+ "binary_loss_warmup_epochs": 150,
210
+ "aligner_epochs": 0,
211
+ "min_seq_len": 13,
212
+ "max_seq_len": 500000,
213
+ "r": 1,
214
+ "f0_cache_path": "output_indic_fastpitch/as/f0_cache"
215
+ }
as_fastpitch_speakers.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f665e358b34b232fb27f7c8cd3968fcd47784a7be065ae127f611c33ee809bea
3
+ size 431
as_hifigan_best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:891964d628c865a9a250b6dee326bbf0a1fdb4b66a2d23f513b1bb1d0e465e0e
3
+ size 1016384316
as_hifigan_config.json ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "indic_vocoders",
3
+ "logger_uri": null,
4
+ "run_name": "as_hifigan_all",
5
+ "project_name": "indic-vocoders",
6
+ "run_description": "None",
7
+ "print_step": 100,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": "indic-asr",
11
+ "dashboard_logger": "wandb",
12
+ "log_model_step": null,
13
+ "save_step": 10000,
14
+ "save_n_checkpoints": 1,
15
+ "save_checkpoints": true,
16
+ "save_all_best": false,
17
+ "save_best_after": 10000,
18
+ "target_loss": "loss_1",
19
+ "print_eval": false,
20
+ "test_delay_epochs": 0,
21
+ "run_eval": true,
22
+ "distributed_backend": "nccl",
23
+ "distributed_url": "tcp://localhost:10004",
24
+ "mixed_precision": true,
25
+ "epochs": 5000,
26
+ "batch_size": 32,
27
+ "eval_batch_size": 32,
28
+ "grad_clip": [
29
+ 5,
30
+ 5
31
+ ],
32
+ "scheduler_after_epoch": true,
33
+ "lr": 0.0001,
34
+ "optimizer": "AdamW",
35
+ "optimizer_params": {
36
+ "betas": [
37
+ 0.8,
38
+ 0.99
39
+ ],
40
+ "weight_decay": 0.0
41
+ },
42
+ "lr_scheduler": null,
43
+ "lr_scheduler_params": null,
44
+ "use_grad_scaler": false,
45
+ "cudnn_enable": true,
46
+ "cudnn_deterministic": false,
47
+ "cudnn_benchmark": false,
48
+ "training_seed": 54321,
49
+ "model": "hifigan",
50
+ "num_loader_workers": 8,
51
+ "num_eval_loader_workers": 8,
52
+ "use_noise_augment": true,
53
+ "audio": {
54
+ "fft_size": 1024,
55
+ "win_length": 1024,
56
+ "hop_length": 256,
57
+ "frame_shift_ms": null,
58
+ "frame_length_ms": null,
59
+ "stft_pad_mode": "reflect",
60
+ "sample_rate": 22050,
61
+ "resample": false,
62
+ "preemphasis": 0.0,
63
+ "ref_level_db": 20,
64
+ "do_sound_norm": false,
65
+ "log_func": "np.log",
66
+ "do_trim_silence": true,
67
+ "trim_db": 60.0,
68
+ "do_rms_norm": false,
69
+ "db_level": null,
70
+ "power": 1.5,
71
+ "griffin_lim_iters": 60,
72
+ "num_mels": 80,
73
+ "mel_fmin": 0.0,
74
+ "mel_fmax": 8000,
75
+ "spec_gain": 1.0,
76
+ "do_amp_to_db_linear": true,
77
+ "do_amp_to_db_mel": true,
78
+ "pitch_fmax": 640.0,
79
+ "pitch_fmin": 0.0,
80
+ "signal_norm": false,
81
+ "min_level_db": -100,
82
+ "symmetric_norm": true,
83
+ "max_norm": 4.0,
84
+ "clip_norm": true,
85
+ "stats_path": null
86
+ },
87
+ "eval_split_size": 10,
88
+ "data_path": "../../datasets/indictts/as",
89
+ "feature_path": null,
90
+ "seq_len": 8192,
91
+ "pad_short": 2000,
92
+ "conv_pad": 0,
93
+ "use_cache": false,
94
+ "wd": 1e-06,
95
+ "use_stft_loss": false,
96
+ "use_subband_stft_loss": false,
97
+ "use_mse_gan_loss": true,
98
+ "use_hinge_gan_loss": false,
99
+ "use_feat_match_loss": true,
100
+ "use_l1_spec_loss": true,
101
+ "stft_loss_weight": 0,
102
+ "subband_stft_loss_weight": 0,
103
+ "mse_G_loss_weight": 1,
104
+ "hinge_G_loss_weight": 0,
105
+ "feat_match_loss_weight": 108,
106
+ "l1_spec_loss_weight": 45,
107
+ "stft_loss_params": {
108
+ "n_ffts": [
109
+ 1024,
110
+ 2048,
111
+ 512
112
+ ],
113
+ "hop_lengths": [
114
+ 120,
115
+ 240,
116
+ 50
117
+ ],
118
+ "win_lengths": [
119
+ 600,
120
+ 1200,
121
+ 240
122
+ ]
123
+ },
124
+ "l1_spec_loss_params": {
125
+ "use_mel": true,
126
+ "sample_rate": 22050,
127
+ "n_fft": 1024,
128
+ "hop_length": 256,
129
+ "win_length": 1024,
130
+ "n_mels": 80,
131
+ "mel_fmin": 0.0,
132
+ "mel_fmax": null
133
+ },
134
+ "lr_gen": 0.0001,
135
+ "lr_disc": 0.0001,
136
+ "lr_scheduler_gen": "ExponentialLR",
137
+ "lr_scheduler_gen_params": {
138
+ "gamma": 0.999,
139
+ "last_epoch": -1
140
+ },
141
+ "lr_scheduler_disc": "ExponentialLR",
142
+ "lr_scheduler_disc_params": {
143
+ "gamma": 0.999,
144
+ "last_epoch": -1
145
+ },
146
+ "use_pqmf": false,
147
+ "diff_samples_for_G_and_D": false,
148
+ "discriminator_model": "hifigan_discriminator",
149
+ "generator_model": "hifigan_generator",
150
+ "generator_model_params": {
151
+ "upsample_factors": [
152
+ 8,
153
+ 8,
154
+ 2,
155
+ 2
156
+ ],
157
+ "upsample_kernel_sizes": [
158
+ 16,
159
+ 16,
160
+ 4,
161
+ 4
162
+ ],
163
+ "upsample_initial_channel": 512,
164
+ "resblock_kernel_sizes": [
165
+ 3,
166
+ 7,
167
+ 11
168
+ ],
169
+ "resblock_dilation_sizes": [
170
+ [
171
+ 1,
172
+ 3,
173
+ 5
174
+ ],
175
+ [
176
+ 1,
177
+ 3,
178
+ 5
179
+ ],
180
+ [
181
+ 1,
182
+ 3,
183
+ 5
184
+ ]
185
+ ],
186
+ "resblock_type": "1"
187
+ },
188
+ "github_branch": "* main"
189
+ }
bn_fastpitch_best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e57cb04e7500edc68e040d0f407bf0a93a23a7d5a9011a49adc7a17273a936d
3
+ size 637449049
bn_fastpitch_config.json ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "output_indic_fastpitch/bn",
3
+ "logger_uri": null,
4
+ "run_name": "bn_fastpitch_indictts_all_align_off",
5
+ "project_name": "indic-fastpitch-stage2",
6
+ "run_description": "align_off",
7
+ "print_step": 100,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": "indic-asr",
11
+ "dashboard_logger": "wandb",
12
+ "log_model_step": 10000,
13
+ "save_step": 10000,
14
+ "save_n_checkpoints": 1,
15
+ "save_checkpoints": true,
16
+ "save_all_best": false,
17
+ "save_best_after": 10000,
18
+ "target_loss": null,
19
+ "print_eval": false,
20
+ "test_delay_epochs": 0,
21
+ "run_eval": true,
22
+ "distributed_backend": "nccl",
23
+ "distributed_url": "tcp://localhost:54321",
24
+ "mixed_precision": true,
25
+ "epochs": 1000,
26
+ "batch_size": 32,
27
+ "eval_batch_size": 32,
28
+ "grad_clip": 5.0,
29
+ "scheduler_after_epoch": true,
30
+ "lr": 0.0001,
31
+ "optimizer": "Adam",
32
+ "optimizer_params": {
33
+ "betas": [
34
+ 0.9,
35
+ 0.998
36
+ ],
37
+ "weight_decay": 1e-06
38
+ },
39
+ "lr_scheduler": "NoamLR",
40
+ "lr_scheduler_params": {
41
+ "warmup_steps": 4000
42
+ },
43
+ "lr_scheduler_aligner": "NoamLR",
44
+ "lr_scheduler_aligner_params": {
45
+ "warmup_steps": 4000
46
+ },
47
+ "use_grad_scaler": false,
48
+ "cudnn_enable": true,
49
+ "cudnn_deterministic": false,
50
+ "cudnn_benchmark": false,
51
+ "training_seed": 54321,
52
+ "model": "fast_pitch",
53
+ "num_loader_workers": 0,
54
+ "num_eval_loader_workers": 0,
55
+ "use_noise_augment": false,
56
+ "audio": {
57
+ "fft_size": 1024,
58
+ "win_length": 1024,
59
+ "hop_length": 256,
60
+ "frame_shift_ms": null,
61
+ "frame_length_ms": null,
62
+ "stft_pad_mode": "reflect",
63
+ "sample_rate": 22050,
64
+ "resample": false,
65
+ "preemphasis": 0.0,
66
+ "ref_level_db": 20,
67
+ "do_sound_norm": false,
68
+ "log_func": "np.log",
69
+ "do_trim_silence": true,
70
+ "trim_db": 60.0,
71
+ "do_rms_norm": false,
72
+ "db_level": null,
73
+ "power": 1.5,
74
+ "griffin_lim_iters": 60,
75
+ "num_mels": 80,
76
+ "mel_fmin": 0.0,
77
+ "mel_fmax": 8000,
78
+ "spec_gain": 1.0,
79
+ "do_amp_to_db_linear": true,
80
+ "do_amp_to_db_mel": true,
81
+ "pitch_fmax": 640.0,
82
+ "pitch_fmin": 0.0,
83
+ "signal_norm": false,
84
+ "min_level_db": -100,
85
+ "symmetric_norm": true,
86
+ "max_norm": 4.0,
87
+ "clip_norm": true,
88
+ "stats_path": null
89
+ },
90
+ "use_phonemes": false,
91
+ "phonemizer": null,
92
+ "phoneme_language": "en-us",
93
+ "compute_input_seq_cache": false,
94
+ "text_cleaner": "multilingual_cleaners",
95
+ "enable_eos_bos_chars": false,
96
+ "test_sentences_file": "",
97
+ "phoneme_cache_path": "output_indic_fastpitch/bn/phoneme_cache",
98
+ "characters": {
99
+ "characters_class": "TTS.tts.models.vits.VitsCharacters",
100
+ "vocab_dict": null,
101
+ "pad": "<PAD>",
102
+ "eos": "<EOS>",
103
+ "bos": "<BOS>",
104
+ "blank": "<BLNK>",
105
+ "characters": " !\",-.?\u0964\u0981\u0982\u0983\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9\u09bc\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd\u09ce\u09d7\u09dc\u09dd\u09df\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ee\u200c\u200d\u2014\u2018\u2019",
106
+ "punctuations": "!\u00a1'(),-.:;\u00bf? ",
107
+ "phonemes": null,
108
+ "is_unique": true,
109
+ "is_sorted": true
110
+ },
111
+ "add_blank": false,
112
+ "batch_group_size": 0,
113
+ "loss_masking": null,
114
+ "sort_by_audio_len": true,
115
+ "min_audio_len": 1,
116
+ "max_audio_len": 441000,
117
+ "min_text_len": 1,
118
+ "max_text_len": 400,
119
+ "compute_f0": true,
120
+ "compute_linear_spec": false,
121
+ "precompute_num_workers": 0,
122
+ "start_by_longest": false,
123
+ "datasets": [
124
+ {
125
+ "name": "indictts",
126
+ "path": "/home/ttsteam/datasets/indictts/bn",
127
+ "meta_file_train": "metadata_train.csv",
128
+ "ignored_speakers": null,
129
+ "language": "bn",
130
+ "meta_file_val": "metadata_test.csv",
131
+ "meta_file_attn_mask": ""
132
+ }
133
+ ],
134
+ "test_sentences": [
135
+ "\u09b2\u09cb\u09a1\u09b6\u09c7\u09a1\u09bf\u0982\u09df\u09c7\u09b0 \u0995\u09b2\u09cd\u09af\u09be\u09a3\u09c7 \u09aa\u09c1\u099c\u09cb\u09b0 \u09a6\u09c1\u09b8\u09aa\u09cd\u09a4\u09be\u09b9 \u0986\u0997\u09c7 \u0995\u09c7\u09a8\u09be\u0995\u09be\u099f\u09be\u09b0 \u09ae\u09be\u09b9\u09c7\u09a8\u09cd\u09a6\u09cd\u09b0\u0995\u09cd\u09b7\u09a3\u09c7, \u09a6\u09cb\u0995\u09be\u09a8\u09c7 \u09b6\u09cb\u09ad\u09be \u09aa\u09be\u099a\u09cd\u099b\u09c7, \u09ae\u09cb\u09ae\u09ac\u09be\u09a4\u09bf",
136
+ "\u098f\u0995 \u099a\u09a8\u09cd\u09a6\u09b0\u09be \u09a8\u09bf\u09b0\u09cd\u09a6\u09cb\u09b7 \u09b9\u0987\u09df\u09be\u0993, \u0986\u0987\u09a8\u09c7\u09b0 \u0986\u09aa\u09be\u09a4 \u09a8\u09bf\u09b6\u09cd\u099b\u09bf\u09a6\u09cd\u09b0 \u099c\u09be\u09b2\u09c7 \u09aa\u09dc\u09bf\u09df\u09be \u09aa\u09cd\u09b0\u09be\u09a3 \u09a6\u09bf\u09df\u09be\u099b\u09bf\u09b2"
137
+ ],
138
+ "eval_split_max_size": null,
139
+ "eval_split_size": 0.01,
140
+ "use_speaker_weighted_sampler": false,
141
+ "speaker_weighted_sampler_alpha": 1.0,
142
+ "use_language_weighted_sampler": false,
143
+ "language_weighted_sampler_alpha": 1.0,
144
+ "use_length_weighted_sampler": false,
145
+ "length_weighted_sampler_alpha": 1.0,
146
+ "base_model": "forward_tts",
147
+ "model_args": {
148
+ "num_chars": 99,
149
+ "out_channels": 80,
150
+ "hidden_channels": 512,
151
+ "use_aligner": true,
152
+ "use_pitch": true,
153
+ "pitch_predictor_hidden_channels": 256,
154
+ "pitch_predictor_kernel_size": 3,
155
+ "pitch_predictor_dropout_p": 0.1,
156
+ "pitch_embedding_kernel_size": 3,
157
+ "duration_predictor_hidden_channels": 256,
158
+ "duration_predictor_kernel_size": 3,
159
+ "duration_predictor_dropout_p": 0.1,
160
+ "positional_encoding": true,
161
+ "poisitonal_encoding_use_scale": true,
162
+ "length_scale": 1,
163
+ "encoder_type": "fftransformer",
164
+ "encoder_params": {
165
+ "hidden_channels_ffn": 1024,
166
+ "num_heads": 1,
167
+ "num_layers": 6,
168
+ "dropout_p": 0.1
169
+ },
170
+ "decoder_type": "fftransformer",
171
+ "decoder_params": {
172
+ "hidden_channels_ffn": 1024,
173
+ "num_heads": 1,
174
+ "num_layers": 6,
175
+ "dropout_p": 0.1
176
+ },
177
+ "detach_duration_predictor": false,
178
+ "max_duration": 75,
179
+ "num_speakers": 3,
180
+ "use_speaker_embedding": true,
181
+ "speakers_file": "models/v1/bn/fastpitch/speakers.pth",
182
+ "use_d_vector_file": false,
183
+ "d_vector_dim": 512,
184
+ "d_vector_file": null,
185
+ "use_speaker_encoder_as_loss": false,
186
+ "speaker_encoder_config_path": "",
187
+ "speaker_encoder_model_path": "",
188
+ "vocoder_path": null,
189
+ "vocoder_config_path": null,
190
+ "use_separate_optimizers": false
191
+ },
192
+ "return_wav": false,
193
+ "num_speakers": 3,
194
+ "speakers_file": "models/v1/bn/fastpitch/speakers.pth",
195
+ "use_speaker_embedding": true,
196
+ "use_d_vector_file": false,
197
+ "d_vector_file": "",
198
+ "d_vector_dim": 512,
199
+ "spec_loss_type": "mse",
200
+ "duration_loss_type": "mse",
201
+ "use_ssim_loss": false,
202
+ "ssim_loss_alpha": 1.0,
203
+ "spec_loss_alpha": 1.0,
204
+ "aligner_loss_alpha": 1.0,
205
+ "pitch_loss_alpha": 0.1,
206
+ "dur_loss_alpha": 0.1,
207
+ "binary_align_loss_alpha": 0.1,
208
+ "spk_encoder_loss_alpha": 0.1,
209
+ "binary_loss_warmup_epochs": 150,
210
+ "aligner_epochs": 0,
211
+ "min_seq_len": 13,
212
+ "max_seq_len": 500000,
213
+ "r": 1,
214
+ "f0_cache_path": "output_indic_fastpitch/bn/f0_cache"
215
+ }
bn_fastpitch_speakers.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f10ffc1e9e515dcdaff2a8076d54df717bc56b70fd63546f0bcbe5b09babac1c
3
+ size 431
bn_hifigan_best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fbf1016b24f30f84fe4917d20ce2b14af4972b42c70f6e9f6db108cce53c37f
3
+ size 1016383548
bn_hifigan_config.json ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "indic_vocoders",
3
+ "logger_uri": null,
4
+ "run_name": "bn_hifigan_all",
5
+ "project_name": "indic-vocoders",
6
+ "run_description": "None",
7
+ "print_step": 100,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": "indic-asr",
11
+ "dashboard_logger": "wandb",
12
+ "log_model_step": null,
13
+ "save_step": 10000,
14
+ "save_n_checkpoints": 1,
15
+ "save_checkpoints": true,
16
+ "save_all_best": false,
17
+ "save_best_after": 10000,
18
+ "target_loss": "loss_1",
19
+ "print_eval": false,
20
+ "test_delay_epochs": 0,
21
+ "run_eval": true,
22
+ "distributed_backend": "nccl",
23
+ "distributed_url": "tcp://localhost:10005",
24
+ "mixed_precision": true,
25
+ "epochs": 5000,
26
+ "batch_size": 32,
27
+ "eval_batch_size": 32,
28
+ "grad_clip": [
29
+ 5,
30
+ 5
31
+ ],
32
+ "scheduler_after_epoch": true,
33
+ "lr": 0.0001,
34
+ "optimizer": "AdamW",
35
+ "optimizer_params": {
36
+ "betas": [
37
+ 0.8,
38
+ 0.99
39
+ ],
40
+ "weight_decay": 0.0
41
+ },
42
+ "lr_scheduler": null,
43
+ "lr_scheduler_params": null,
44
+ "use_grad_scaler": false,
45
+ "cudnn_enable": true,
46
+ "cudnn_deterministic": false,
47
+ "cudnn_benchmark": false,
48
+ "training_seed": 54321,
49
+ "model": "hifigan",
50
+ "num_loader_workers": 8,
51
+ "num_eval_loader_workers": 8,
52
+ "use_noise_augment": true,
53
+ "audio": {
54
+ "fft_size": 1024,
55
+ "win_length": 1024,
56
+ "hop_length": 256,
57
+ "frame_shift_ms": null,
58
+ "frame_length_ms": null,
59
+ "stft_pad_mode": "reflect",
60
+ "sample_rate": 22050,
61
+ "resample": false,
62
+ "preemphasis": 0.0,
63
+ "ref_level_db": 20,
64
+ "do_sound_norm": false,
65
+ "log_func": "np.log",
66
+ "do_trim_silence": true,
67
+ "trim_db": 60.0,
68
+ "do_rms_norm": false,
69
+ "db_level": null,
70
+ "power": 1.5,
71
+ "griffin_lim_iters": 60,
72
+ "num_mels": 80,
73
+ "mel_fmin": 0.0,
74
+ "mel_fmax": 8000,
75
+ "spec_gain": 1.0,
76
+ "do_amp_to_db_linear": true,
77
+ "do_amp_to_db_mel": true,
78
+ "pitch_fmax": 640.0,
79
+ "pitch_fmin": 0.0,
80
+ "signal_norm": false,
81
+ "min_level_db": -100,
82
+ "symmetric_norm": true,
83
+ "max_norm": 4.0,
84
+ "clip_norm": true,
85
+ "stats_path": null
86
+ },
87
+ "eval_split_size": 10,
88
+ "data_path": "../../datasets/indictts/bn",
89
+ "feature_path": null,
90
+ "seq_len": 8192,
91
+ "pad_short": 2000,
92
+ "conv_pad": 0,
93
+ "use_cache": false,
94
+ "wd": 1e-06,
95
+ "use_stft_loss": false,
96
+ "use_subband_stft_loss": false,
97
+ "use_mse_gan_loss": true,
98
+ "use_hinge_gan_loss": false,
99
+ "use_feat_match_loss": true,
100
+ "use_l1_spec_loss": true,
101
+ "stft_loss_weight": 0,
102
+ "subband_stft_loss_weight": 0,
103
+ "mse_G_loss_weight": 1,
104
+ "hinge_G_loss_weight": 0,
105
+ "feat_match_loss_weight": 108,
106
+ "l1_spec_loss_weight": 45,
107
+ "stft_loss_params": {
108
+ "n_ffts": [
109
+ 1024,
110
+ 2048,
111
+ 512
112
+ ],
113
+ "hop_lengths": [
114
+ 120,
115
+ 240,
116
+ 50
117
+ ],
118
+ "win_lengths": [
119
+ 600,
120
+ 1200,
121
+ 240
122
+ ]
123
+ },
124
+ "l1_spec_loss_params": {
125
+ "use_mel": true,
126
+ "sample_rate": 22050,
127
+ "n_fft": 1024,
128
+ "hop_length": 256,
129
+ "win_length": 1024,
130
+ "n_mels": 80,
131
+ "mel_fmin": 0.0,
132
+ "mel_fmax": null
133
+ },
134
+ "lr_gen": 0.0001,
135
+ "lr_disc": 0.0001,
136
+ "lr_scheduler_gen": "ExponentialLR",
137
+ "lr_scheduler_gen_params": {
138
+ "gamma": 0.999,
139
+ "last_epoch": -1
140
+ },
141
+ "lr_scheduler_disc": "ExponentialLR",
142
+ "lr_scheduler_disc_params": {
143
+ "gamma": 0.999,
144
+ "last_epoch": -1
145
+ },
146
+ "use_pqmf": false,
147
+ "diff_samples_for_G_and_D": false,
148
+ "discriminator_model": "hifigan_discriminator",
149
+ "generator_model": "hifigan_generator",
150
+ "generator_model_params": {
151
+ "upsample_factors": [
152
+ 8,
153
+ 8,
154
+ 2,
155
+ 2
156
+ ],
157
+ "upsample_kernel_sizes": [
158
+ 16,
159
+ 16,
160
+ 4,
161
+ 4
162
+ ],
163
+ "upsample_initial_channel": 512,
164
+ "resblock_kernel_sizes": [
165
+ 3,
166
+ 7,
167
+ 11
168
+ ],
169
+ "resblock_dilation_sizes": [
170
+ [
171
+ 1,
172
+ 3,
173
+ 5
174
+ ],
175
+ [
176
+ 1,
177
+ 3,
178
+ 5
179
+ ],
180
+ [
181
+ 1,
182
+ 3,
183
+ 5
184
+ ]
185
+ ],
186
+ "resblock_type": "1"
187
+ },
188
+ "github_branch": "* main"
189
+ }
brx_fastpitch_best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce2af698a57b3b1ac33e4903efe8e0a65ffd21deb183b8e26bb04c3ba50dbebb
3
+ size 637436697
brx_fastpitch_config.json ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "output_indic_fastpitch/brx",
3
+ "logger_uri": null,
4
+ "run_name": "brx_fastpitch_indictts_all_align_off",
5
+ "project_name": "indic-fastpitch-stage2",
6
+ "run_description": "align_off",
7
+ "print_step": 100,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": "indic-asr",
11
+ "dashboard_logger": "wandb",
12
+ "log_model_step": 10000,
13
+ "save_step": 10000,
14
+ "save_n_checkpoints": 1,
15
+ "save_checkpoints": true,
16
+ "save_all_best": false,
17
+ "save_best_after": 10000,
18
+ "target_loss": null,
19
+ "print_eval": false,
20
+ "test_delay_epochs": 0,
21
+ "run_eval": true,
22
+ "distributed_backend": "nccl",
23
+ "distributed_url": "tcp://localhost:54321",
24
+ "mixed_precision": true,
25
+ "epochs": 1000,
26
+ "batch_size": 32,
27
+ "eval_batch_size": 32,
28
+ "grad_clip": 5.0,
29
+ "scheduler_after_epoch": true,
30
+ "lr": 0.0001,
31
+ "optimizer": "Adam",
32
+ "optimizer_params": {
33
+ "betas": [
34
+ 0.9,
35
+ 0.998
36
+ ],
37
+ "weight_decay": 1e-06
38
+ },
39
+ "lr_scheduler": "NoamLR",
40
+ "lr_scheduler_params": {
41
+ "warmup_steps": 4000
42
+ },
43
+ "lr_scheduler_aligner": "NoamLR",
44
+ "lr_scheduler_aligner_params": {
45
+ "warmup_steps": 4000
46
+ },
47
+ "use_grad_scaler": false,
48
+ "cudnn_enable": true,
49
+ "cudnn_deterministic": false,
50
+ "cudnn_benchmark": false,
51
+ "training_seed": 54321,
52
+ "model": "fast_pitch",
53
+ "num_loader_workers": 0,
54
+ "num_eval_loader_workers": 0,
55
+ "use_noise_augment": false,
56
+ "audio": {
57
+ "fft_size": 1024,
58
+ "win_length": 1024,
59
+ "hop_length": 256,
60
+ "frame_shift_ms": null,
61
+ "frame_length_ms": null,
62
+ "stft_pad_mode": "reflect",
63
+ "sample_rate": 22050,
64
+ "resample": false,
65
+ "preemphasis": 0.0,
66
+ "ref_level_db": 20,
67
+ "do_sound_norm": false,
68
+ "log_func": "np.log",
69
+ "do_trim_silence": true,
70
+ "trim_db": 60.0,
71
+ "do_rms_norm": false,
72
+ "db_level": null,
73
+ "power": 1.5,
74
+ "griffin_lim_iters": 60,
75
+ "num_mels": 80,
76
+ "mel_fmin": 0.0,
77
+ "mel_fmax": 8000,
78
+ "spec_gain": 1.0,
79
+ "do_amp_to_db_linear": true,
80
+ "do_amp_to_db_mel": true,
81
+ "pitch_fmax": 640.0,
82
+ "pitch_fmin": 0.0,
83
+ "signal_norm": false,
84
+ "min_level_db": -100,
85
+ "symmetric_norm": true,
86
+ "max_norm": 4.0,
87
+ "clip_norm": true,
88
+ "stats_path": null
89
+ },
90
+ "use_phonemes": false,
91
+ "phonemizer": null,
92
+ "phoneme_language": "en-us",
93
+ "compute_input_seq_cache": false,
94
+ "text_cleaner": "multilingual_cleaners",
95
+ "enable_eos_bos_chars": false,
96
+ "test_sentences_file": "",
97
+ "phoneme_cache_path": "output_indic_fastpitch/brx/phoneme_cache",
98
+ "characters": {
99
+ "characters_class": "TTS.tts.models.vits.VitsCharacters",
100
+ "vocab_dict": null,
101
+ "pad": "<PAD>",
102
+ "eos": "<EOS>",
103
+ "bos": "<BOS>",
104
+ "blank": "<BLNK>",
105
+ "characters": " !\"',-./12:;?\u00bd\u02bc\u0902\u0903\u0905\u0906\u0907\u0908\u0909\u090f\u0910\u0913\u0914\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0923\u0924\u0925\u0926\u0927\u0928\u0929\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u093c\u093e\u093f\u0940\u0941\u0942\u0943\u0945\u0947\u0948\u0949\u094b\u094c\u094d\u095b\u095c\u095f\u0964\u097d\u200d\u2013\u2018\u201c\u201d",
106
+ "punctuations": "!\u00a1'(),-.:;\u00bf? ",
107
+ "phonemes": null,
108
+ "is_unique": true,
109
+ "is_sorted": true
110
+ },
111
+ "add_blank": false,
112
+ "batch_group_size": 0,
113
+ "loss_masking": null,
114
+ "sort_by_audio_len": true,
115
+ "min_audio_len": 1,
116
+ "max_audio_len": 441000,
117
+ "min_text_len": 1,
118
+ "max_text_len": 400,
119
+ "compute_f0": true,
120
+ "compute_linear_spec": false,
121
+ "precompute_num_workers": 0,
122
+ "start_by_longest": false,
123
+ "datasets": [
124
+ {
125
+ "name": "indictts",
126
+ "path": "/home/ttsteam/datasets/indictts/brx",
127
+ "meta_file_train": "metadata_train.csv",
128
+ "ignored_speakers": null,
129
+ "language": "brx",
130
+ "meta_file_val": "metadata_test.csv",
131
+ "meta_file_attn_mask": ""
132
+ }
133
+ ],
134
+ "test_sentences": [
135
+ "\u0917\u093e\u0935\u0928\u093f \u0917\u094b\u091c\u093e\u092e \u0917\u093e\u092e\u093f \u0928\u0935\u0925\u093f\u0916\u094c \u0939\u0930\u0916\u093e\u092c \u0928\u093e\u0917\u093e\u0930\u0928\u093e\u0928\u0948 \u0917\u094b\u0926\u093e\u0928 \u0939\u093e\u0926\u093e\u0928\u093e\u0935 \u0917\u093e\u0935\u0916\u094c \u0926\u093f\u0926\u094b\u092e\u0948 \u092b\u0938\u0902\u0925\u093e \u092b\u093f\u0924\u094d\u0930\u093e\u092f \u0939\u093e\u092c\u093e\u092f\u093e \u091c\u094b\u092c\u094b\u0926 \u0917\u094b\u092c\u094d\u0930\u093e\u092c \u091c\u093e\u092f\u094b\u0932\u0948 \u0917\u094b\u092e\u091c\u094b\u0930",
136
+ "\u0938\u093e\u0928\u0939\u093e\u092c\u0926\u094b\u0902 \u0906\u0902 \u092e\u094b\u0925\u0947 \u092e\u094b\u0925\u094b"
137
+ ],
138
+ "eval_split_max_size": null,
139
+ "eval_split_size": 0.01,
140
+ "use_speaker_weighted_sampler": false,
141
+ "speaker_weighted_sampler_alpha": 1.0,
142
+ "use_language_weighted_sampler": false,
143
+ "language_weighted_sampler_alpha": 1.0,
144
+ "use_length_weighted_sampler": false,
145
+ "length_weighted_sampler_alpha": 1.0,
146
+ "base_model": "forward_tts",
147
+ "model_args": {
148
+ "num_chars": 98,
149
+ "out_channels": 80,
150
+ "hidden_channels": 512,
151
+ "use_aligner": true,
152
+ "use_pitch": true,
153
+ "pitch_predictor_hidden_channels": 256,
154
+ "pitch_predictor_kernel_size": 3,
155
+ "pitch_predictor_dropout_p": 0.1,
156
+ "pitch_embedding_kernel_size": 3,
157
+ "duration_predictor_hidden_channels": 256,
158
+ "duration_predictor_kernel_size": 3,
159
+ "duration_predictor_dropout_p": 0.1,
160
+ "positional_encoding": true,
161
+ "poisitonal_encoding_use_scale": true,
162
+ "length_scale": 1,
163
+ "encoder_type": "fftransformer",
164
+ "encoder_params": {
165
+ "hidden_channels_ffn": 1024,
166
+ "num_heads": 1,
167
+ "num_layers": 6,
168
+ "dropout_p": 0.1
169
+ },
170
+ "decoder_type": "fftransformer",
171
+ "decoder_params": {
172
+ "hidden_channels_ffn": 1024,
173
+ "num_heads": 1,
174
+ "num_layers": 6,
175
+ "dropout_p": 0.1
176
+ },
177
+ "detach_duration_predictor": false,
178
+ "max_duration": 75,
179
+ "num_speakers": 2,
180
+ "use_speaker_embedding": true,
181
+ "speakers_file": "models/v1/brx/fastpitch/speakers.pth",
182
+ "use_d_vector_file": false,
183
+ "d_vector_dim": 512,
184
+ "d_vector_file": null,
185
+ "use_speaker_encoder_as_loss": false,
186
+ "speaker_encoder_config_path": "",
187
+ "speaker_encoder_model_path": "",
188
+ "vocoder_path": null,
189
+ "vocoder_config_path": null,
190
+ "use_separate_optimizers": false
191
+ },
192
+ "return_wav": false,
193
+ "num_speakers": 2,
194
+ "speakers_file": "models/v1/brx/fastpitch/speakers.pth",
195
+ "use_speaker_embedding": true,
196
+ "use_d_vector_file": false,
197
+ "d_vector_file": "",
198
+ "d_vector_dim": 512,
199
+ "spec_loss_type": "mse",
200
+ "duration_loss_type": "mse",
201
+ "use_ssim_loss": false,
202
+ "ssim_loss_alpha": 1.0,
203
+ "spec_loss_alpha": 1.0,
204
+ "aligner_loss_alpha": 1.0,
205
+ "pitch_loss_alpha": 0.1,
206
+ "dur_loss_alpha": 0.1,
207
+ "binary_align_loss_alpha": 0.1,
208
+ "spk_encoder_loss_alpha": 0.1,
209
+ "binary_loss_warmup_epochs": 150,
210
+ "aligner_epochs": 0,
211
+ "min_seq_len": 13,
212
+ "max_seq_len": 500000,
213
+ "r": 1,
214
+ "f0_cache_path": "output_indic_fastpitch/brx/f0_cache"
215
+ }
brx_fastpitch_speakers.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8879ecae8717702a1e0aa62e6a167a146181e88fb7ba8a54d1a4770a98f3372
3
+ size 431
brx_hifigan_best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:278fc693b102a7a33596ec62c9b3cfa47e1fe03e146ed336066d10f74febc0eb
3
+ size 1016384316
brx_hifigan_config.json ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "indic_vocoders",
3
+ "logger_uri": null,
4
+ "run_name": "brx_hifigan_all",
5
+ "project_name": "indic-vocoders",
6
+ "run_description": "None",
7
+ "print_step": 100,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": "indic-asr",
11
+ "dashboard_logger": "wandb",
12
+ "log_model_step": null,
13
+ "save_step": 10000,
14
+ "save_n_checkpoints": 1,
15
+ "save_checkpoints": true,
16
+ "save_all_best": false,
17
+ "save_best_after": 10000,
18
+ "target_loss": "loss_1",
19
+ "print_eval": false,
20
+ "test_delay_epochs": 0,
21
+ "run_eval": true,
22
+ "distributed_backend": "nccl",
23
+ "distributed_url": "tcp://localhost:10008",
24
+ "mixed_precision": true,
25
+ "epochs": 5000,
26
+ "batch_size": 32,
27
+ "eval_batch_size": 32,
28
+ "grad_clip": [
29
+ 5,
30
+ 5
31
+ ],
32
+ "scheduler_after_epoch": true,
33
+ "lr": 0.0001,
34
+ "optimizer": "AdamW",
35
+ "optimizer_params": {
36
+ "betas": [
37
+ 0.8,
38
+ 0.99
39
+ ],
40
+ "weight_decay": 0.0
41
+ },
42
+ "lr_scheduler": null,
43
+ "lr_scheduler_params": null,
44
+ "use_grad_scaler": false,
45
+ "cudnn_enable": true,
46
+ "cudnn_deterministic": false,
47
+ "cudnn_benchmark": false,
48
+ "training_seed": 54321,
49
+ "model": "hifigan",
50
+ "num_loader_workers": 8,
51
+ "num_eval_loader_workers": 8,
52
+ "use_noise_augment": true,
53
+ "audio": {
54
+ "fft_size": 1024,
55
+ "win_length": 1024,
56
+ "hop_length": 256,
57
+ "frame_shift_ms": null,
58
+ "frame_length_ms": null,
59
+ "stft_pad_mode": "reflect",
60
+ "sample_rate": 22050,
61
+ "resample": false,
62
+ "preemphasis": 0.0,
63
+ "ref_level_db": 20,
64
+ "do_sound_norm": false,
65
+ "log_func": "np.log",
66
+ "do_trim_silence": true,
67
+ "trim_db": 60.0,
68
+ "do_rms_norm": false,
69
+ "db_level": null,
70
+ "power": 1.5,
71
+ "griffin_lim_iters": 60,
72
+ "num_mels": 80,
73
+ "mel_fmin": 0.0,
74
+ "mel_fmax": 8000,
75
+ "spec_gain": 1.0,
76
+ "do_amp_to_db_linear": true,
77
+ "do_amp_to_db_mel": true,
78
+ "pitch_fmax": 640.0,
79
+ "pitch_fmin": 0.0,
80
+ "signal_norm": false,
81
+ "min_level_db": -100,
82
+ "symmetric_norm": true,
83
+ "max_norm": 4.0,
84
+ "clip_norm": true,
85
+ "stats_path": null
86
+ },
87
+ "eval_split_size": 10,
88
+ "data_path": "../../datasets/indictts/brx",
89
+ "feature_path": null,
90
+ "seq_len": 8192,
91
+ "pad_short": 2000,
92
+ "conv_pad": 0,
93
+ "use_cache": false,
94
+ "wd": 1e-06,
95
+ "use_stft_loss": false,
96
+ "use_subband_stft_loss": false,
97
+ "use_mse_gan_loss": true,
98
+ "use_hinge_gan_loss": false,
99
+ "use_feat_match_loss": true,
100
+ "use_l1_spec_loss": true,
101
+ "stft_loss_weight": 0,
102
+ "subband_stft_loss_weight": 0,
103
+ "mse_G_loss_weight": 1,
104
+ "hinge_G_loss_weight": 0,
105
+ "feat_match_loss_weight": 108,
106
+ "l1_spec_loss_weight": 45,
107
+ "stft_loss_params": {
108
+ "n_ffts": [
109
+ 1024,
110
+ 2048,
111
+ 512
112
+ ],
113
+ "hop_lengths": [
114
+ 120,
115
+ 240,
116
+ 50
117
+ ],
118
+ "win_lengths": [
119
+ 600,
120
+ 1200,
121
+ 240
122
+ ]
123
+ },
124
+ "l1_spec_loss_params": {
125
+ "use_mel": true,
126
+ "sample_rate": 22050,
127
+ "n_fft": 1024,
128
+ "hop_length": 256,
129
+ "win_length": 1024,
130
+ "n_mels": 80,
131
+ "mel_fmin": 0.0,
132
+ "mel_fmax": null
133
+ },
134
+ "lr_gen": 0.0001,
135
+ "lr_disc": 0.0001,
136
+ "lr_scheduler_gen": "ExponentialLR",
137
+ "lr_scheduler_gen_params": {
138
+ "gamma": 0.999,
139
+ "last_epoch": -1
140
+ },
141
+ "lr_scheduler_disc": "ExponentialLR",
142
+ "lr_scheduler_disc_params": {
143
+ "gamma": 0.999,
144
+ "last_epoch": -1
145
+ },
146
+ "use_pqmf": false,
147
+ "diff_samples_for_G_and_D": false,
148
+ "discriminator_model": "hifigan_discriminator",
149
+ "generator_model": "hifigan_generator",
150
+ "generator_model_params": {
151
+ "upsample_factors": [
152
+ 8,
153
+ 8,
154
+ 2,
155
+ 2
156
+ ],
157
+ "upsample_kernel_sizes": [
158
+ 16,
159
+ 16,
160
+ 4,
161
+ 4
162
+ ],
163
+ "upsample_initial_channel": 512,
164
+ "resblock_kernel_sizes": [
165
+ 3,
166
+ 7,
167
+ 11
168
+ ],
169
+ "resblock_dilation_sizes": [
170
+ [
171
+ 1,
172
+ 3,
173
+ 5
174
+ ],
175
+ [
176
+ 1,
177
+ 3,
178
+ 5
179
+ ],
180
+ [
181
+ 1,
182
+ 3,
183
+ 5
184
+ ]
185
+ ],
186
+ "resblock_type": "1"
187
+ },
188
+ "github_branch": "* main"
189
+ }
en+hi_fastpitch_best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:138b3dbd5cdb18fc38329c583686324cc21051c404ad73c993eac4197fcd6c94
3
+ size 651362501
en+hi_fastpitch_config.json ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "output_indic_fastpitch/ie2_hi/",
3
+ "logger_uri": null,
4
+ "run_name": "ie2_hi_fastpitch_indictts_all_codemixed_native_scripts",
5
+ "project_name": "indic-fastpitch",
6
+ "run_description": "codemixed_native_scripts",
7
+ "print_step": 100,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": "indic-asr",
11
+ "dashboard_logger": "wandb",
12
+ "log_model_step": 10000,
13
+ "save_step": 10000,
14
+ "save_n_checkpoints": 1,
15
+ "save_checkpoints": true,
16
+ "save_all_best": false,
17
+ "save_best_after": 10000,
18
+ "target_loss": null,
19
+ "print_eval": false,
20
+ "test_delay_epochs": 0,
21
+ "run_eval": true,
22
+ "distributed_backend": "nccl",
23
+ "distributed_url": "tcp://localhost:54321",
24
+ "mixed_precision": false,
25
+ "epochs": 2500,
26
+ "batch_size": 32,
27
+ "eval_batch_size": 32,
28
+ "grad_clip": 5.0,
29
+ "scheduler_after_epoch": true,
30
+ "lr": 0.0001,
31
+ "optimizer": "Adam",
32
+ "optimizer_params": {
33
+ "betas": [
34
+ 0.9,
35
+ 0.998
36
+ ],
37
+ "weight_decay": 1e-06
38
+ },
39
+ "lr_scheduler": "NoamLR",
40
+ "lr_scheduler_params": {
41
+ "warmup_steps": 50,
42
+ "scheduler_after_epoch": true
43
+ },
44
+ "use_grad_scaler": false,
45
+ "cudnn_enable": true,
46
+ "cudnn_deterministic": false,
47
+ "cudnn_benchmark": false,
48
+ "training_seed": 54321,
49
+ "model": "fast_pitch",
50
+ "num_loader_workers": 0,
51
+ "num_eval_loader_workers": 0,
52
+ "use_noise_augment": false,
53
+ "audio": {
54
+ "fft_size": 1024,
55
+ "win_length": 1024,
56
+ "hop_length": 256,
57
+ "frame_shift_ms": null,
58
+ "frame_length_ms": null,
59
+ "stft_pad_mode": "reflect",
60
+ "sample_rate": 22050,
61
+ "resample": false,
62
+ "preemphasis": 0.0,
63
+ "ref_level_db": 20,
64
+ "do_sound_norm": false,
65
+ "log_func": "np.log",
66
+ "do_trim_silence": true,
67
+ "trim_db": 60.0,
68
+ "do_rms_norm": false,
69
+ "db_level": null,
70
+ "power": 1.5,
71
+ "griffin_lim_iters": 60,
72
+ "num_mels": 80,
73
+ "mel_fmin": 0.0,
74
+ "mel_fmax": 8000,
75
+ "spec_gain": 1.0,
76
+ "do_amp_to_db_linear": true,
77
+ "do_amp_to_db_mel": true,
78
+ "pitch_fmax": 640.0,
79
+ "pitch_fmin": 0.0,
80
+ "signal_norm": false,
81
+ "min_level_db": -100,
82
+ "symmetric_norm": true,
83
+ "max_norm": 4.0,
84
+ "clip_norm": true,
85
+ "stats_path": null
86
+ },
87
+ "use_phonemes": false,
88
+ "phonemizer": null,
89
+ "phoneme_language": "en-us",
90
+ "compute_input_seq_cache": false,
91
+ "text_cleaner": "multilingual_cleaners",
92
+ "enable_eos_bos_chars": false,
93
+ "test_sentences_file": "",
94
+ "phoneme_cache_path": "output_indic_fastpitch/ie2_hi/phoneme_cache",
95
+ "characters": {
96
+ "characters_class": "TTS.tts.models.vits.VitsCharacters",
97
+ "vocab_dict": null,
98
+ "pad": "<PAD>",
99
+ "eos": "<EOS>",
100
+ "bos": "<BOS>",
101
+ "blank": "<BLNK>",
102
+ "characters": " !%,-.01234568:;?`abcdefghijklmnopqrstuvwxyz\u00a0\u0901\u0902\u0903\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u090f\u0910\u0911\u0913\u0914\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0931\u0932\u0933\u0935\u0936\u0937\u0938\u0939\u093c\u093e\u093f\u0940\u0941\u0942\u0943\u0945\u0947\u0948\u0949\u094b\u094c\u094d\u0958\u0959\u095a\u095b\u095c\u095d\u095e\u0960\u200d\u200e\u2013\u2014\u2026",
103
+ "punctuations": "!\u00a1'(),-.:;\u00bf? ",
104
+ "phonemes": null,
105
+ "is_unique": true,
106
+ "is_sorted": true
107
+ },
108
+ "add_blank": false,
109
+ "batch_group_size": 0,
110
+ "loss_masking": null,
111
+ "sort_by_audio_len": true,
112
+ "min_audio_len": 1,
113
+ "max_audio_len": 441000,
114
+ "min_text_len": 1,
115
+ "max_text_len": 400,
116
+ "compute_f0": true,
117
+ "compute_linear_spec": false,
118
+ "precompute_num_workers": 0,
119
+ "start_by_longest": false,
120
+ "datasets": [
121
+ {
122
+ "name": "indictts",
123
+ "path": "/home/praveen/ttsteam/datasets/indictts/ie2_hi",
124
+ "meta_file_train": "metadata_train_ie2_hi.csv",
125
+ "ignored_speakers": null,
126
+ "language": "ie2_hi",
127
+ "meta_file_val": "metadata_test_ie2_hi.csv",
128
+ "meta_file_attn_mask": ""
129
+ }
130
+ ],
131
+ "test_sentences": [
132
+ "Namaste! I can speak English too."
133
+ ],
134
+ "eval_split_max_size": null,
135
+ "eval_split_size": 0.01,
136
+ "use_speaker_weighted_sampler": false,
137
+ "speaker_weighted_sampler_alpha": 1.0,
138
+ "use_language_weighted_sampler": false,
139
+ "language_weighted_sampler_alpha": 1.0,
140
+ "use_length_weighted_sampler": false,
141
+ "length_weighted_sampler_alpha": 1.0,
142
+ "base_model": "forward_tts",
143
+ "model_args": {
144
+ "num_chars": 137,
145
+ "out_channels": 80,
146
+ "hidden_channels": 512,
147
+ "use_aligner": true,
148
+ "use_pitch": true,
149
+ "teacher_force_using_external_durations": false,
150
+ "pitch_predictor_hidden_channels": 256,
151
+ "pitch_predictor_kernel_size": 3,
152
+ "pitch_predictor_dropout_p": 0.1,
153
+ "pitch_embedding_kernel_size": 3,
154
+ "duration_predictor_hidden_channels": 256,
155
+ "duration_predictor_kernel_size": 3,
156
+ "duration_predictor_dropout_p": 0.1,
157
+ "positional_encoding": true,
158
+ "poisitonal_encoding_use_scale": true,
159
+ "length_scale": 1,
160
+ "encoder_type": "fftransformer",
161
+ "encoder_params": {
162
+ "hidden_channels_ffn": 1024,
163
+ "num_heads": 1,
164
+ "num_layers": 6,
165
+ "dropout_p": 0.1
166
+ },
167
+ "decoder_type": "fftransformer",
168
+ "decoder_params": {
169
+ "hidden_channels_ffn": 1024,
170
+ "num_heads": 1,
171
+ "num_layers": 6,
172
+ "dropout_p": 0.1
173
+ },
174
+ "detach_duration_predictor": false,
175
+ "max_duration": 75,
176
+ "num_speakers": 4,
177
+ "use_speaker_embedding": true,
178
+ "speakers_file": "models/fastpitch/v1/en/speakers.pth",
179
+ "use_d_vector_file": false,
180
+ "d_vector_dim": 512,
181
+ "d_vector_file": null,
182
+ "use_speaker_encoder_as_loss": false,
183
+ "speaker_encoder_config_path": "",
184
+ "speaker_encoder_model_path": "",
185
+ "vocoder_path": null,
186
+ "vocoder_config_path": null
187
+ },
188
+ "return_wav": false,
189
+ "num_speakers": 4,
190
+ "speakers_file": "models/fastpitch/v1/en/speakers.pth",
191
+ "use_speaker_embedding": true,
192
+ "use_d_vector_file": false,
193
+ "d_vector_file": "",
194
+ "d_vector_dim": 512,
195
+ "spec_loss_type": "mse",
196
+ "duration_loss_type": "mse",
197
+ "use_ssim_loss": false,
198
+ "ssim_loss_alpha": 1.0,
199
+ "spec_loss_alpha": 1.0,
200
+ "aligner_loss_alpha": 1.0,
201
+ "pitch_loss_alpha": 0.1,
202
+ "dur_loss_alpha": 0.1,
203
+ "binary_align_loss_alpha": 0.1,
204
+ "spk_encoder_loss_alpha": 0.1,
205
+ "binary_loss_warmup_epochs": 150,
206
+ "aligner_epochs": 2500,
207
+ "min_seq_len": 13,
208
+ "max_seq_len": 500000,
209
+ "r": 1,
210
+ "f0_cache_path": "output_indic_fastpitch/ie2_hi/f0_cache",
211
+ "durations_cache_path": null
212
+ }
en+hi_fastpitch_speakers.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7aae86c97717aa88be8a4b0140842bbf1fe6dd84eb08a00b112d1ae0245ed0b
3
+ size 495
en+hi_hifigan_best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b0730a38f75f990a6ecf0aa2cf99c822af821e2240ee943951c9fa365731047
3
+ size 1016384316
en+hi_hifigan_config.json ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "indic_vocoders",
3
+ "logger_uri": null,
4
+ "run_name": "indianenglish_hifigan_all",
5
+ "project_name": "indic-vocoders",
6
+ "run_description": "None",
7
+ "print_step": 100,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": "indic-asr",
11
+ "dashboard_logger": "wandb",
12
+ "log_model_step": null,
13
+ "save_step": 10000,
14
+ "save_n_checkpoints": 1,
15
+ "save_checkpoints": true,
16
+ "save_all_best": false,
17
+ "save_best_after": 10000,
18
+ "target_loss": "loss_1",
19
+ "print_eval": false,
20
+ "test_delay_epochs": 0,
21
+ "run_eval": true,
22
+ "distributed_backend": "nccl",
23
+ "distributed_url": "tcp://localhost:10004",
24
+ "mixed_precision": true,
25
+ "epochs": 5000,
26
+ "batch_size": 32,
27
+ "eval_batch_size": 32,
28
+ "grad_clip": [
29
+ 5,
30
+ 5
31
+ ],
32
+ "scheduler_after_epoch": true,
33
+ "lr": 0.0001,
34
+ "optimizer": "AdamW",
35
+ "optimizer_params": {
36
+ "betas": [
37
+ 0.8,
38
+ 0.99
39
+ ],
40
+ "weight_decay": 0.0
41
+ },
42
+ "lr_scheduler": null,
43
+ "lr_scheduler_params": null,
44
+ "use_grad_scaler": false,
45
+ "cudnn_enable": true,
46
+ "cudnn_deterministic": false,
47
+ "cudnn_benchmark": false,
48
+ "training_seed": 54321,
49
+ "model": "hifigan",
50
+ "num_loader_workers": 8,
51
+ "num_eval_loader_workers": 8,
52
+ "use_noise_augment": true,
53
+ "audio": {
54
+ "fft_size": 1024,
55
+ "win_length": 1024,
56
+ "hop_length": 256,
57
+ "frame_shift_ms": null,
58
+ "frame_length_ms": null,
59
+ "stft_pad_mode": "reflect",
60
+ "sample_rate": 22050,
61
+ "resample": false,
62
+ "preemphasis": 0.0,
63
+ "ref_level_db": 20,
64
+ "do_sound_norm": false,
65
+ "log_func": "np.log",
66
+ "do_trim_silence": true,
67
+ "trim_db": 60.0,
68
+ "do_rms_norm": false,
69
+ "db_level": null,
70
+ "power": 1.5,
71
+ "griffin_lim_iters": 60,
72
+ "num_mels": 80,
73
+ "mel_fmin": 0.0,
74
+ "mel_fmax": 8000,
75
+ "spec_gain": 1.0,
76
+ "do_amp_to_db_linear": true,
77
+ "do_amp_to_db_mel": true,
78
+ "pitch_fmax": 640.0,
79
+ "pitch_fmin": 0.0,
80
+ "signal_norm": false,
81
+ "min_level_db": -100,
82
+ "symmetric_norm": true,
83
+ "max_norm": 4.0,
84
+ "clip_norm": true,
85
+ "stats_path": null
86
+ },
87
+ "eval_split_size": 10,
88
+ "data_path": "../../datasets/indictts/indianenglish",
89
+ "feature_path": null,
90
+ "seq_len": 8192,
91
+ "pad_short": 2000,
92
+ "conv_pad": 0,
93
+ "use_cache": false,
94
+ "wd": 1e-06,
95
+ "use_stft_loss": false,
96
+ "use_subband_stft_loss": false,
97
+ "use_mse_gan_loss": true,
98
+ "use_hinge_gan_loss": false,
99
+ "use_feat_match_loss": true,
100
+ "use_l1_spec_loss": true,
101
+ "stft_loss_weight": 0,
102
+ "subband_stft_loss_weight": 0,
103
+ "mse_G_loss_weight": 1,
104
+ "hinge_G_loss_weight": 0,
105
+ "feat_match_loss_weight": 108,
106
+ "l1_spec_loss_weight": 45,
107
+ "stft_loss_params": {
108
+ "n_ffts": [
109
+ 1024,
110
+ 2048,
111
+ 512
112
+ ],
113
+ "hop_lengths": [
114
+ 120,
115
+ 240,
116
+ 50
117
+ ],
118
+ "win_lengths": [
119
+ 600,
120
+ 1200,
121
+ 240
122
+ ]
123
+ },
124
+ "l1_spec_loss_params": {
125
+ "use_mel": true,
126
+ "sample_rate": 22050,
127
+ "n_fft": 1024,
128
+ "hop_length": 256,
129
+ "win_length": 1024,
130
+ "n_mels": 80,
131
+ "mel_fmin": 0.0,
132
+ "mel_fmax": null
133
+ },
134
+ "lr_gen": 0.0001,
135
+ "lr_disc": 0.0001,
136
+ "lr_scheduler_gen": "ExponentialLR",
137
+ "lr_scheduler_gen_params": {
138
+ "gamma": 0.999,
139
+ "last_epoch": -1
140
+ },
141
+ "lr_scheduler_disc": "ExponentialLR",
142
+ "lr_scheduler_disc_params": {
143
+ "gamma": 0.999,
144
+ "last_epoch": -1
145
+ },
146
+ "use_pqmf": false,
147
+ "diff_samples_for_G_and_D": false,
148
+ "discriminator_model": "hifigan_discriminator",
149
+ "generator_model": "hifigan_generator",
150
+ "generator_model_params": {
151
+ "upsample_factors": [
152
+ 8,
153
+ 8,
154
+ 2,
155
+ 2
156
+ ],
157
+ "upsample_kernel_sizes": [
158
+ 16,
159
+ 16,
160
+ 4,
161
+ 4
162
+ ],
163
+ "upsample_initial_channel": 512,
164
+ "resblock_kernel_sizes": [
165
+ 3,
166
+ 7,
167
+ 11
168
+ ],
169
+ "resblock_dilation_sizes": [
170
+ [
171
+ 1,
172
+ 3,
173
+ 5
174
+ ],
175
+ [
176
+ 1,
177
+ 3,
178
+ 5
179
+ ],
180
+ [
181
+ 1,
182
+ 3,
183
+ 5
184
+ ]
185
+ ],
186
+ "resblock_type": "1"
187
+ },
188
+ "github_branch": "inside_docker"
189
+ }
en_fastpitch_best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb878164c8807aeb02ec266e3062ee49bc743205d78704cbaa000c907e3b94a8
3
+ size 651086333
en_fastpitch_config.json ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "output_indic_fastpitch/indianenglish",
3
+ "logger_uri": null,
4
+ "run_name": "indianenglish_fastpitch_indictts_all_",
5
+ "project_name": "indic-fastpitch",
6
+ "run_description": "",
7
+ "print_step": 100,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": "indic-asr",
11
+ "dashboard_logger": "wandb",
12
+ "log_model_step": 10000,
13
+ "save_step": 10000,
14
+ "save_n_checkpoints": 1,
15
+ "save_checkpoints": true,
16
+ "save_all_best": false,
17
+ "save_best_after": 10000,
18
+ "target_loss": null,
19
+ "print_eval": false,
20
+ "test_delay_epochs": 0,
21
+ "run_eval": true,
22
+ "distributed_backend": "nccl",
23
+ "distributed_url": "tcp://localhost:54321",
24
+ "mixed_precision": true,
25
+ "epochs": 2500,
26
+ "batch_size": 32,
27
+ "eval_batch_size": 32,
28
+ "grad_clip": 5.0,
29
+ "scheduler_after_epoch": true,
30
+ "lr": 0.0001,
31
+ "optimizer": "Adam",
32
+ "optimizer_params": {
33
+ "betas": [
34
+ 0.9,
35
+ 0.998
36
+ ],
37
+ "weight_decay": 1e-06
38
+ },
39
+ "lr_scheduler": "NoamLR",
40
+ "lr_scheduler_params": {
41
+ "warmup_steps": 500,
42
+ "scheduler_after_epoch": true
43
+ },
44
+ "use_grad_scaler": false,
45
+ "cudnn_enable": true,
46
+ "cudnn_deterministic": false,
47
+ "cudnn_benchmark": false,
48
+ "training_seed": 54321,
49
+ "model": "fast_pitch",
50
+ "num_loader_workers": 0,
51
+ "num_eval_loader_workers": 0,
52
+ "use_noise_augment": false,
53
+ "audio": {
54
+ "fft_size": 1024,
55
+ "win_length": 1024,
56
+ "hop_length": 256,
57
+ "frame_shift_ms": null,
58
+ "frame_length_ms": null,
59
+ "stft_pad_mode": "reflect",
60
+ "sample_rate": 22050,
61
+ "resample": false,
62
+ "preemphasis": 0.0,
63
+ "ref_level_db": 20,
64
+ "do_sound_norm": false,
65
+ "log_func": "np.log",
66
+ "do_trim_silence": true,
67
+ "trim_db": 60.0,
68
+ "do_rms_norm": false,
69
+ "db_level": null,
70
+ "power": 1.5,
71
+ "griffin_lim_iters": 60,
72
+ "num_mels": 80,
73
+ "mel_fmin": 0.0,
74
+ "mel_fmax": 8000,
75
+ "spec_gain": 1.0,
76
+ "do_amp_to_db_linear": true,
77
+ "do_amp_to_db_mel": true,
78
+ "pitch_fmax": 640.0,
79
+ "pitch_fmin": 0.0,
80
+ "signal_norm": false,
81
+ "min_level_db": -100,
82
+ "symmetric_norm": true,
83
+ "max_norm": 4.0,
84
+ "clip_norm": true,
85
+ "stats_path": null
86
+ },
87
+ "use_phonemes": false,
88
+ "phonemizer": null,
89
+ "phoneme_language": "en-us",
90
+ "compute_input_seq_cache": false,
91
+ "text_cleaner": "multilingual_cleaners",
92
+ "enable_eos_bos_chars": false,
93
+ "test_sentences_file": "",
94
+ "phoneme_cache_path": "output_indic_fastpitch/indianenglish/phoneme_cache",
95
+ "characters": {
96
+ "characters_class": "TTS.tts.models.vits.VitsCharacters",
97
+ "vocab_dict": null,
98
+ "pad": "<PAD>",
99
+ "eos": "<EOS>",
100
+ "bos": "<BOS>",
101
+ "blank": "<BLNK>",
102
+ "characters": " !&,-.0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWYZ`abcdefghijklmnopqrstuvwxyz\u00e2\u02dc\u2013\u2014\u20ac\u2122",
103
+ "punctuations": "!\u00a1'(),-.:;\u00bf? ",
104
+ "phonemes": null,
105
+ "is_unique": true,
106
+ "is_sorted": true
107
+ },
108
+ "add_blank": false,
109
+ "batch_group_size": 0,
110
+ "loss_masking": null,
111
+ "sort_by_audio_len": true,
112
+ "min_audio_len": 1,
113
+ "max_audio_len": 441000,
114
+ "min_text_len": 1,
115
+ "max_text_len": 400,
116
+ "compute_f0": true,
117
+ "compute_linear_spec": false,
118
+ "precompute_num_workers": 0,
119
+ "start_by_longest": false,
120
+ "datasets": [
121
+ {
122
+ "name": "indictts",
123
+ "path": "/home/ttsteam/datasets/indictts/indianenglish",
124
+ "meta_file_train": "metadata_train_ie2.csv",
125
+ "ignored_speakers": null,
126
+ "language": "indianenglish",
127
+ "meta_file_val": "metadata_test_ie2.csv",
128
+ "meta_file_attn_mask": ""
129
+ }
130
+ ],
131
+ "test_sentences": [
132
+ "Namaste! I can speak English too."
133
+ ],
134
+ "eval_split_max_size": null,
135
+ "eval_split_size": 0.01,
136
+ "use_speaker_weighted_sampler": false,
137
+ "speaker_weighted_sampler_alpha": 1.0,
138
+ "use_language_weighted_sampler": false,
139
+ "language_weighted_sampler_alpha": 1.0,
140
+ "use_length_weighted_sampler": false,
141
+ "length_weighted_sampler_alpha": 1.0,
142
+ "base_model": "forward_tts",
143
+ "model_args": {
144
+ "num_chars": 92,
145
+ "out_channels": 80,
146
+ "hidden_channels": 512,
147
+ "use_aligner": true,
148
+ "use_pitch": true,
149
+ "pitch_predictor_hidden_channels": 256,
150
+ "pitch_predictor_kernel_size": 3,
151
+ "pitch_predictor_dropout_p": 0.1,
152
+ "pitch_embedding_kernel_size": 3,
153
+ "duration_predictor_hidden_channels": 256,
154
+ "duration_predictor_kernel_size": 3,
155
+ "duration_predictor_dropout_p": 0.1,
156
+ "positional_encoding": true,
157
+ "poisitonal_encoding_use_scale": true,
158
+ "length_scale": 1,
159
+ "encoder_type": "fftransformer",
160
+ "encoder_params": {
161
+ "hidden_channels_ffn": 1024,
162
+ "num_heads": 1,
163
+ "num_layers": 6,
164
+ "dropout_p": 0.1
165
+ },
166
+ "decoder_type": "fftransformer",
167
+ "decoder_params": {
168
+ "hidden_channels_ffn": 1024,
169
+ "num_heads": 1,
170
+ "num_layers": 6,
171
+ "dropout_p": 0.1
172
+ },
173
+ "detach_duration_predictor": false,
174
+ "max_duration": 75,
175
+ "num_speakers": 2,
176
+ "use_speaker_embedding": true,
177
+ "speakers_file": "models/v1/en/fastpitch/speakers.pth",
178
+ "use_d_vector_file": false,
179
+ "d_vector_dim": 512,
180
+ "d_vector_file": null,
181
+ "use_speaker_encoder_as_loss": false,
182
+ "speaker_encoder_config_path": "",
183
+ "speaker_encoder_model_path": "",
184
+ "vocoder_path": null,
185
+ "vocoder_config_path": null
186
+ },
187
+ "return_wav": false,
188
+ "num_speakers": 2,
189
+ "speakers_file": "models/v1/en/fastpitch/speakers.pth",
190
+ "use_speaker_embedding": true,
191
+ "use_d_vector_file": false,
192
+ "d_vector_file": "",
193
+ "d_vector_dim": 512,
194
+ "spec_loss_type": "mse",
195
+ "duration_loss_type": "mse",
196
+ "use_ssim_loss": false,
197
+ "ssim_loss_alpha": 1.0,
198
+ "spec_loss_alpha": 1.0,
199
+ "aligner_loss_alpha": 1.0,
200
+ "pitch_loss_alpha": 0.1,
201
+ "dur_loss_alpha": 0.1,
202
+ "binary_align_loss_alpha": 0.1,
203
+ "spk_encoder_loss_alpha": 0.1,
204
+ "binary_loss_warmup_epochs": 150,
205
+ "aligner_epochs": 2500,
206
+ "min_seq_len": 13,
207
+ "max_seq_len": 500000,
208
+ "r": 1,
209
+ "f0_cache_path": "output_indic_fastpitch/indianenglish/f0_cache"
210
+ }
en_fastpitch_speakers.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c561da454a08756ed5223bfd6682b7f966176b0d8b8bdb41cd1456fdf4e32f2
3
+ size 431
en_hifigan_best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b0730a38f75f990a6ecf0aa2cf99c822af821e2240ee943951c9fa365731047
3
+ size 1016384316
en_hifigan_config.json ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "indic_vocoders",
3
+ "logger_uri": null,
4
+ "run_name": "indianenglish_hifigan_all",
5
+ "project_name": "indic-vocoders",
6
+ "run_description": "None",
7
+ "print_step": 100,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": "indic-asr",
11
+ "dashboard_logger": "wandb",
12
+ "log_model_step": null,
13
+ "save_step": 10000,
14
+ "save_n_checkpoints": 1,
15
+ "save_checkpoints": true,
16
+ "save_all_best": false,
17
+ "save_best_after": 10000,
18
+ "target_loss": "loss_1",
19
+ "print_eval": false,
20
+ "test_delay_epochs": 0,
21
+ "run_eval": true,
22
+ "distributed_backend": "nccl",
23
+ "distributed_url": "tcp://localhost:10004",
24
+ "mixed_precision": true,
25
+ "epochs": 5000,
26
+ "batch_size": 32,
27
+ "eval_batch_size": 32,
28
+ "grad_clip": [
29
+ 5,
30
+ 5
31
+ ],
32
+ "scheduler_after_epoch": true,
33
+ "lr": 0.0001,
34
+ "optimizer": "AdamW",
35
+ "optimizer_params": {
36
+ "betas": [
37
+ 0.8,
38
+ 0.99
39
+ ],
40
+ "weight_decay": 0.0
41
+ },
42
+ "lr_scheduler": null,
43
+ "lr_scheduler_params": null,
44
+ "use_grad_scaler": false,
45
+ "cudnn_enable": true,
46
+ "cudnn_deterministic": false,
47
+ "cudnn_benchmark": false,
48
+ "training_seed": 54321,
49
+ "model": "hifigan",
50
+ "num_loader_workers": 8,
51
+ "num_eval_loader_workers": 8,
52
+ "use_noise_augment": true,
53
+ "audio": {
54
+ "fft_size": 1024,
55
+ "win_length": 1024,
56
+ "hop_length": 256,
57
+ "frame_shift_ms": null,
58
+ "frame_length_ms": null,
59
+ "stft_pad_mode": "reflect",
60
+ "sample_rate": 22050,
61
+ "resample": false,
62
+ "preemphasis": 0.0,
63
+ "ref_level_db": 20,
64
+ "do_sound_norm": false,
65
+ "log_func": "np.log",
66
+ "do_trim_silence": true,
67
+ "trim_db": 60.0,
68
+ "do_rms_norm": false,
69
+ "db_level": null,
70
+ "power": 1.5,
71
+ "griffin_lim_iters": 60,
72
+ "num_mels": 80,
73
+ "mel_fmin": 0.0,
74
+ "mel_fmax": 8000,
75
+ "spec_gain": 1.0,
76
+ "do_amp_to_db_linear": true,
77
+ "do_amp_to_db_mel": true,
78
+ "pitch_fmax": 640.0,
79
+ "pitch_fmin": 0.0,
80
+ "signal_norm": false,
81
+ "min_level_db": -100,
82
+ "symmetric_norm": true,
83
+ "max_norm": 4.0,
84
+ "clip_norm": true,
85
+ "stats_path": null
86
+ },
87
+ "eval_split_size": 10,
88
+ "data_path": "../../datasets/indictts/indianenglish",
89
+ "feature_path": null,
90
+ "seq_len": 8192,
91
+ "pad_short": 2000,
92
+ "conv_pad": 0,
93
+ "use_cache": false,
94
+ "wd": 1e-06,
95
+ "use_stft_loss": false,
96
+ "use_subband_stft_loss": false,
97
+ "use_mse_gan_loss": true,
98
+ "use_hinge_gan_loss": false,
99
+ "use_feat_match_loss": true,
100
+ "use_l1_spec_loss": true,
101
+ "stft_loss_weight": 0,
102
+ "subband_stft_loss_weight": 0,
103
+ "mse_G_loss_weight": 1,
104
+ "hinge_G_loss_weight": 0,
105
+ "feat_match_loss_weight": 108,
106
+ "l1_spec_loss_weight": 45,
107
+ "stft_loss_params": {
108
+ "n_ffts": [
109
+ 1024,
110
+ 2048,
111
+ 512
112
+ ],
113
+ "hop_lengths": [
114
+ 120,
115
+ 240,
116
+ 50
117
+ ],
118
+ "win_lengths": [
119
+ 600,
120
+ 1200,
121
+ 240
122
+ ]
123
+ },
124
+ "l1_spec_loss_params": {
125
+ "use_mel": true,
126
+ "sample_rate": 22050,
127
+ "n_fft": 1024,
128
+ "hop_length": 256,
129
+ "win_length": 1024,
130
+ "n_mels": 80,
131
+ "mel_fmin": 0.0,
132
+ "mel_fmax": null
133
+ },
134
+ "lr_gen": 0.0001,
135
+ "lr_disc": 0.0001,
136
+ "lr_scheduler_gen": "ExponentialLR",
137
+ "lr_scheduler_gen_params": {
138
+ "gamma": 0.999,
139
+ "last_epoch": -1
140
+ },
141
+ "lr_scheduler_disc": "ExponentialLR",
142
+ "lr_scheduler_disc_params": {
143
+ "gamma": 0.999,
144
+ "last_epoch": -1
145
+ },
146
+ "use_pqmf": false,
147
+ "diff_samples_for_G_and_D": false,
148
+ "discriminator_model": "hifigan_discriminator",
149
+ "generator_model": "hifigan_generator",
150
+ "generator_model_params": {
151
+ "upsample_factors": [
152
+ 8,
153
+ 8,
154
+ 2,
155
+ 2
156
+ ],
157
+ "upsample_kernel_sizes": [
158
+ 16,
159
+ 16,
160
+ 4,
161
+ 4
162
+ ],
163
+ "upsample_initial_channel": 512,
164
+ "resblock_kernel_sizes": [
165
+ 3,
166
+ 7,
167
+ 11
168
+ ],
169
+ "resblock_dilation_sizes": [
170
+ [
171
+ 1,
172
+ 3,
173
+ 5
174
+ ],
175
+ [
176
+ 1,
177
+ 3,
178
+ 5
179
+ ],
180
+ [
181
+ 1,
182
+ 3,
183
+ 5
184
+ ]
185
+ ],
186
+ "resblock_type": "1"
187
+ },
188
+ "github_branch": "inside_docker"
189
+ }
gu_fastpitch_best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82f853f96a5e41ea00e526611b4cd0b8a343063c6dd8a1800bbe65921fe1ae89
3
+ size 637461273
gu_fastpitch_config.json ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "output_indic_fastpitch/gu",
3
+ "logger_uri": null,
4
+ "run_name": "gu_fastpitch_indictts_all_align_off",
5
+ "project_name": "indic-fastpitch-stage2",
6
+ "run_description": "align_off",
7
+ "print_step": 100,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": "indic-asr",
11
+ "dashboard_logger": "wandb",
12
+ "log_model_step": 10000,
13
+ "save_step": 10000,
14
+ "save_n_checkpoints": 1,
15
+ "save_checkpoints": true,
16
+ "save_all_best": false,
17
+ "save_best_after": 10000,
18
+ "target_loss": null,
19
+ "print_eval": false,
20
+ "test_delay_epochs": 0,
21
+ "run_eval": true,
22
+ "distributed_backend": "nccl",
23
+ "distributed_url": "tcp://localhost:54321",
24
+ "mixed_precision": true,
25
+ "epochs": 1000,
26
+ "batch_size": 32,
27
+ "eval_batch_size": 32,
28
+ "grad_clip": 5.0,
29
+ "scheduler_after_epoch": true,
30
+ "lr": 0.0001,
31
+ "optimizer": "Adam",
32
+ "optimizer_params": {
33
+ "betas": [
34
+ 0.9,
35
+ 0.998
36
+ ],
37
+ "weight_decay": 1e-06
38
+ },
39
+ "lr_scheduler": "NoamLR",
40
+ "lr_scheduler_params": {
41
+ "warmup_steps": 4000
42
+ },
43
+ "lr_scheduler_aligner": "NoamLR",
44
+ "lr_scheduler_aligner_params": {
45
+ "warmup_steps": 4000
46
+ },
47
+ "use_grad_scaler": false,
48
+ "cudnn_enable": true,
49
+ "cudnn_deterministic": false,
50
+ "cudnn_benchmark": false,
51
+ "training_seed": 54321,
52
+ "model": "fast_pitch",
53
+ "num_loader_workers": 0,
54
+ "num_eval_loader_workers": 0,
55
+ "use_noise_augment": false,
56
+ "audio": {
57
+ "fft_size": 1024,
58
+ "win_length": 1024,
59
+ "hop_length": 256,
60
+ "frame_shift_ms": null,
61
+ "frame_length_ms": null,
62
+ "stft_pad_mode": "reflect",
63
+ "sample_rate": 22050,
64
+ "resample": false,
65
+ "preemphasis": 0.0,
66
+ "ref_level_db": 20,
67
+ "do_sound_norm": false,
68
+ "log_func": "np.log",
69
+ "do_trim_silence": true,
70
+ "trim_db": 60.0,
71
+ "do_rms_norm": false,
72
+ "db_level": null,
73
+ "power": 1.5,
74
+ "griffin_lim_iters": 60,
75
+ "num_mels": 80,
76
+ "mel_fmin": 0.0,
77
+ "mel_fmax": 8000,
78
+ "spec_gain": 1.0,
79
+ "do_amp_to_db_linear": true,
80
+ "do_amp_to_db_mel": true,
81
+ "pitch_fmax": 640.0,
82
+ "pitch_fmin": 0.0,
83
+ "signal_norm": false,
84
+ "min_level_db": -100,
85
+ "symmetric_norm": true,
86
+ "max_norm": 4.0,
87
+ "clip_norm": true,
88
+ "stats_path": null
89
+ },
90
+ "use_phonemes": false,
91
+ "phonemizer": null,
92
+ "phoneme_language": "en-us",
93
+ "compute_input_seq_cache": false,
94
+ "text_cleaner": "multilingual_cleaners",
95
+ "enable_eos_bos_chars": false,
96
+ "test_sentences_file": "",
97
+ "phoneme_cache_path": "output_indic_fastpitch/gu/phoneme_cache",
98
+ "characters": {
99
+ "characters_class": "TTS.tts.models.vits.VitsCharacters",
100
+ "vocab_dict": null,
101
+ "pad": "<PAD>",
102
+ "eos": "<EOS>",
103
+ "bos": "<BOS>",
104
+ "blank": "<BLNK>",
105
+ "characters": " !',-.:;?m\u00a0\u0964\u0a81\u0a82\u0a83\u0a85\u0a86\u0a87\u0a88\u0a89\u0a8a\u0a8b\u0a8d\u0a8f\u0a90\u0a91\u0a93\u0a94\u0a95\u0a96\u0a97\u0a98\u0a9a\u0a9b\u0a9c\u0a9d\u0a9e\u0a9f\u0aa0\u0aa1\u0aa2\u0aa3\u0aa4\u0aa5\u0aa6\u0aa7\u0aa8\u0aaa\u0aab\u0aac\u0aad\u0aae\u0aaf\u0ab0\u0ab2\u0ab3\u0ab5\u0ab6\u0ab7\u0ab8\u0ab9\u0abe\u0abf\u0ac0\u0ac1\u0ac2\u0ac3\u0ac4\u0ac5\u0ac7\u0ac8\u0ac9\u0acb\u0acc\u0acd\u0ad0\u0ae0\u0ae7\u0ae8\u0aea\u0aeb\u200c\u2013\u2018\u2019\u2026\ufeff",
106
+ "punctuations": "!\u00a1'(),-.:;\u00bf? ",
107
+ "phonemes": null,
108
+ "is_unique": true,
109
+ "is_sorted": true
110
+ },
111
+ "add_blank": false,
112
+ "batch_group_size": 0,
113
+ "loss_masking": null,
114
+ "sort_by_audio_len": true,
115
+ "min_audio_len": 1,
116
+ "max_audio_len": 441000,
117
+ "min_text_len": 1,
118
+ "max_text_len": 400,
119
+ "compute_f0": true,
120
+ "compute_linear_spec": false,
121
+ "precompute_num_workers": 0,
122
+ "start_by_longest": false,
123
+ "datasets": [
124
+ {
125
+ "name": "indictts",
126
+ "path": "/home/ttsteam/datasets/indictts/gu",
127
+ "meta_file_train": "metadata_train.csv",
128
+ "ignored_speakers": null,
129
+ "language": "gu",
130
+ "meta_file_val": "metadata_test.csv",
131
+ "meta_file_attn_mask": ""
132
+ }
133
+ ],
134
+ "test_sentences": [
135
+ "\u0a93\u0a97\u0aa3\u0ac0\u0ab8\u0acb \u0a9b\u0aa4\u0acd\u0ab0\u0ac0\u0ab8 \u0aae\u0abe\u0a82, \u0aaa\u0acd\u0ab0\u0aa5\u0aae\u0ab5\u0abe\u0ab0, \u0a8f\u0a95\u0acd\u0ab0\u0ac7\u0ab2\u0ac0\u0a95 \u0ab8\u0ac7\u0aab\u0a9f\u0ac0 \u0a97\u0acd\u0ab2\u0abe\u0ab8\u0aa8\u0ac1\u0a82, \u0a89\u0aa4\u0acd\u0aaa\u0abe\u0aa6\u0aa8, \u0ab6\u0ab0\u0ac1 \u0aa5\u0a88 \u0a97\u0aaf\u0ac1\u0a82.",
136
+ "\u0ab5\u0acd\u0aaf\u0abe\u0aaf\u0abe\u0aae \u0aaa\u0a9b\u0ac0 \u0aaa\u0acd\u0ab0\u0acb\u0a9f\u0ac0\u0aa8 \u0ab2\u0ac7\u0ab5\u0abe\u0aa5\u0ac0, \u0ab8\u0acd\u0aa8\u0abe\u0aaf\u0ac1\u0aa8\u0ac0 \u0a9c\u0ac7 \u0aaa\u0ac7\u0ab6\u0ac0\u0aaf\u0acb\u0aa8\u0ac7 \u0ab9\u0abe\u0aa8\u0abf \u0aaa\u0acd\u0ab9\u0acb\u0a82\u0a9a\u0ac0 \u0ab9\u0acb\u0aaf \u0a9b\u0ac7."
137
+ ],
138
+ "eval_split_max_size": null,
139
+ "eval_split_size": 0.01,
140
+ "use_speaker_weighted_sampler": false,
141
+ "speaker_weighted_sampler_alpha": 1.0,
142
+ "use_language_weighted_sampler": false,
143
+ "language_weighted_sampler_alpha": 1.0,
144
+ "use_length_weighted_sampler": false,
145
+ "length_weighted_sampler_alpha": 1.0,
146
+ "base_model": "forward_tts",
147
+ "model_args": {
148
+ "num_chars": 102,
149
+ "out_channels": 80,
150
+ "hidden_channels": 512,
151
+ "use_aligner": true,
152
+ "use_pitch": true,
153
+ "pitch_predictor_hidden_channels": 256,
154
+ "pitch_predictor_kernel_size": 3,
155
+ "pitch_predictor_dropout_p": 0.1,
156
+ "pitch_embedding_kernel_size": 3,
157
+ "duration_predictor_hidden_channels": 256,
158
+ "duration_predictor_kernel_size": 3,
159
+ "duration_predictor_dropout_p": 0.1,
160
+ "positional_encoding": true,
161
+ "poisitonal_encoding_use_scale": true,
162
+ "length_scale": 1,
163
+ "encoder_type": "fftransformer",
164
+ "encoder_params": {
165
+ "hidden_channels_ffn": 1024,
166
+ "num_heads": 1,
167
+ "num_layers": 6,
168
+ "dropout_p": 0.1
169
+ },
170
+ "decoder_type": "fftransformer",
171
+ "decoder_params": {
172
+ "hidden_channels_ffn": 1024,
173
+ "num_heads": 1,
174
+ "num_layers": 6,
175
+ "dropout_p": 0.1
176
+ },
177
+ "detach_duration_predictor": false,
178
+ "max_duration": 75,
179
+ "num_speakers": 2,
180
+ "use_speaker_embedding": true,
181
+ "speakers_file": "models/v1/gu/fastpitch/speakers.pth",
182
+ "use_d_vector_file": false,
183
+ "d_vector_dim": 512,
184
+ "d_vector_file": null,
185
+ "use_speaker_encoder_as_loss": false,
186
+ "speaker_encoder_config_path": "",
187
+ "speaker_encoder_model_path": "",
188
+ "vocoder_path": null,
189
+ "vocoder_config_path": null,
190
+ "use_separate_optimizers": false
191
+ },
192
+ "return_wav": false,
193
+ "num_speakers": 2,
194
+ "speakers_file": "models/v1/gu/fastpitch/speakers.pth",
195
+ "use_speaker_embedding": true,
196
+ "use_d_vector_file": false,
197
+ "d_vector_file": "",
198
+ "d_vector_dim": 512,
199
+ "spec_loss_type": "mse",
200
+ "duration_loss_type": "mse",
201
+ "use_ssim_loss": false,
202
+ "ssim_loss_alpha": 1.0,
203
+ "spec_loss_alpha": 1.0,
204
+ "aligner_loss_alpha": 1.0,
205
+ "pitch_loss_alpha": 0.1,
206
+ "dur_loss_alpha": 0.1,
207
+ "binary_align_loss_alpha": 0.1,
208
+ "spk_encoder_loss_alpha": 0.1,
209
+ "binary_loss_warmup_epochs": 150,
210
+ "aligner_epochs": 0,
211
+ "min_seq_len": 13,
212
+ "max_seq_len": 500000,
213
+ "r": 1,
214
+ "f0_cache_path": "output_indic_fastpitch/gu/f0_cache"
215
+ }
gu_fastpitch_speakers.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f665e358b34b232fb27f7c8cd3968fcd47784a7be065ae127f611c33ee809bea
3
+ size 431
gu_hifigan_best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92d86da63b3c323f1d315b482e951eca12e019039af9dd08cd75d8e28fe26079
3
+ size 1016384316
gu_hifigan_config.json ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "indic_vocoders",
3
+ "logger_uri": null,
4
+ "run_name": "gu_hifigan_all",
5
+ "project_name": "indic-vocoders",
6
+ "run_description": "None",
7
+ "print_step": 100,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": "indic-asr",
11
+ "dashboard_logger": "wandb",
12
+ "log_model_step": null,
13
+ "save_step": 10000,
14
+ "save_n_checkpoints": 1,
15
+ "save_checkpoints": true,
16
+ "save_all_best": false,
17
+ "save_best_after": 10000,
18
+ "target_loss": "loss_1",
19
+ "print_eval": false,
20
+ "test_delay_epochs": 0,
21
+ "run_eval": true,
22
+ "distributed_backend": "nccl",
23
+ "distributed_url": "tcp://localhost:10006",
24
+ "mixed_precision": true,
25
+ "epochs": 5000,
26
+ "batch_size": 32,
27
+ "eval_batch_size": 32,
28
+ "grad_clip": [
29
+ 5,
30
+ 5
31
+ ],
32
+ "scheduler_after_epoch": true,
33
+ "lr": 0.0001,
34
+ "optimizer": "AdamW",
35
+ "optimizer_params": {
36
+ "betas": [
37
+ 0.8,
38
+ 0.99
39
+ ],
40
+ "weight_decay": 0.0
41
+ },
42
+ "lr_scheduler": null,
43
+ "lr_scheduler_params": null,
44
+ "use_grad_scaler": false,
45
+ "cudnn_enable": true,
46
+ "cudnn_deterministic": false,
47
+ "cudnn_benchmark": false,
48
+ "training_seed": 54321,
49
+ "model": "hifigan",
50
+ "num_loader_workers": 8,
51
+ "num_eval_loader_workers": 8,
52
+ "use_noise_augment": true,
53
+ "audio": {
54
+ "fft_size": 1024,
55
+ "win_length": 1024,
56
+ "hop_length": 256,
57
+ "frame_shift_ms": null,
58
+ "frame_length_ms": null,
59
+ "stft_pad_mode": "reflect",
60
+ "sample_rate": 22050,
61
+ "resample": false,
62
+ "preemphasis": 0.0,
63
+ "ref_level_db": 20,
64
+ "do_sound_norm": false,
65
+ "log_func": "np.log",
66
+ "do_trim_silence": true,
67
+ "trim_db": 60.0,
68
+ "do_rms_norm": false,
69
+ "db_level": null,
70
+ "power": 1.5,
71
+ "griffin_lim_iters": 60,
72
+ "num_mels": 80,
73
+ "mel_fmin": 0.0,
74
+ "mel_fmax": 8000,
75
+ "spec_gain": 1.0,
76
+ "do_amp_to_db_linear": true,
77
+ "do_amp_to_db_mel": true,
78
+ "pitch_fmax": 640.0,
79
+ "pitch_fmin": 0.0,
80
+ "signal_norm": false,
81
+ "min_level_db": -100,
82
+ "symmetric_norm": true,
83
+ "max_norm": 4.0,
84
+ "clip_norm": true,
85
+ "stats_path": null
86
+ },
87
+ "eval_split_size": 10,
88
+ "data_path": "../../datasets/indictts/gu",
89
+ "feature_path": null,
90
+ "seq_len": 8192,
91
+ "pad_short": 2000,
92
+ "conv_pad": 0,
93
+ "use_cache": false,
94
+ "wd": 1e-06,
95
+ "use_stft_loss": false,
96
+ "use_subband_stft_loss": false,
97
+ "use_mse_gan_loss": true,
98
+ "use_hinge_gan_loss": false,
99
+ "use_feat_match_loss": true,
100
+ "use_l1_spec_loss": true,
101
+ "stft_loss_weight": 0,
102
+ "subband_stft_loss_weight": 0,
103
+ "mse_G_loss_weight": 1,
104
+ "hinge_G_loss_weight": 0,
105
+ "feat_match_loss_weight": 108,
106
+ "l1_spec_loss_weight": 45,
107
+ "stft_loss_params": {
108
+ "n_ffts": [
109
+ 1024,
110
+ 2048,
111
+ 512
112
+ ],
113
+ "hop_lengths": [
114
+ 120,
115
+ 240,
116
+ 50
117
+ ],
118
+ "win_lengths": [
119
+ 600,
120
+ 1200,
121
+ 240
122
+ ]
123
+ },
124
+ "l1_spec_loss_params": {
125
+ "use_mel": true,
126
+ "sample_rate": 22050,
127
+ "n_fft": 1024,
128
+ "hop_length": 256,
129
+ "win_length": 1024,
130
+ "n_mels": 80,
131
+ "mel_fmin": 0.0,
132
+ "mel_fmax": null
133
+ },
134
+ "lr_gen": 0.0001,
135
+ "lr_disc": 0.0001,
136
+ "lr_scheduler_gen": "ExponentialLR",
137
+ "lr_scheduler_gen_params": {
138
+ "gamma": 0.999,
139
+ "last_epoch": -1
140
+ },
141
+ "lr_scheduler_disc": "ExponentialLR",
142
+ "lr_scheduler_disc_params": {
143
+ "gamma": 0.999,
144
+ "last_epoch": -1
145
+ },
146
+ "use_pqmf": false,
147
+ "diff_samples_for_G_and_D": false,
148
+ "discriminator_model": "hifigan_discriminator",
149
+ "generator_model": "hifigan_generator",
150
+ "generator_model_params": {
151
+ "upsample_factors": [
152
+ 8,
153
+ 8,
154
+ 2,
155
+ 2
156
+ ],
157
+ "upsample_kernel_sizes": [
158
+ 16,
159
+ 16,
160
+ 4,
161
+ 4
162
+ ],
163
+ "upsample_initial_channel": 512,
164
+ "resblock_kernel_sizes": [
165
+ 3,
166
+ 7,
167
+ 11
168
+ ],
169
+ "resblock_dilation_sizes": [
170
+ [
171
+ 1,
172
+ 3,
173
+ 5
174
+ ],
175
+ [
176
+ 1,
177
+ 3,
178
+ 5
179
+ ],
180
+ [
181
+ 1,
182
+ 3,
183
+ 5
184
+ ]
185
+ ],
186
+ "resblock_type": "1"
187
+ },
188
+ "github_branch": "* main"
189
+ }
hi_fastpitch_best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0f8f98e3d9eaf1cf842821087de31889df67635e77dd74e4a967fd5b3ada8cd
3
+ size 637455449
hi_fastpitch_config.json ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "output_indic_fastpitch/hi",
3
+ "logger_uri": null,
4
+ "run_name": "hi_fastpitch_indictts_all_align_off",
5
+ "project_name": "indic-fastpitch-stage2",
6
+ "run_description": "align_off",
7
+ "print_step": 100,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": "indic-asr",
11
+ "dashboard_logger": "wandb",
12
+ "log_model_step": 10000,
13
+ "save_step": 10000,
14
+ "save_n_checkpoints": 1,
15
+ "save_checkpoints": true,
16
+ "save_all_best": false,
17
+ "save_best_after": 10000,
18
+ "target_loss": null,
19
+ "print_eval": false,
20
+ "test_delay_epochs": 0,
21
+ "run_eval": true,
22
+ "distributed_backend": "nccl",
23
+ "distributed_url": "tcp://localhost:54321",
24
+ "mixed_precision": true,
25
+ "epochs": 1000,
26
+ "batch_size": 32,
27
+ "eval_batch_size": 32,
28
+ "grad_clip": 5.0,
29
+ "scheduler_after_epoch": true,
30
+ "lr": 0.0001,
31
+ "optimizer": "Adam",
32
+ "optimizer_params": {
33
+ "betas": [
34
+ 0.9,
35
+ 0.998
36
+ ],
37
+ "weight_decay": 1e-06
38
+ },
39
+ "lr_scheduler": "NoamLR",
40
+ "lr_scheduler_params": {
41
+ "warmup_steps": 4000
42
+ },
43
+ "lr_scheduler_aligner": "NoamLR",
44
+ "lr_scheduler_aligner_params": {
45
+ "warmup_steps": 4000
46
+ },
47
+ "use_grad_scaler": false,
48
+ "cudnn_enable": true,
49
+ "cudnn_deterministic": false,
50
+ "cudnn_benchmark": false,
51
+ "training_seed": 54321,
52
+ "model": "fast_pitch",
53
+ "num_loader_workers": 0,
54
+ "num_eval_loader_workers": 0,
55
+ "use_noise_augment": false,
56
+ "audio": {
57
+ "fft_size": 1024,
58
+ "win_length": 1024,
59
+ "hop_length": 256,
60
+ "frame_shift_ms": null,
61
+ "frame_length_ms": null,
62
+ "stft_pad_mode": "reflect",
63
+ "sample_rate": 22050,
64
+ "resample": false,
65
+ "preemphasis": 0.0,
66
+ "ref_level_db": 20,
67
+ "do_sound_norm": false,
68
+ "log_func": "np.log",
69
+ "do_trim_silence": true,
70
+ "trim_db": 60.0,
71
+ "do_rms_norm": false,
72
+ "db_level": null,
73
+ "power": 1.5,
74
+ "griffin_lim_iters": 60,
75
+ "num_mels": 80,
76
+ "mel_fmin": 0.0,
77
+ "mel_fmax": 8000,
78
+ "spec_gain": 1.0,
79
+ "do_amp_to_db_linear": true,
80
+ "do_amp_to_db_mel": true,
81
+ "pitch_fmax": 640.0,
82
+ "pitch_fmin": 0.0,
83
+ "signal_norm": false,
84
+ "min_level_db": -100,
85
+ "symmetric_norm": true,
86
+ "max_norm": 4.0,
87
+ "clip_norm": true,
88
+ "stats_path": null
89
+ },
90
+ "use_phonemes": false,
91
+ "phonemizer": null,
92
+ "phoneme_language": "en-us",
93
+ "compute_input_seq_cache": false,
94
+ "text_cleaner": "multilingual_cleaners",
95
+ "enable_eos_bos_chars": false,
96
+ "test_sentences_file": "",
97
+ "phoneme_cache_path": "output_indic_fastpitch/hi/phoneme_cache",
98
+ "characters": {
99
+ "characters_class": "TTS.tts.models.vits.VitsCharacters",
100
+ "vocab_dict": null,
101
+ "pad": "<PAD>",
102
+ "eos": "<EOS>",
103
+ "bos": "<BOS>",
104
+ "blank": "<BLNK>",
105
+ "characters": " !,-.28:;?\u00a0\u0901\u0902\u0903\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u090f\u0910\u0911\u0913\u0914\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0931\u0932\u0933\u0935\u0936\u0937\u0938\u0939\u093c\u093e\u093f\u0940\u0941\u0942\u0943\u0945\u0947\u0948\u0949\u094b\u094c\u094d\u0958\u0959\u095a\u095b\u095c\u095d\u095e\u0960\u200d\u200e\u2013",
106
+ "punctuations": "!\u00a1'(),-.:;\u00bf? ",
107
+ "phonemes": null,
108
+ "is_unique": true,
109
+ "is_sorted": true
110
+ },
111
+ "add_blank": false,
112
+ "batch_group_size": 0,
113
+ "loss_masking": null,
114
+ "sort_by_audio_len": true,
115
+ "min_audio_len": 1,
116
+ "max_audio_len": 441000,
117
+ "min_text_len": 1,
118
+ "max_text_len": 400,
119
+ "compute_f0": true,
120
+ "compute_linear_spec": false,
121
+ "precompute_num_workers": 0,
122
+ "start_by_longest": false,
123
+ "datasets": [
124
+ {
125
+ "name": "indictts",
126
+ "path": "/home/ttsteam/datasets/indictts/hi",
127
+ "meta_file_train": "metadata_train.csv",
128
+ "ignored_speakers": null,
129
+ "language": "hi",
130
+ "meta_file_val": "metadata_test.csv",
131
+ "meta_file_attn_mask": ""
132
+ }
133
+ ],
134
+ "test_sentences": [
135
+ "\u092c\u093f\u0939\u093e\u0930, \u0930\u093e\u091c\u0938\u094d\u0925\u093e\u0928 \u0914\u0930 \u0909\u0924\u094d\u0924\u0930 \u092a\u094d\u0930\u0926\u0947\u0936 \u0938\u0947 \u0932\u0947\u0915\u0930 \u0939\u0930\u093f\u092f\u093e\u0923\u093e, \u092e\u0927\u094d\u092f \u092a\u094d\u0930\u0926\u0947\u0936 \u090f\u0935\u0902 \u0909\u0924\u094d\u0924\u0930\u093e\u0916\u0902\u0921 \u092e\u0947\u0902 \u0938\u0947\u0928\u093e \u092e\u0947\u0902 \u092d\u0930\u094d\u0924\u0940 \u0938\u0947 \u091c\u0941\u0921\u093c\u0940 '\u0905\u0917\u094d\u0928\u093f\u092a\u0925 \u0938\u094d\u0915\u0940\u092e' \u0915\u093e \u0935\u093f\u0930\u094b\u0927 \u091c\u093e\u0930\u0940 \u0939\u0948.",
136
+ "\u0938\u0902\u092f\u0941\u0915\u094d\u0924 \u0905\u0930\u092c \u0905\u092e\u0940\u0930\u093e\u0924 \u092f\u093e\u0928\u0940 \u092f\u0942\u090f\u0908 \u0928\u0947 \u092c\u0941\u0927\u0935\u093e\u0930 \u0915\u094b \u090f\u0915 \u092b\u093c\u0948\u0938\u0932\u093e \u0932\u093f\u092f\u093e \u0915\u093f \u0905\u0917\u0932\u0947 \u091a\u093e\u0930 \u092e\u0939\u0940\u0928\u094b\u0902 \u0924\u0915 \u0935\u094b \u092d\u093e\u0930\u0924 \u0938\u0947 \u0916\u093c\u0930\u0940\u0926\u093e \u0939\u0941\u0906 \u0917\u0947\u0939\u0942\u0901 \u0915\u094b \u0915\u093f\u0938\u0940 \u0914\u0930 \u0915\u094b \u0928\u0939\u0940\u0902 \u092c\u0947\u091a\u0947\u0917\u093e."
137
+ ],
138
+ "eval_split_max_size": null,
139
+ "eval_split_size": 0.01,
140
+ "use_speaker_weighted_sampler": false,
141
+ "speaker_weighted_sampler_alpha": 1.0,
142
+ "use_language_weighted_sampler": false,
143
+ "language_weighted_sampler_alpha": 1.0,
144
+ "use_length_weighted_sampler": false,
145
+ "length_weighted_sampler_alpha": 1.0,
146
+ "base_model": "forward_tts",
147
+ "model_args": {
148
+ "num_chars": 101,
149
+ "out_channels": 80,
150
+ "hidden_channels": 512,
151
+ "use_aligner": true,
152
+ "use_pitch": true,
153
+ "pitch_predictor_hidden_channels": 256,
154
+ "pitch_predictor_kernel_size": 3,
155
+ "pitch_predictor_dropout_p": 0.1,
156
+ "pitch_embedding_kernel_size": 3,
157
+ "duration_predictor_hidden_channels": 256,
158
+ "duration_predictor_kernel_size": 3,
159
+ "duration_predictor_dropout_p": 0.1,
160
+ "positional_encoding": true,
161
+ "poisitonal_encoding_use_scale": true,
162
+ "length_scale": 1,
163
+ "encoder_type": "fftransformer",
164
+ "encoder_params": {
165
+ "hidden_channels_ffn": 1024,
166
+ "num_heads": 1,
167
+ "num_layers": 6,
168
+ "dropout_p": 0.1
169
+ },
170
+ "decoder_type": "fftransformer",
171
+ "decoder_params": {
172
+ "hidden_channels_ffn": 1024,
173
+ "num_heads": 1,
174
+ "num_layers": 6,
175
+ "dropout_p": 0.1
176
+ },
177
+ "detach_duration_predictor": false,
178
+ "max_duration": 75,
179
+ "num_speakers": 2,
180
+ "use_speaker_embedding": true,
181
+ "speakers_file": "models/v1/hi/fastpitch/speakers.pth",
182
+ "use_d_vector_file": false,
183
+ "d_vector_dim": 512,
184
+ "d_vector_file": null,
185
+ "use_speaker_encoder_as_loss": false,
186
+ "speaker_encoder_config_path": "",
187
+ "speaker_encoder_model_path": "",
188
+ "vocoder_path": null,
189
+ "vocoder_config_path": null,
190
+ "use_separate_optimizers": false
191
+ },
192
+ "return_wav": false,
193
+ "num_speakers": 2,
194
+ "speakers_file": "models/v1/hi/fastpitch/speakers.pth",
195
+ "use_speaker_embedding": true,
196
+ "use_d_vector_file": false,
197
+ "d_vector_file": "",
198
+ "d_vector_dim": 512,
199
+ "spec_loss_type": "mse",
200
+ "duration_loss_type": "mse",
201
+ "use_ssim_loss": false,
202
+ "ssim_loss_alpha": 1.0,
203
+ "spec_loss_alpha": 1.0,
204
+ "aligner_loss_alpha": 1.0,
205
+ "pitch_loss_alpha": 0.1,
206
+ "dur_loss_alpha": 0.1,
207
+ "binary_align_loss_alpha": 0.1,
208
+ "spk_encoder_loss_alpha": 0.1,
209
+ "binary_loss_warmup_epochs": 150,
210
+ "aligner_epochs": 0,
211
+ "min_seq_len": 13,
212
+ "max_seq_len": 500000,
213
+ "r": 1,
214
+ "f0_cache_path": "output_indic_fastpitch/hi/f0_cache"
215
+ }
hi_fastpitch_speakers.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f665e358b34b232fb27f7c8cd3968fcd47784a7be065ae127f611c33ee809bea
3
+ size 431
hi_hifigan_best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66c11563f376ba9ff247d873f0f26acb9886ba8db8f0db8c20e4ee4770b3cb46
3
+ size 1016383548
hi_hifigan_config.json ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "indic_vocoders",
3
+ "logger_uri": null,
4
+ "run_name": "hi_hifigan_all",
5
+ "project_name": "indic-vocoders",
6
+ "run_description": "None",
7
+ "print_step": 100,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": "indic-asr",
11
+ "dashboard_logger": "wandb",
12
+ "log_model_step": null,
13
+ "save_step": 10000,
14
+ "save_n_checkpoints": 1,
15
+ "save_checkpoints": true,
16
+ "save_all_best": false,
17
+ "save_best_after": 10000,
18
+ "target_loss": "loss_1",
19
+ "print_eval": false,
20
+ "test_delay_epochs": 0,
21
+ "run_eval": true,
22
+ "distributed_backend": "nccl",
23
+ "distributed_url": "tcp://localhost:10007",
24
+ "mixed_precision": true,
25
+ "epochs": 5000,
26
+ "batch_size": 32,
27
+ "eval_batch_size": 32,
28
+ "grad_clip": [
29
+ 5,
30
+ 5
31
+ ],
32
+ "scheduler_after_epoch": true,
33
+ "lr": 0.0001,
34
+ "optimizer": "AdamW",
35
+ "optimizer_params": {
36
+ "betas": [
37
+ 0.8,
38
+ 0.99
39
+ ],
40
+ "weight_decay": 0.0
41
+ },
42
+ "lr_scheduler": null,
43
+ "lr_scheduler_params": null,
44
+ "use_grad_scaler": false,
45
+ "cudnn_enable": true,
46
+ "cudnn_deterministic": false,
47
+ "cudnn_benchmark": false,
48
+ "training_seed": 54321,
49
+ "model": "hifigan",
50
+ "num_loader_workers": 8,
51
+ "num_eval_loader_workers": 8,
52
+ "use_noise_augment": true,
53
+ "audio": {
54
+ "fft_size": 1024,
55
+ "win_length": 1024,
56
+ "hop_length": 256,
57
+ "frame_shift_ms": null,
58
+ "frame_length_ms": null,
59
+ "stft_pad_mode": "reflect",
60
+ "sample_rate": 22050,
61
+ "resample": false,
62
+ "preemphasis": 0.0,
63
+ "ref_level_db": 20,
64
+ "do_sound_norm": false,
65
+ "log_func": "np.log",
66
+ "do_trim_silence": true,
67
+ "trim_db": 60.0,
68
+ "do_rms_norm": false,
69
+ "db_level": null,
70
+ "power": 1.5,
71
+ "griffin_lim_iters": 60,
72
+ "num_mels": 80,
73
+ "mel_fmin": 0.0,
74
+ "mel_fmax": 8000,
75
+ "spec_gain": 1.0,
76
+ "do_amp_to_db_linear": true,
77
+ "do_amp_to_db_mel": true,
78
+ "pitch_fmax": 640.0,
79
+ "pitch_fmin": 0.0,
80
+ "signal_norm": false,
81
+ "min_level_db": -100,
82
+ "symmetric_norm": true,
83
+ "max_norm": 4.0,
84
+ "clip_norm": true,
85
+ "stats_path": null
86
+ },
87
+ "eval_split_size": 10,
88
+ "data_path": "../../datasets/indictts/hi",
89
+ "feature_path": null,
90
+ "seq_len": 8192,
91
+ "pad_short": 2000,
92
+ "conv_pad": 0,
93
+ "use_cache": false,
94
+ "wd": 1e-06,
95
+ "use_stft_loss": false,
96
+ "use_subband_stft_loss": false,
97
+ "use_mse_gan_loss": true,
98
+ "use_hinge_gan_loss": false,
99
+ "use_feat_match_loss": true,
100
+ "use_l1_spec_loss": true,
101
+ "stft_loss_weight": 0,
102
+ "subband_stft_loss_weight": 0,
103
+ "mse_G_loss_weight": 1,
104
+ "hinge_G_loss_weight": 0,
105
+ "feat_match_loss_weight": 108,
106
+ "l1_spec_loss_weight": 45,
107
+ "stft_loss_params": {
108
+ "n_ffts": [
109
+ 1024,
110
+ 2048,
111
+ 512
112
+ ],
113
+ "hop_lengths": [
114
+ 120,
115
+ 240,
116
+ 50
117
+ ],
118
+ "win_lengths": [
119
+ 600,
120
+ 1200,
121
+ 240
122
+ ]
123
+ },
124
+ "l1_spec_loss_params": {
125
+ "use_mel": true,
126
+ "sample_rate": 22050,
127
+ "n_fft": 1024,
128
+ "hop_length": 256,
129
+ "win_length": 1024,
130
+ "n_mels": 80,
131
+ "mel_fmin": 0.0,
132
+ "mel_fmax": null
133
+ },
134
+ "lr_gen": 0.0001,
135
+ "lr_disc": 0.0001,
136
+ "lr_scheduler_gen": "ExponentialLR",
137
+ "lr_scheduler_gen_params": {
138
+ "gamma": 0.999,
139
+ "last_epoch": -1
140
+ },
141
+ "lr_scheduler_disc": "ExponentialLR",
142
+ "lr_scheduler_disc_params": {
143
+ "gamma": 0.999,
144
+ "last_epoch": -1
145
+ },
146
+ "use_pqmf": false,
147
+ "diff_samples_for_G_and_D": false,
148
+ "discriminator_model": "hifigan_discriminator",
149
+ "generator_model": "hifigan_generator",
150
+ "generator_model_params": {
151
+ "upsample_factors": [
152
+ 8,
153
+ 8,
154
+ 2,
155
+ 2
156
+ ],
157
+ "upsample_kernel_sizes": [
158
+ 16,
159
+ 16,
160
+ 4,
161
+ 4
162
+ ],
163
+ "upsample_initial_channel": 512,
164
+ "resblock_kernel_sizes": [
165
+ 3,
166
+ 7,
167
+ 11
168
+ ],
169
+ "resblock_dilation_sizes": [
170
+ [
171
+ 1,
172
+ 3,
173
+ 5
174
+ ],
175
+ [
176
+ 1,
177
+ 3,
178
+ 5
179
+ ],
180
+ [
181
+ 1,
182
+ 3,
183
+ 5
184
+ ]
185
+ ],
186
+ "resblock_type": "1"
187
+ },
188
+ "github_branch": "* main"
189
+ }
kn_fastpitch_best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:086ccd302bb73900a66b98ccd3df3b24175bf135eb80ecd966b043bb3f342841
3
+ size 637430893
kn_fastpitch_config.json ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "output_indic_fastpitch/kn",
3
+ "logger_uri": null,
4
+ "run_name": "kn_fastpitch_indictts_all_align_off",
5
+ "project_name": "indic-fastpitch-stage2",
6
+ "run_description": "align_off",
7
+ "print_step": 100,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": "indic-asr",
11
+ "dashboard_logger": "wandb",
12
+ "log_model_step": 10000,
13
+ "save_step": 10000,
14
+ "save_n_checkpoints": 1,
15
+ "save_checkpoints": true,
16
+ "save_all_best": false,
17
+ "save_best_after": 10000,
18
+ "target_loss": null,
19
+ "print_eval": false,
20
+ "test_delay_epochs": 0,
21
+ "run_eval": true,
22
+ "distributed_backend": "nccl",
23
+ "distributed_url": "tcp://localhost:54321",
24
+ "mixed_precision": true,
25
+ "epochs": 1000,
26
+ "batch_size": 32,
27
+ "eval_batch_size": 32,
28
+ "grad_clip": 5.0,
29
+ "scheduler_after_epoch": true,
30
+ "lr": 0.0001,
31
+ "optimizer": "Adam",
32
+ "optimizer_params": {
33
+ "betas": [
34
+ 0.9,
35
+ 0.998
36
+ ],
37
+ "weight_decay": 1e-06
38
+ },
39
+ "lr_scheduler": "NoamLR",
40
+ "lr_scheduler_params": {
41
+ "warmup_steps": 4000
42
+ },
43
+ "use_grad_scaler": false,
44
+ "cudnn_enable": true,
45
+ "cudnn_deterministic": false,
46
+ "cudnn_benchmark": false,
47
+ "training_seed": 54321,
48
+ "model": "fast_pitch",
49
+ "num_loader_workers": 0,
50
+ "num_eval_loader_workers": 0,
51
+ "use_noise_augment": false,
52
+ "audio": {
53
+ "fft_size": 1024,
54
+ "win_length": 1024,
55
+ "hop_length": 256,
56
+ "frame_shift_ms": null,
57
+ "frame_length_ms": null,
58
+ "stft_pad_mode": "reflect",
59
+ "sample_rate": 22050,
60
+ "resample": false,
61
+ "preemphasis": 0.0,
62
+ "ref_level_db": 20,
63
+ "do_sound_norm": false,
64
+ "log_func": "np.log",
65
+ "do_trim_silence": true,
66
+ "trim_db": 60.0,
67
+ "do_rms_norm": false,
68
+ "db_level": null,
69
+ "power": 1.5,
70
+ "griffin_lim_iters": 60,
71
+ "num_mels": 80,
72
+ "mel_fmin": 0.0,
73
+ "mel_fmax": 8000,
74
+ "spec_gain": 1.0,
75
+ "do_amp_to_db_linear": true,
76
+ "do_amp_to_db_mel": true,
77
+ "pitch_fmax": 640.0,
78
+ "pitch_fmin": 0.0,
79
+ "signal_norm": false,
80
+ "min_level_db": -100,
81
+ "symmetric_norm": true,
82
+ "max_norm": 4.0,
83
+ "clip_norm": true,
84
+ "stats_path": null
85
+ },
86
+ "use_phonemes": false,
87
+ "phonemizer": null,
88
+ "phoneme_language": "en-us",
89
+ "compute_input_seq_cache": false,
90
+ "text_cleaner": "multilingual_cleaners",
91
+ "enable_eos_bos_chars": false,
92
+ "test_sentences_file": "",
93
+ "phoneme_cache_path": "output_indic_fastpitch/kn/phoneme_cache",
94
+ "characters": {
95
+ "characters_class": "TTS.tts.models.vits.VitsCharacters",
96
+ "vocab_dict": null,
97
+ "pad": "<PAD>",
98
+ "eos": "<EOS>",
99
+ "bos": "<BOS>",
100
+ "blank": "<BLNK>",
101
+ "characters": " !$'+,-.:;?\u0c82\u0c83\u0c85\u0c86\u0c87\u0c88\u0c89\u0c8a\u0c8b\u0c8e\u0c8f\u0c90\u0c92\u0c93\u0c94\u0c95\u0c96\u0c97\u0c98\u0c99\u0c9a\u0c9b\u0c9c\u0c9d\u0c9e\u0c9f\u0ca0\u0ca1\u0ca2\u0ca3\u0ca4\u0ca5\u0ca6\u0ca7\u0ca8\u0caa\u0cab\u0cac\u0cad\u0cae\u0caf\u0cb0\u0cb2\u0cb3\u0cb5\u0cb6\u0cb7\u0cb8\u0cb9\u0cbe\u0cbf\u0cc0\u0cc1\u0cc2\u0cc3\u0cc6\u0cc7\u0cc8\u0cca\u0ccb\u0ccc\u0ccd\u0cd5\u0cd6\u0ce6\u0ce7\u0ce8\u0cef\u2008\u200b\u200c\u200d\u2013\u2018\u2019\u201c\u201d\u2026",
102
+ "punctuations": "!\u00a1'(),-.:;\u00bf? ",
103
+ "phonemes": null,
104
+ "is_unique": true,
105
+ "is_sorted": true
106
+ },
107
+ "add_blank": false,
108
+ "batch_group_size": 0,
109
+ "loss_masking": null,
110
+ "sort_by_audio_len": true,
111
+ "min_audio_len": 1,
112
+ "max_audio_len": 441000,
113
+ "min_text_len": 1,
114
+ "max_text_len": 400,
115
+ "compute_f0": true,
116
+ "compute_linear_spec": false,
117
+ "precompute_num_workers": 0,
118
+ "start_by_longest": false,
119
+ "datasets": [
120
+ {
121
+ "name": "indictts",
122
+ "path": "/home/ttsteam/datasets/indictts/kn",
123
+ "meta_file_train": "metadata_train.csv",
124
+ "ignored_speakers": null,
125
+ "language": "kn",
126
+ "meta_file_val": "metadata_test.csv",
127
+ "meta_file_attn_mask": ""
128
+ }
129
+ ],
130
+ "test_sentences": [
131
+ "\u0caf\u0cbe\u0cb5\u0cc1\u0ca6\u0cc1 \u0ca8\u0cbf\u0c9c \u0caf\u0cbe\u0cb5\u0cc1\u0ca6\u0cc1 \u0cb8\u0cc1\u0cb3\u0ccd\u0cb3\u0cc1 \u0c8e\u0ca8\u0ccd\u0ca8\u0cc1\u0cb5 \u0cac\u0c97\u0ccd\u0c97\u0cc6 \u0c9a\u0cbf\u0c82\u0ca4\u0cbf\u0cb8\u0cbf.",
132
+ "\u0cb6\u0c95\u0ccd\u0ca4\u0cbf \u0c87\u0ca6\u0ccd\u0ca6\u0cb0\u0cc6\u0ca8\u0ccd\u0ca8\u0cca\u0ca1\u0ca8\u0cc6 \u0c9c\u0c97\u0cb3\u0c95\u0ccd\u0c95\u0cc6 \u0cac\u0cbe"
133
+ ],
134
+ "eval_split_max_size": null,
135
+ "eval_split_size": 0.01,
136
+ "use_speaker_weighted_sampler": false,
137
+ "speaker_weighted_sampler_alpha": 1.0,
138
+ "use_language_weighted_sampler": false,
139
+ "language_weighted_sampler_alpha": 1.0,
140
+ "use_length_weighted_sampler": false,
141
+ "length_weighted_sampler_alpha": 1.0,
142
+ "base_model": "forward_tts",
143
+ "model_args": {
144
+ "num_chars": 104,
145
+ "out_channels": 80,
146
+ "hidden_channels": 512,
147
+ "use_aligner": true,
148
+ "use_pitch": true,
149
+ "pitch_predictor_hidden_channels": 256,
150
+ "pitch_predictor_kernel_size": 3,
151
+ "pitch_predictor_dropout_p": 0.1,
152
+ "pitch_embedding_kernel_size": 3,
153
+ "duration_predictor_hidden_channels": 256,
154
+ "duration_predictor_kernel_size": 3,
155
+ "duration_predictor_dropout_p": 0.1,
156
+ "positional_encoding": true,
157
+ "poisitonal_encoding_use_scale": true,
158
+ "length_scale": 1,
159
+ "encoder_type": "fftransformer",
160
+ "encoder_params": {
161
+ "hidden_channels_ffn": 1024,
162
+ "num_heads": 1,
163
+ "num_layers": 6,
164
+ "dropout_p": 0.1
165
+ },
166
+ "decoder_type": "fftransformer",
167
+ "decoder_params": {
168
+ "hidden_channels_ffn": 1024,
169
+ "num_heads": 1,
170
+ "num_layers": 6,
171
+ "dropout_p": 0.1
172
+ },
173
+ "detach_duration_predictor": false,
174
+ "max_duration": 75,
175
+ "num_speakers": 2,
176
+ "use_speaker_embedding": true,
177
+ "speakers_file": "models/v1/kn/fastpitch/speakers.pth",
178
+ "use_d_vector_file": false,
179
+ "d_vector_dim": 512,
180
+ "d_vector_file": null,
181
+ "use_speaker_encoder_as_loss": false,
182
+ "speaker_encoder_config_path": "",
183
+ "speaker_encoder_model_path": "",
184
+ "vocoder_path": null,
185
+ "vocoder_config_path": null
186
+ },
187
+ "return_wav": false,
188
+ "num_speakers": 2,
189
+ "speakers_file": "models/v1/kn/fastpitch/speakers.pth",
190
+ "use_speaker_embedding": true,
191
+ "use_d_vector_file": false,
192
+ "d_vector_file": "",
193
+ "d_vector_dim": 512,
194
+ "spec_loss_type": "mse",
195
+ "duration_loss_type": "mse",
196
+ "use_ssim_loss": false,
197
+ "ssim_loss_alpha": 1.0,
198
+ "spec_loss_alpha": 1.0,
199
+ "aligner_loss_alpha": 1.0,
200
+ "pitch_loss_alpha": 0.1,
201
+ "dur_loss_alpha": 0.1,
202
+ "binary_align_loss_alpha": 0.1,
203
+ "spk_encoder_loss_alpha": 0.1,
204
+ "binary_loss_warmup_epochs": 150,
205
+ "aligner_epochs": 0,
206
+ "min_seq_len": 13,
207
+ "max_seq_len": 500000,
208
+ "r": 1,
209
+ "f0_cache_path": "output_indic_fastpitch/kn/f0_cache"
210
+ }
kn_fastpitch_speakers.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f665e358b34b232fb27f7c8cd3968fcd47784a7be065ae127f611c33ee809bea
3
+ size 431
kn_hifigan_best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b08cecc8c44078d39be4864ea173629ed56c67ebd4ac8d286988e8dade379448
3
+ size 1016384316
kn_hifigan_config.json ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "indic_vocoders",
3
+ "logger_uri": null,
4
+ "run_name": "kn_hifigan_all",
5
+ "project_name": "indic-vocoders",
6
+ "run_description": "None",
7
+ "print_step": 100,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": "indic-asr",
11
+ "dashboard_logger": "wandb",
12
+ "log_model_step": null,
13
+ "save_step": 10000,
14
+ "save_n_checkpoints": 1,
15
+ "save_checkpoints": true,
16
+ "save_all_best": false,
17
+ "save_best_after": 10000,
18
+ "target_loss": "loss_1",
19
+ "print_eval": false,
20
+ "test_delay_epochs": 0,
21
+ "run_eval": true,
22
+ "distributed_backend": "nccl",
23
+ "distributed_url": "tcp://localhost:10007",
24
+ "mixed_precision": true,
25
+ "epochs": 5000,
26
+ "batch_size": 32,
27
+ "eval_batch_size": 32,
28
+ "grad_clip": [
29
+ 5,
30
+ 5
31
+ ],
32
+ "scheduler_after_epoch": true,
33
+ "lr": 0.0001,
34
+ "optimizer": "AdamW",
35
+ "optimizer_params": {
36
+ "betas": [
37
+ 0.8,
38
+ 0.99
39
+ ],
40
+ "weight_decay": 0.0
41
+ },
42
+ "lr_scheduler": null,
43
+ "lr_scheduler_params": null,
44
+ "use_grad_scaler": false,
45
+ "cudnn_enable": true,
46
+ "cudnn_deterministic": false,
47
+ "cudnn_benchmark": false,
48
+ "training_seed": 54321,
49
+ "model": "hifigan",
50
+ "num_loader_workers": 8,
51
+ "num_eval_loader_workers": 8,
52
+ "use_noise_augment": true,
53
+ "audio": {
54
+ "fft_size": 1024,
55
+ "win_length": 1024,
56
+ "hop_length": 256,
57
+ "frame_shift_ms": null,
58
+ "frame_length_ms": null,
59
+ "stft_pad_mode": "reflect",
60
+ "sample_rate": 22050,
61
+ "resample": false,
62
+ "preemphasis": 0.0,
63
+ "ref_level_db": 20,
64
+ "do_sound_norm": false,
65
+ "log_func": "np.log",
66
+ "do_trim_silence": true,
67
+ "trim_db": 60.0,
68
+ "do_rms_norm": false,
69
+ "db_level": null,
70
+ "power": 1.5,
71
+ "griffin_lim_iters": 60,
72
+ "num_mels": 80,
73
+ "mel_fmin": 0.0,
74
+ "mel_fmax": 8000,
75
+ "spec_gain": 1.0,
76
+ "do_amp_to_db_linear": true,
77
+ "do_amp_to_db_mel": true,
78
+ "pitch_fmax": 640.0,
79
+ "pitch_fmin": 0.0,
80
+ "signal_norm": false,
81
+ "min_level_db": -100,
82
+ "symmetric_norm": true,
83
+ "max_norm": 4.0,
84
+ "clip_norm": true,
85
+ "stats_path": null
86
+ },
87
+ "eval_split_size": 10,
88
+ "data_path": "../../datasets/indictts/kn",
89
+ "feature_path": null,
90
+ "seq_len": 8192,
91
+ "pad_short": 2000,
92
+ "conv_pad": 0,
93
+ "use_cache": false,
94
+ "wd": 1e-06,
95
+ "use_stft_loss": false,
96
+ "use_subband_stft_loss": false,
97
+ "use_mse_gan_loss": true,
98
+ "use_hinge_gan_loss": false,
99
+ "use_feat_match_loss": true,
100
+ "use_l1_spec_loss": true,
101
+ "stft_loss_weight": 0,
102
+ "subband_stft_loss_weight": 0,
103
+ "mse_G_loss_weight": 1,
104
+ "hinge_G_loss_weight": 0,
105
+ "feat_match_loss_weight": 108,
106
+ "l1_spec_loss_weight": 45,
107
+ "stft_loss_params": {
108
+ "n_ffts": [
109
+ 1024,
110
+ 2048,
111
+ 512
112
+ ],
113
+ "hop_lengths": [
114
+ 120,
115
+ 240,
116
+ 50
117
+ ],
118
+ "win_lengths": [
119
+ 600,
120
+ 1200,
121
+ 240
122
+ ]
123
+ },
124
+ "l1_spec_loss_params": {
125
+ "use_mel": true,
126
+ "sample_rate": 22050,
127
+ "n_fft": 1024,
128
+ "hop_length": 256,
129
+ "win_length": 1024,
130
+ "n_mels": 80,
131
+ "mel_fmin": 0.0,
132
+ "mel_fmax": null
133
+ },
134
+ "lr_gen": 0.0001,
135
+ "lr_disc": 0.0001,
136
+ "lr_scheduler_gen": "ExponentialLR",
137
+ "lr_scheduler_gen_params": {
138
+ "gamma": 0.999,
139
+ "last_epoch": -1
140
+ },
141
+ "lr_scheduler_disc": "ExponentialLR",
142
+ "lr_scheduler_disc_params": {
143
+ "gamma": 0.999,
144
+ "last_epoch": -1
145
+ },
146
+ "use_pqmf": false,
147
+ "diff_samples_for_G_and_D": false,
148
+ "discriminator_model": "hifigan_discriminator",
149
+ "generator_model": "hifigan_generator",
150
+ "generator_model_params": {
151
+ "upsample_factors": [
152
+ 8,
153
+ 8,
154
+ 2,
155
+ 2
156
+ ],
157
+ "upsample_kernel_sizes": [
158
+ 16,
159
+ 16,
160
+ 4,
161
+ 4
162
+ ],
163
+ "upsample_initial_channel": 512,
164
+ "resblock_kernel_sizes": [
165
+ 3,
166
+ 7,
167
+ 11
168
+ ],
169
+ "resblock_dilation_sizes": [
170
+ [
171
+ 1,
172
+ 3,
173
+ 5
174
+ ],
175
+ [
176
+ 1,
177
+ 3,
178
+ 5
179
+ ],
180
+ [
181
+ 1,
182
+ 3,
183
+ 5
184
+ ]
185
+ ],
186
+ "resblock_type": "1"
187
+ },
188
+ "github_branch": "* main"
189
+ }
ml_fastpitch_best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:669f75b7a9f64f07fb516e62ff6c2e39ecf71e8dff9d52bd0993960eb633145a
3
+ size 637332589
ml_fastpitch_config.json ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "output_indic_fastpitch/ml",
3
+ "logger_uri": null,
4
+ "run_name": "ml_fastpitch_indictts_all_align_off",
5
+ "project_name": "indic-fastpitch-stage2",
6
+ "run_description": "align_off",
7
+ "print_step": 100,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": "indic-asr",
11
+ "dashboard_logger": "wandb",
12
+ "log_model_step": 10000,
13
+ "save_step": 10000,
14
+ "save_n_checkpoints": 1,
15
+ "save_checkpoints": true,
16
+ "save_all_best": false,
17
+ "save_best_after": 10000,
18
+ "target_loss": null,
19
+ "print_eval": false,
20
+ "test_delay_epochs": 0,
21
+ "run_eval": true,
22
+ "distributed_backend": "nccl",
23
+ "distributed_url": "tcp://localhost:54321",
24
+ "mixed_precision": true,
25
+ "epochs": 1000,
26
+ "batch_size": 32,
27
+ "eval_batch_size": 32,
28
+ "grad_clip": 5.0,
29
+ "scheduler_after_epoch": true,
30
+ "lr": 0.0001,
31
+ "optimizer": "Adam",
32
+ "optimizer_params": {
33
+ "betas": [
34
+ 0.9,
35
+ 0.998
36
+ ],
37
+ "weight_decay": 1e-06
38
+ },
39
+ "lr_scheduler": "NoamLR",
40
+ "lr_scheduler_params": {
41
+ "warmup_steps": 4000
42
+ },
43
+ "use_grad_scaler": false,
44
+ "cudnn_enable": true,
45
+ "cudnn_deterministic": false,
46
+ "cudnn_benchmark": false,
47
+ "training_seed": 54321,
48
+ "model": "fast_pitch",
49
+ "num_loader_workers": 0,
50
+ "num_eval_loader_workers": 0,
51
+ "use_noise_augment": false,
52
+ "audio": {
53
+ "fft_size": 1024,
54
+ "win_length": 1024,
55
+ "hop_length": 256,
56
+ "frame_shift_ms": null,
57
+ "frame_length_ms": null,
58
+ "stft_pad_mode": "reflect",
59
+ "sample_rate": 22050,
60
+ "resample": false,
61
+ "preemphasis": 0.0,
62
+ "ref_level_db": 20,
63
+ "do_sound_norm": false,
64
+ "log_func": "np.log",
65
+ "do_trim_silence": true,
66
+ "trim_db": 60.0,
67
+ "do_rms_norm": false,
68
+ "db_level": null,
69
+ "power": 1.5,
70
+ "griffin_lim_iters": 60,
71
+ "num_mels": 80,
72
+ "mel_fmin": 0.0,
73
+ "mel_fmax": 8000,
74
+ "spec_gain": 1.0,
75
+ "do_amp_to_db_linear": true,
76
+ "do_amp_to_db_mel": true,
77
+ "pitch_fmax": 640.0,
78
+ "pitch_fmin": 0.0,
79
+ "signal_norm": false,
80
+ "min_level_db": -100,
81
+ "symmetric_norm": true,
82
+ "max_norm": 4.0,
83
+ "clip_norm": true,
84
+ "stats_path": null
85
+ },
86
+ "use_phonemes": false,
87
+ "phonemizer": null,
88
+ "phoneme_language": "en-us",
89
+ "compute_input_seq_cache": false,
90
+ "text_cleaner": "multilingual_cleaners",
91
+ "enable_eos_bos_chars": false,
92
+ "test_sentences_file": "",
93
+ "phoneme_cache_path": "output_indic_fastpitch/ml/phoneme_cache",
94
+ "characters": {
95
+ "characters_class": "TTS.tts.models.vits.VitsCharacters",
96
+ "vocab_dict": null,
97
+ "pad": "<PAD>",
98
+ "eos": "<EOS>",
99
+ "bos": "<BOS>",
100
+ "blank": "<BLNK>",
101
+ "characters": " ,?\u0d02\u0d03\u0d05\u0d06\u0d07\u0d08\u0d09\u0d0a\u0d0b\u0d0e\u0d0f\u0d10\u0d12\u0d13\u0d14\u0d15\u0d16\u0d17\u0d18\u0d19\u0d1a\u0d1b\u0d1c\u0d1d\u0d1e\u0d1f\u0d20\u0d21\u0d22\u0d23\u0d24\u0d25\u0d26\u0d27\u0d28\u0d2a\u0d2b\u0d2c\u0d2d\u0d2e\u0d2f\u0d30\u0d31\u0d32\u0d33\u0d34\u0d35\u0d36\u0d37\u0d38\u0d39\u0d3e\u0d3f\u0d40\u0d41\u0d42\u0d43\u0d46\u0d47\u0d48\u0d4a\u0d4b\u0d4c\u0d4d\u0d57\u0d7a\u0d7b\u0d7c\u0d7d\u0d7e",
102
+ "punctuations": "!\u00a1'(),-.:;\u00bf? ",
103
+ "phonemes": null,
104
+ "is_unique": true,
105
+ "is_sorted": true
106
+ },
107
+ "add_blank": false,
108
+ "batch_group_size": 0,
109
+ "loss_masking": null,
110
+ "sort_by_audio_len": true,
111
+ "min_audio_len": 1,
112
+ "max_audio_len": 441000,
113
+ "min_text_len": 1,
114
+ "max_text_len": 400,
115
+ "compute_f0": true,
116
+ "compute_linear_spec": false,
117
+ "precompute_num_workers": 0,
118
+ "start_by_longest": false,
119
+ "datasets": [
120
+ {
121
+ "name": "indictts",
122
+ "path": "/home/ttsteam/datasets/indictts/ml",
123
+ "meta_file_train": "metadata_train.csv",
124
+ "ignored_speakers": null,
125
+ "language": "ml",
126
+ "meta_file_val": "metadata_test.csv",
127
+ "meta_file_attn_mask": ""
128
+ }
129
+ ],
130
+ "test_sentences": [
131
+ "\u0d36\u0d3f\u0d32\u0d3e\u0d2f\u0d41\u0d17\u0d15\u0d3e\u0d32\u0d02 \u0d2e\u0d41\u0d24\u0d7d \u0d2e\u0d28\u0d41\u0d37\u0d4d\u0d2f\u0d7c \u0d1c\u0d4d\u0d2f\u0d3e\u0d2e\u0d3f\u0d24\u0d40\u0d2f \u0d30\u0d42\u0d2a\u0d19\u0d4d\u0d19\u0d7e \u0d09\u0d2a\u0d2f\u0d4b\u0d17\u0d3f\u0d1a\u0d4d\u0d1a\u0d41\u0d35\u0d30\u0d41\u0d28\u0d4d\u0d28\u0d41",
132
+ "\u0d35\u0d3e\u0d39\u0d28\u0d3e\u0d2a\u0d15\u0d1f\u0d24\u0d4d\u0d24\u0d3f\u0d7d \u0d2a\u0d30\u0d41\u0d15\u0d4d\u0d15\u0d47\u0d31\u0d4d\u0d31 \u0d05\u0d27\u0d4d\u0d2f\u0d3e\u0d2a\u0d3f\u0d15 \u0d2e\u0d30\u0d3f\u0d1a\u0d4d\u0d1a\u0d41"
133
+ ],
134
+ "eval_split_max_size": null,
135
+ "eval_split_size": 0.01,
136
+ "use_speaker_weighted_sampler": false,
137
+ "speaker_weighted_sampler_alpha": 1.0,
138
+ "use_language_weighted_sampler": false,
139
+ "language_weighted_sampler_alpha": 1.0,
140
+ "use_length_weighted_sampler": false,
141
+ "length_weighted_sampler_alpha": 1.0,
142
+ "base_model": "forward_tts",
143
+ "model_args": {
144
+ "num_chars": 88,
145
+ "out_channels": 80,
146
+ "hidden_channels": 512,
147
+ "use_aligner": true,
148
+ "use_pitch": true,
149
+ "pitch_predictor_hidden_channels": 256,
150
+ "pitch_predictor_kernel_size": 3,
151
+ "pitch_predictor_dropout_p": 0.1,
152
+ "pitch_embedding_kernel_size": 3,
153
+ "duration_predictor_hidden_channels": 256,
154
+ "duration_predictor_kernel_size": 3,
155
+ "duration_predictor_dropout_p": 0.1,
156
+ "positional_encoding": true,
157
+ "poisitonal_encoding_use_scale": true,
158
+ "length_scale": 1,
159
+ "encoder_type": "fftransformer",
160
+ "encoder_params": {
161
+ "hidden_channels_ffn": 1024,
162
+ "num_heads": 1,
163
+ "num_layers": 6,
164
+ "dropout_p": 0.1
165
+ },
166
+ "decoder_type": "fftransformer",
167
+ "decoder_params": {
168
+ "hidden_channels_ffn": 1024,
169
+ "num_heads": 1,
170
+ "num_layers": 6,
171
+ "dropout_p": 0.1
172
+ },
173
+ "detach_duration_predictor": false,
174
+ "max_duration": 75,
175
+ "num_speakers": 2,
176
+ "use_speaker_embedding": true,
177
+ "speakers_file": "models/v1/ml/fastpitch/speakers.pth",
178
+ "use_d_vector_file": false,
179
+ "d_vector_dim": 512,
180
+ "d_vector_file": null,
181
+ "use_speaker_encoder_as_loss": false,
182
+ "speaker_encoder_config_path": "",
183
+ "speaker_encoder_model_path": "",
184
+ "vocoder_path": null,
185
+ "vocoder_config_path": null
186
+ },
187
+ "return_wav": false,
188
+ "num_speakers": 2,
189
+ "speakers_file": "models/v1/ml/fastpitch/speakers.pth",
190
+ "use_speaker_embedding": true,
191
+ "use_d_vector_file": false,
192
+ "d_vector_file": "",
193
+ "d_vector_dim": 512,
194
+ "spec_loss_type": "mse",
195
+ "duration_loss_type": "mse",
196
+ "use_ssim_loss": false,
197
+ "ssim_loss_alpha": 1.0,
198
+ "spec_loss_alpha": 1.0,
199
+ "aligner_loss_alpha": 1.0,
200
+ "pitch_loss_alpha": 0.1,
201
+ "dur_loss_alpha": 0.1,
202
+ "binary_align_loss_alpha": 0.1,
203
+ "spk_encoder_loss_alpha": 0.1,
204
+ "binary_loss_warmup_epochs": 150,
205
+ "aligner_epochs": 0,
206
+ "min_seq_len": 13,
207
+ "max_seq_len": 500000,
208
+ "r": 1,
209
+ "f0_cache_path": "output_indic_fastpitch/ml/f0_cache"
210
+ }
ml_fastpitch_speakers.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f665e358b34b232fb27f7c8cd3968fcd47784a7be065ae127f611c33ee809bea
3
+ size 431
ml_hifigan_best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aba37e31c72f088cd6d73c5782d9f89bf024a6d355158056fa398cd806cf8efc
3
+ size 1016384316
ml_hifigan_config.json ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "indic_vocoders",
3
+ "logger_uri": null,
4
+ "run_name": "ml_hifigan_all",
5
+ "project_name": "indic-vocoders",
6
+ "run_description": "None",
7
+ "print_step": 100,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": "indic-asr",
11
+ "dashboard_logger": "wandb",
12
+ "log_model_step": null,
13
+ "save_step": 10000,
14
+ "save_n_checkpoints": 1,
15
+ "save_checkpoints": true,
16
+ "save_all_best": false,
17
+ "save_best_after": 10000,
18
+ "target_loss": "loss_1",
19
+ "print_eval": false,
20
+ "test_delay_epochs": 0,
21
+ "run_eval": true,
22
+ "distributed_backend": "nccl",
23
+ "distributed_url": "tcp://localhost:10008",
24
+ "mixed_precision": true,
25
+ "epochs": 5000,
26
+ "batch_size": 32,
27
+ "eval_batch_size": 32,
28
+ "grad_clip": [
29
+ 5,
30
+ 5
31
+ ],
32
+ "scheduler_after_epoch": true,
33
+ "lr": 0.0001,
34
+ "optimizer": "AdamW",
35
+ "optimizer_params": {
36
+ "betas": [
37
+ 0.8,
38
+ 0.99
39
+ ],
40
+ "weight_decay": 0.0
41
+ },
42
+ "lr_scheduler": null,
43
+ "lr_scheduler_params": null,
44
+ "use_grad_scaler": false,
45
+ "cudnn_enable": true,
46
+ "cudnn_deterministic": false,
47
+ "cudnn_benchmark": false,
48
+ "training_seed": 54321,
49
+ "model": "hifigan",
50
+ "num_loader_workers": 8,
51
+ "num_eval_loader_workers": 8,
52
+ "use_noise_augment": true,
53
+ "audio": {
54
+ "fft_size": 1024,
55
+ "win_length": 1024,
56
+ "hop_length": 256,
57
+ "frame_shift_ms": null,
58
+ "frame_length_ms": null,
59
+ "stft_pad_mode": "reflect",
60
+ "sample_rate": 22050,
61
+ "resample": false,
62
+ "preemphasis": 0.0,
63
+ "ref_level_db": 20,
64
+ "do_sound_norm": false,
65
+ "log_func": "np.log",
66
+ "do_trim_silence": true,
67
+ "trim_db": 60.0,
68
+ "do_rms_norm": false,
69
+ "db_level": null,
70
+ "power": 1.5,
71
+ "griffin_lim_iters": 60,
72
+ "num_mels": 80,
73
+ "mel_fmin": 0.0,
74
+ "mel_fmax": 8000,
75
+ "spec_gain": 1.0,
76
+ "do_amp_to_db_linear": true,
77
+ "do_amp_to_db_mel": true,
78
+ "pitch_fmax": 640.0,
79
+ "pitch_fmin": 0.0,
80
+ "signal_norm": false,
81
+ "min_level_db": -100,
82
+ "symmetric_norm": true,
83
+ "max_norm": 4.0,
84
+ "clip_norm": true,
85
+ "stats_path": null
86
+ },
87
+ "eval_split_size": 10,
88
+ "data_path": "../../datasets/indictts/ml",
89
+ "feature_path": null,
90
+ "seq_len": 8192,
91
+ "pad_short": 2000,
92
+ "conv_pad": 0,
93
+ "use_cache": false,
94
+ "wd": 1e-06,
95
+ "use_stft_loss": false,
96
+ "use_subband_stft_loss": false,
97
+ "use_mse_gan_loss": true,
98
+ "use_hinge_gan_loss": false,
99
+ "use_feat_match_loss": true,
100
+ "use_l1_spec_loss": true,
101
+ "stft_loss_weight": 0,
102
+ "subband_stft_loss_weight": 0,
103
+ "mse_G_loss_weight": 1,
104
+ "hinge_G_loss_weight": 0,
105
+ "feat_match_loss_weight": 108,
106
+ "l1_spec_loss_weight": 45,
107
+ "stft_loss_params": {
108
+ "n_ffts": [
109
+ 1024,
110
+ 2048,
111
+ 512
112
+ ],
113
+ "hop_lengths": [
114
+ 120,
115
+ 240,
116
+ 50
117
+ ],
118
+ "win_lengths": [
119
+ 600,
120
+ 1200,
121
+ 240
122
+ ]
123
+ },
124
+ "l1_spec_loss_params": {
125
+ "use_mel": true,
126
+ "sample_rate": 22050,
127
+ "n_fft": 1024,
128
+ "hop_length": 256,
129
+ "win_length": 1024,
130
+ "n_mels": 80,
131
+ "mel_fmin": 0.0,
132
+ "mel_fmax": null
133
+ },
134
+ "lr_gen": 0.0001,
135
+ "lr_disc": 0.0001,
136
+ "lr_scheduler_gen": "ExponentialLR",
137
+ "lr_scheduler_gen_params": {
138
+ "gamma": 0.999,
139
+ "last_epoch": -1
140
+ },
141
+ "lr_scheduler_disc": "ExponentialLR",
142
+ "lr_scheduler_disc_params": {
143
+ "gamma": 0.999,
144
+ "last_epoch": -1
145
+ },
146
+ "use_pqmf": false,
147
+ "diff_samples_for_G_and_D": false,
148
+ "discriminator_model": "hifigan_discriminator",
149
+ "generator_model": "hifigan_generator",
150
+ "generator_model_params": {
151
+ "upsample_factors": [
152
+ 8,
153
+ 8,
154
+ 2,
155
+ 2
156
+ ],
157
+ "upsample_kernel_sizes": [
158
+ 16,
159
+ 16,
160
+ 4,
161
+ 4
162
+ ],
163
+ "upsample_initial_channel": 512,
164
+ "resblock_kernel_sizes": [
165
+ 3,
166
+ 7,
167
+ 11
168
+ ],
169
+ "resblock_dilation_sizes": [
170
+ [
171
+ 1,
172
+ 3,
173
+ 5
174
+ ],
175
+ [
176
+ 1,
177
+ 3,
178
+ 5
179
+ ],
180
+ [
181
+ 1,
182
+ 3,
183
+ 5
184
+ ]
185
+ ],
186
+ "resblock_type": "1"
187
+ },
188
+ "github_branch": "* main"
189
+ }
mni_fastpitch_best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:562a1762314ed4f5507bf8ad9ae289f0eed1eadea6637da4ba464ce5b498cada
3
+ size 637534809
mni_fastpitch_config.json ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "output_indic_fastpitch/mni",
3
+ "logger_uri": null,
4
+ "run_name": "mni_fastpitch_indictts_all_align_off",
5
+ "project_name": "indic-fastpitch-stage2",
6
+ "run_description": "align_off",
7
+ "print_step": 100,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": "indic-asr",
11
+ "dashboard_logger": "wandb",
12
+ "log_model_step": 10000,
13
+ "save_step": 10000,
14
+ "save_n_checkpoints": 1,
15
+ "save_checkpoints": true,
16
+ "save_all_best": false,
17
+ "save_best_after": 10000,
18
+ "target_loss": null,
19
+ "print_eval": false,
20
+ "test_delay_epochs": 0,
21
+ "run_eval": true,
22
+ "distributed_backend": "nccl",
23
+ "distributed_url": "tcp://localhost:54321",
24
+ "mixed_precision": true,
25
+ "epochs": 1000,
26
+ "batch_size": 32,
27
+ "eval_batch_size": 32,
28
+ "grad_clip": 5.0,
29
+ "scheduler_after_epoch": true,
30
+ "lr": 0.0001,
31
+ "optimizer": "Adam",
32
+ "optimizer_params": {
33
+ "betas": [
34
+ 0.9,
35
+ 0.998
36
+ ],
37
+ "weight_decay": 1e-06
38
+ },
39
+ "lr_scheduler": "NoamLR",
40
+ "lr_scheduler_params": {
41
+ "warmup_steps": 4000
42
+ },
43
+ "lr_scheduler_aligner": "NoamLR",
44
+ "lr_scheduler_aligner_params": {
45
+ "warmup_steps": 4000
46
+ },
47
+ "use_grad_scaler": false,
48
+ "cudnn_enable": true,
49
+ "cudnn_deterministic": false,
50
+ "cudnn_benchmark": false,
51
+ "training_seed": 54321,
52
+ "model": "fast_pitch",
53
+ "num_loader_workers": 0,
54
+ "num_eval_loader_workers": 0,
55
+ "use_noise_augment": false,
56
+ "audio": {
57
+ "fft_size": 1024,
58
+ "win_length": 1024,
59
+ "hop_length": 256,
60
+ "frame_shift_ms": null,
61
+ "frame_length_ms": null,
62
+ "stft_pad_mode": "reflect",
63
+ "sample_rate": 22050,
64
+ "resample": false,
65
+ "preemphasis": 0.0,
66
+ "ref_level_db": 20,
67
+ "do_sound_norm": false,
68
+ "log_func": "np.log",
69
+ "do_trim_silence": true,
70
+ "trim_db": 60.0,
71
+ "do_rms_norm": false,
72
+ "db_level": null,
73
+ "power": 1.5,
74
+ "griffin_lim_iters": 60,
75
+ "num_mels": 80,
76
+ "mel_fmin": 0.0,
77
+ "mel_fmax": 8000,
78
+ "spec_gain": 1.0,
79
+ "do_amp_to_db_linear": true,
80
+ "do_amp_to_db_mel": true,
81
+ "pitch_fmax": 640.0,
82
+ "pitch_fmin": 0.0,
83
+ "signal_norm": false,
84
+ "min_level_db": -100,
85
+ "symmetric_norm": true,
86
+ "max_norm": 4.0,
87
+ "clip_norm": true,
88
+ "stats_path": null
89
+ },
90
+ "use_phonemes": false,
91
+ "phonemizer": null,
92
+ "phoneme_language": "en-us",
93
+ "compute_input_seq_cache": false,
94
+ "text_cleaner": "multilingual_cleaners",
95
+ "enable_eos_bos_chars": false,
96
+ "test_sentences_file": "",
97
+ "phoneme_cache_path": "output_indic_fastpitch/mni/phoneme_cache",
98
+ "characters": {
99
+ "characters_class": "TTS.tts.models.vits.VitsCharacters",
100
+ "vocab_dict": null,
101
+ "pad": "<PAD>",
102
+ "eos": "<EOS>",
103
+ "bos": "<BOS>",
104
+ "blank": "<BLNK>",
105
+ "characters": " ,-./0123456789acefghkmnoprvw\u0981\u0982\u0985\u0986\u0987\u0988\u0989\u098a\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9\u09bc\u09be\u09bf\u09c0\u09c1\u09c2\u09c3\u09c7\u09c8\u09cb\u09cc\u09cd\u09ce\u09df\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef\u09f0\u09f1\u09f7",
106
+ "punctuations": "!\u00a1'(),-.:;\u00bf? ",
107
+ "phonemes": null,
108
+ "is_unique": true,
109
+ "is_sorted": true
110
+ },
111
+ "add_blank": false,
112
+ "batch_group_size": 0,
113
+ "loss_masking": null,
114
+ "sort_by_audio_len": true,
115
+ "min_audio_len": 1,
116
+ "max_audio_len": 441000,
117
+ "min_text_len": 1,
118
+ "max_text_len": 400,
119
+ "compute_f0": true,
120
+ "compute_linear_spec": false,
121
+ "precompute_num_workers": 0,
122
+ "start_by_longest": false,
123
+ "datasets": [
124
+ {
125
+ "name": "indictts",
126
+ "path": "/home/ttsteam/datasets/indictts/mni",
127
+ "meta_file_train": "metadata_train.csv",
128
+ "ignored_speakers": null,
129
+ "language": "mni",
130
+ "meta_file_val": "metadata_test.csv",
131
+ "meta_file_attn_mask": ""
132
+ }
133
+ ],
134
+ "test_sentences": [
135
+ "\u09ae\u09a5\u0982 \u09ae\u09a5\u0982, \u0985\u09b8\u09c1\u09ae \u0995\u09be\u0996\u09bf\u09ac\u09a8\u09be.",
136
+ "\u09a5\u09c7\u09ac\u09a8\u09be \u0999\u09be\u09b6\u09bf\u0982\u09a6\u09c1 \u0985\u09ae\u09ae\u09ae\u09cd\u09a4\u09be \u0987\u09b2\u09cd\u09b2\u09c7."
137
+ ],
138
+ "eval_split_max_size": null,
139
+ "eval_split_size": 0.01,
140
+ "use_speaker_weighted_sampler": false,
141
+ "speaker_weighted_sampler_alpha": 1.0,
142
+ "use_language_weighted_sampler": false,
143
+ "language_weighted_sampler_alpha": 1.0,
144
+ "use_length_weighted_sampler": false,
145
+ "length_weighted_sampler_alpha": 1.0,
146
+ "base_model": "forward_tts",
147
+ "model_args": {
148
+ "num_chars": 114,
149
+ "out_channels": 80,
150
+ "hidden_channels": 512,
151
+ "use_aligner": true,
152
+ "use_pitch": true,
153
+ "pitch_predictor_hidden_channels": 256,
154
+ "pitch_predictor_kernel_size": 3,
155
+ "pitch_predictor_dropout_p": 0.1,
156
+ "pitch_embedding_kernel_size": 3,
157
+ "duration_predictor_hidden_channels": 256,
158
+ "duration_predictor_kernel_size": 3,
159
+ "duration_predictor_dropout_p": 0.1,
160
+ "positional_encoding": true,
161
+ "poisitonal_encoding_use_scale": true,
162
+ "length_scale": 1,
163
+ "encoder_type": "fftransformer",
164
+ "encoder_params": {
165
+ "hidden_channels_ffn": 1024,
166
+ "num_heads": 1,
167
+ "num_layers": 6,
168
+ "dropout_p": 0.1
169
+ },
170
+ "decoder_type": "fftransformer",
171
+ "decoder_params": {
172
+ "hidden_channels_ffn": 1024,
173
+ "num_heads": 1,
174
+ "num_layers": 6,
175
+ "dropout_p": 0.1
176
+ },
177
+ "detach_duration_predictor": false,
178
+ "max_duration": 75,
179
+ "num_speakers": 2,
180
+ "use_speaker_embedding": true,
181
+ "speakers_file": "models/v1/mni/fastpitch/speakers.pth",
182
+ "use_d_vector_file": false,
183
+ "d_vector_dim": 512,
184
+ "d_vector_file": null,
185
+ "use_speaker_encoder_as_loss": false,
186
+ "speaker_encoder_config_path": "",
187
+ "speaker_encoder_model_path": "",
188
+ "vocoder_path": null,
189
+ "vocoder_config_path": null,
190
+ "use_separate_optimizers": false
191
+ },
192
+ "return_wav": false,
193
+ "num_speakers": 2,
194
+ "speakers_file": "models/v1/mni/fastpitch/speakers.pth",
195
+ "use_speaker_embedding": true,
196
+ "use_d_vector_file": false,
197
+ "d_vector_file": "",
198
+ "d_vector_dim": 512,
199
+ "spec_loss_type": "mse",
200
+ "duration_loss_type": "mse",
201
+ "use_ssim_loss": false,
202
+ "ssim_loss_alpha": 1.0,
203
+ "spec_loss_alpha": 1.0,
204
+ "aligner_loss_alpha": 1.0,
205
+ "pitch_loss_alpha": 0.1,
206
+ "dur_loss_alpha": 0.1,
207
+ "binary_align_loss_alpha": 0.1,
208
+ "spk_encoder_loss_alpha": 0.1,
209
+ "binary_loss_warmup_epochs": 150,
210
+ "aligner_epochs": 0,
211
+ "min_seq_len": 13,
212
+ "max_seq_len": 500000,
213
+ "r": 1,
214
+ "f0_cache_path": "output_indic_fastpitch/mni/f0_cache"
215
+ }
mni_fastpitch_speakers.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f665e358b34b232fb27f7c8cd3968fcd47784a7be065ae127f611c33ee809bea
3
+ size 431
mni_hifigan_best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ff283526aa765ce54e31c3597850a2012a45275b1ebdbd2b9d2be8da31f61b8
3
+ size 1016366844
mni_hifigan_config.json ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "indic_vocoders",
3
+ "logger_uri": null,
4
+ "run_name": "mni_hifigan_all",
5
+ "project_name": "indic-vocoders",
6
+ "run_description": "None",
7
+ "print_step": 100,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": "indic-asr",
11
+ "dashboard_logger": "wandb",
12
+ "log_model_step": null,
13
+ "save_step": 10000,
14
+ "save_n_checkpoints": 1,
15
+ "save_checkpoints": true,
16
+ "save_all_best": false,
17
+ "save_best_after": 10000,
18
+ "target_loss": "loss_1",
19
+ "print_eval": false,
20
+ "test_delay_epochs": 0,
21
+ "run_eval": true,
22
+ "distributed_backend": "nccl",
23
+ "distributed_url": "tcp://localhost:10009",
24
+ "mixed_precision": true,
25
+ "epochs": 5000,
26
+ "batch_size": 32,
27
+ "eval_batch_size": 32,
28
+ "grad_clip": [
29
+ 5,
30
+ 5
31
+ ],
32
+ "scheduler_after_epoch": true,
33
+ "lr": 0.0001,
34
+ "optimizer": "AdamW",
35
+ "optimizer_params": {
36
+ "betas": [
37
+ 0.8,
38
+ 0.99
39
+ ],
40
+ "weight_decay": 0.0
41
+ },
42
+ "lr_scheduler": null,
43
+ "lr_scheduler_params": null,
44
+ "use_grad_scaler": false,
45
+ "cudnn_enable": true,
46
+ "cudnn_deterministic": false,
47
+ "cudnn_benchmark": false,
48
+ "training_seed": 54321,
49
+ "model": "hifigan",
50
+ "num_loader_workers": 8,
51
+ "num_eval_loader_workers": 8,
52
+ "use_noise_augment": true,
53
+ "audio": {
54
+ "fft_size": 1024,
55
+ "win_length": 1024,
56
+ "hop_length": 256,
57
+ "frame_shift_ms": null,
58
+ "frame_length_ms": null,
59
+ "stft_pad_mode": "reflect",
60
+ "sample_rate": 22050,
61
+ "resample": false,
62
+ "preemphasis": 0.0,
63
+ "ref_level_db": 20,
64
+ "do_sound_norm": false,
65
+ "log_func": "np.log",
66
+ "do_trim_silence": true,
67
+ "trim_db": 60.0,
68
+ "do_rms_norm": false,
69
+ "db_level": null,
70
+ "power": 1.5,
71
+ "griffin_lim_iters": 60,
72
+ "num_mels": 80,
73
+ "mel_fmin": 0.0,
74
+ "mel_fmax": 8000,
75
+ "spec_gain": 1.0,
76
+ "do_amp_to_db_linear": true,
77
+ "do_amp_to_db_mel": true,
78
+ "pitch_fmax": 640.0,
79
+ "pitch_fmin": 0.0,
80
+ "signal_norm": false,
81
+ "min_level_db": -100,
82
+ "symmetric_norm": true,
83
+ "max_norm": 4.0,
84
+ "clip_norm": true,
85
+ "stats_path": null
86
+ },
87
+ "eval_split_size": 10,
88
+ "data_path": "../../datasets/indictts/mni",
89
+ "feature_path": null,
90
+ "seq_len": 8192,
91
+ "pad_short": 2000,
92
+ "conv_pad": 0,
93
+ "use_cache": false,
94
+ "wd": 1e-06,
95
+ "use_stft_loss": false,
96
+ "use_subband_stft_loss": false,
97
+ "use_mse_gan_loss": true,
98
+ "use_hinge_gan_loss": false,
99
+ "use_feat_match_loss": true,
100
+ "use_l1_spec_loss": true,
101
+ "stft_loss_weight": 0,
102
+ "subband_stft_loss_weight": 0,
103
+ "mse_G_loss_weight": 1,
104
+ "hinge_G_loss_weight": 0,
105
+ "feat_match_loss_weight": 108,
106
+ "l1_spec_loss_weight": 45,
107
+ "stft_loss_params": {
108
+ "n_ffts": [
109
+ 1024,
110
+ 2048,
111
+ 512
112
+ ],
113
+ "hop_lengths": [
114
+ 120,
115
+ 240,
116
+ 50
117
+ ],
118
+ "win_lengths": [
119
+ 600,
120
+ 1200,
121
+ 240
122
+ ]
123
+ },
124
+ "l1_spec_loss_params": {
125
+ "use_mel": true,
126
+ "sample_rate": 22050,
127
+ "n_fft": 1024,
128
+ "hop_length": 256,
129
+ "win_length": 1024,
130
+ "n_mels": 80,
131
+ "mel_fmin": 0.0,
132
+ "mel_fmax": null
133
+ },
134
+ "lr_gen": 0.0001,
135
+ "lr_disc": 0.0001,
136
+ "lr_scheduler_gen": "ExponentialLR",
137
+ "lr_scheduler_gen_params": {
138
+ "gamma": 0.999,
139
+ "last_epoch": -1
140
+ },
141
+ "lr_scheduler_disc": "ExponentialLR",
142
+ "lr_scheduler_disc_params": {
143
+ "gamma": 0.999,
144
+ "last_epoch": -1
145
+ },
146
+ "use_pqmf": false,
147
+ "diff_samples_for_G_and_D": false,
148
+ "discriminator_model": "hifigan_discriminator",
149
+ "generator_model": "hifigan_generator",
150
+ "generator_model_params": {
151
+ "upsample_factors": [
152
+ 8,
153
+ 8,
154
+ 2,
155
+ 2
156
+ ],
157
+ "upsample_kernel_sizes": [
158
+ 16,
159
+ 16,
160
+ 4,
161
+ 4
162
+ ],
163
+ "upsample_initial_channel": 512,
164
+ "resblock_kernel_sizes": [
165
+ 3,
166
+ 7,
167
+ 11
168
+ ],
169
+ "resblock_dilation_sizes": [
170
+ [
171
+ 1,
172
+ 3,
173
+ 5
174
+ ],
175
+ [
176
+ 1,
177
+ 3,
178
+ 5
179
+ ],
180
+ [
181
+ 1,
182
+ 3,
183
+ 5
184
+ ]
185
+ ],
186
+ "resblock_type": "1"
187
+ },
188
+ "github_branch": "* main"
189
+ }