TangRain committed on
Commit
ebb7aaa
·
1 Parent(s): 53b5e2a
README.md CHANGED
@@ -1,3 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: cc-by-4.0
3
  ---
 
1
+ Model Arch: TokSing
2
+
3
+ Dataset: Opencpop
4
+
5
+ You can run this model with espnet/svs2.
6
+
7
+ ```sh
8
+ ./run.sh --stage 8 \
9
+ --kmeans_feature "multi/hubert_large_6+wavlm_large_6+wavlm_large_23" \
10
+ --multi_token "hubert_large_ll60k_128_6 wavlm_large_128_6 wavlm_large_128_23" \
11
+ --inference_model 300epoch.pth \
12
+ --vocoder_file {vocoder_path}
13
+ ```
14
+
15
  ---
16
  license: cc-by-4.0
17
  ---
exp/svs_stats_raw_phn_none_zh/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55ee5850cc8928c2849db7fd96af9269d1b6f6bf6e781192c566733deefe0910
3
+ size 1402
exp/svs_stats_raw_phn_none_zh/train/pitch_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af7755bf050318d1edf0bce559fc7f019c925d6376a9a1101d69348d0cea725f
3
+ size 770
exp/svs_train_toksing_raw_phn_none_zh/300epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f094124cc67c1110a4b24b86d1b5252c97fcc96de9d2e28bb00e88c5ad9f390d
3
+ size 281020670
exp/svs_train_toksing_raw_phn_none_zh/config.yaml ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_toksing.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: sequence
7
+ valid_iterator_type: null
8
+ output_dir: exp/svs_train_toksing_raw_phn_none_zh
9
+ ngpu: 1
10
+ seed: 0
11
+ num_workers: 10
12
+ num_att_plot: 3
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: null
16
+ dist_rank: null
17
+ local_rank: 0
18
+ dist_master_addr: null
19
+ dist_master_port: null
20
+ dist_launcher: null
21
+ multiprocessing_distributed: false
22
+ unused_parameters: false
23
+ sharded_ddp: false
24
+ use_deepspeed: false
25
+ deepspeed_config: null
26
+ gradient_as_bucket_view: true
27
+ ddp_comm_hook: null
28
+ cudnn_enabled: true
29
+ cudnn_benchmark: false
30
+ cudnn_deterministic: true
31
+ use_tf32: false
32
+ collect_stats: false
33
+ write_collected_feats: false
34
+ max_epoch: 300
35
+ patience: null
36
+ val_scheduler_criterion:
37
+ - valid
38
+ - loss
39
+ early_stopping_criterion:
40
+ - valid
41
+ - loss
42
+ - min
43
+ best_model_criterion:
44
+ - - valid
45
+ - loss
46
+ - min
47
+ - - train
48
+ - loss
49
+ - min
50
+ keep_nbest_models: 5
51
+ nbest_averaging_interval: 0
52
+ grad_clip: 1.0
53
+ grad_clip_type: 2.0
54
+ grad_noise: false
55
+ accum_grad: 1
56
+ no_forward_run: false
57
+ resume: true
58
+ train_dtype: float32
59
+ use_amp: false
60
+ log_interval: null
61
+ use_matplotlib: true
62
+ use_tensorboard: true
63
+ create_graph_in_tensorboard: false
64
+ use_wandb: false
65
+ wandb_project: null
66
+ wandb_id: null
67
+ wandb_entity: null
68
+ wandb_name: null
69
+ wandb_model_log_interval: -1
70
+ detect_anomaly: false
71
+ use_adapter: false
72
+ adapter: lora
73
+ save_strategy: all
74
+ adapter_conf: {}
75
+ pretrain_path: null
76
+ init_param: []
77
+ ignore_init_mismatch: false
78
+ freeze_param: []
79
+ num_iters_per_epoch: null
80
+ batch_size: 32
81
+ valid_batch_size: null
82
+ batch_bins: 1000000
83
+ valid_batch_bins: null
84
+ category_sample_size: 10
85
+ train_shape_file:
86
+ - exp/svs_stats_raw_phn_none_zh/train/text_shape.phn
87
+ - exp/svs_stats_raw_phn_none_zh/train/singing_shape
88
+ valid_shape_file:
89
+ - exp/svs_stats_raw_phn_none_zh/valid/text_shape.phn
90
+ - exp/svs_stats_raw_phn_none_zh/valid/singing_shape
91
+ batch_type: sorted
92
+ valid_batch_type: null
93
+ fold_length:
94
+ - 150
95
+ - 256000
96
+ sort_in_batch: descending
97
+ shuffle_within_batch: false
98
+ sort_batch: descending
99
+ multiple_iterator: false
100
+ chunk_length: 500
101
+ chunk_shift_ratio: 0.5
102
+ num_cache_chunks: 1024
103
+ chunk_excluded_key_prefixes: []
104
+ chunk_default_fs: null
105
+ chunk_max_abs_length: null
106
+ chunk_discard_short_samples: true
107
+ train_data_path_and_name_and_type:
108
+ - - dump/raw/tr_no_dev/text
109
+ - text
110
+ - text
111
+ - - dump/raw/tr_no_dev/wav.scp
112
+ - singing
113
+ - sound
114
+ - - dump/raw/tr_no_dev/label
115
+ - label
116
+ - duration
117
+ - - dump/raw/tr_no_dev/score.scp
118
+ - score
119
+ - score
120
+ - - dump/raw/tr_no_dev/token_multi_hubert_large_6+wavlm_large_6+wavlm_large_23
121
+ - discrete_token
122
+ - text_int
123
+ valid_data_path_and_name_and_type:
124
+ - - dump/raw/dev/text
125
+ - text
126
+ - text
127
+ - - dump/raw/dev/wav.scp
128
+ - singing
129
+ - sound
130
+ - - dump/raw/dev/label
131
+ - label
132
+ - duration
133
+ - - dump/raw/dev/score.scp
134
+ - score
135
+ - score
136
+ - - dump/raw/dev/token_multi_hubert_large_6+wavlm_large_6+wavlm_large_23
137
+ - discrete_token
138
+ - text_int
139
+ multi_task_dataset: false
140
+ allow_variable_data_keys: false
141
+ max_cache_size: 0.0
142
+ max_cache_fd: 32
143
+ allow_multi_rates: false
144
+ valid_max_cache_size: null
145
+ exclude_weight_decay: false
146
+ exclude_weight_decay_conf: {}
147
+ optim: adam
148
+ optim_conf:
149
+ lr: 0.001
150
+ eps: 1.0e-06
151
+ weight_decay: 0.0
152
+ scheduler: null
153
+ scheduler_conf: {}
154
+ token_list:
155
+ - <blank>
156
+ - <unk>
157
+ - SP
158
+ - i
159
+ - AP
160
+ - e
161
+ - y
162
+ - d
163
+ - w
164
+ - sh
165
+ - ai
166
+ - n
167
+ - x
168
+ - j
169
+ - ian
170
+ - u
171
+ - l
172
+ - h
173
+ - b
174
+ - o
175
+ - zh
176
+ - an
177
+ - ou
178
+ - m
179
+ - q
180
+ - z
181
+ - en
182
+ - g
183
+ - ing
184
+ - ei
185
+ - ao
186
+ - ang
187
+ - uo
188
+ - eng
189
+ - t
190
+ - a
191
+ - ong
192
+ - ui
193
+ - k
194
+ - f
195
+ - r
196
+ - iang
197
+ - ch
198
+ - v
199
+ - in
200
+ - iao
201
+ - ie
202
+ - iu
203
+ - c
204
+ - s
205
+ - van
206
+ - p
207
+ - ve
208
+ - uan
209
+ - uang
210
+ - ia
211
+ - ua
212
+ - uai
213
+ - un
214
+ - er
215
+ - vn
216
+ - iong
217
+ - <sos/eos>
218
+ odim: null
219
+ model_conf: {}
220
+ use_preprocessor: true
221
+ token_type: phn
222
+ bpemodel: null
223
+ non_linguistic_symbols: null
224
+ cleaner: null
225
+ g2p: null
226
+ fs: 16000
227
+ discrete_token_layers: 3
228
+ nclusters: 128
229
+ score_feats_extract: syllable_score_feats
230
+ score_feats_extract_conf:
231
+ fs: 16000
232
+ n_fft: 2048
233
+ win_length: 1280
234
+ hop_length: 320
235
+ feats_extract: fbank
236
+ feats_extract_conf:
237
+ n_fft: 2048
238
+ hop_length: 320
239
+ win_length: 1280
240
+ fs: 16000
241
+ fmin: 80
242
+ fmax: 7600
243
+ n_mels: 80
244
+ normalize: global_mvn
245
+ normalize_conf:
246
+ stats_file: exp/svs_stats_raw_phn_none_zh/train/feats_stats.npz
247
+ svs: toksing
248
+ svs_conf:
249
+ midi_dim: 129
250
+ duration_dim: 512
251
+ adim: 384
252
+ aheads: 4
253
+ elayers: 6
254
+ eunits: 1536
255
+ dlayers: 6
256
+ dunits: 1536
257
+ postnet_layers: 0
258
+ postnet_chans: 512
259
+ postnet_filts: 5
260
+ postnet_dropout_rate: 0.5
261
+ use_batch_norm: true
262
+ reduction_factor: 1
263
+ global_channels: -1
264
+ text_encoder_attention_heads: 2
265
+ text_encoder_ffn_expand: 4
266
+ text_encoder_blocks: 6
267
+ text_encoder_positionwise_layer_type: conv1d
268
+ text_encoder_positionwise_conv_kernel_size: 3
269
+ text_encoder_positional_encoding_layer_type: rel_pos
270
+ text_encoder_self_attention_layer_type: rel_selfattn
271
+ text_encoder_activation_type: swish
272
+ text_encoder_normalize_before: true
273
+ text_encoder_dropout_rate: 0.1
274
+ text_encoder_positional_dropout_rate: 0.0
275
+ text_encoder_attention_dropout_rate: 0.1
276
+ use_macaron_style_in_text_encoder: true
277
+ use_conformer_conv_in_text_encoder: false
278
+ text_encoder_conformer_kernel_size: -1
279
+ init_type: pytorch
280
+ use_masking: true
281
+ loss_function: FastSpeech1
282
+ loss_type: L1
283
+ lambda_out: 1
284
+ lambda_dur: 1
285
+ lambda_pitch: 1
286
+ lambda_vuv: 0.01
287
+ use_discrete_token: true
288
+ predict_pitch: true
289
+ codec_codebook: 0
290
+ pitch_extract: dio
291
+ pitch_extract_conf:
292
+ use_token_averaged_f0: false
293
+ use_log_f0: true
294
+ fs: 16000
295
+ n_fft: 2048
296
+ hop_length: 320
297
+ f0max: 800
298
+ f0min: 80
299
+ reduction_factor: 1
300
+ pitch_normalize: null
301
+ pitch_normalize_conf:
302
+ stats_file: exp/svs_stats_raw_phn_none_zh/train/pitch_stats.npz
303
+ ying_extract: null
304
+ ying_extract_conf: {}
305
+ energy_extract: null
306
+ energy_extract_conf: {}
307
+ energy_normalize: null
308
+ energy_normalize_conf: {}
309
+ model_type: discrete_svs
310
+ model_type_conf: {}
311
+ required:
312
+ - output_dir
313
+ - token_list
314
+ version: '202503'
315
+ distributed: false
exp/svs_train_toksing_raw_phn_none_zh/images/acc.png ADDED
exp/svs_train_toksing_raw_phn_none_zh/images/backward_time.png ADDED
exp/svs_train_toksing_raw_phn_none_zh/images/clip.png ADDED
exp/svs_train_toksing_raw_phn_none_zh/images/decoder_alpha.png ADDED
exp/svs_train_toksing_raw_phn_none_zh/images/duration_loss.png ADDED
exp/svs_train_toksing_raw_phn_none_zh/images/encoder_alpha.png ADDED
exp/svs_train_toksing_raw_phn_none_zh/images/forward_time.png ADDED
exp/svs_train_toksing_raw_phn_none_zh/images/gpu_max_cached_mem_GB.png ADDED
exp/svs_train_toksing_raw_phn_none_zh/images/grad_norm.png ADDED
exp/svs_train_toksing_raw_phn_none_zh/images/iter_time.png ADDED
exp/svs_train_toksing_raw_phn_none_zh/images/loss.png ADDED
exp/svs_train_toksing_raw_phn_none_zh/images/loss_scale.png ADDED
exp/svs_train_toksing_raw_phn_none_zh/images/optim0_lr0.png ADDED
exp/svs_train_toksing_raw_phn_none_zh/images/optim_step_time.png ADDED
exp/svs_train_toksing_raw_phn_none_zh/images/out_loss.png ADDED
exp/svs_train_toksing_raw_phn_none_zh/images/pitch_loss.png ADDED
exp/svs_train_toksing_raw_phn_none_zh/images/train_time.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202503'
2
+ files:
3
+ model_file: exp/svs_train_toksing_raw_phn_none_zh/300epoch.pth
4
+ python: 3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0]
5
+ timestamp: 1750092456.204741
6
+ torch: 2.5.1+cu124
7
+ yaml_files:
8
+ train_config: exp/svs_train_toksing_raw_phn_none_zh/config.yaml
vocoder/checkpoint-250000steps.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53f74e4ccfa1a404a7d8661f1dade29f1a12fa86af5f215280f7664a9e26f867
3
+ size 1091678223
vocoder/config.yml ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ allow_cache: true
2
+ batch_max_steps: 10240
3
+ batch_size: 16
4
+ config: conf/hifigan_token_16k_nodp_f0.v1.yaml
5
+ dev_dumpdir: dump_token/dump_hb_6+wl_6+wl_23_f0/dev/raw
6
+ dev_feats_scp: null
7
+ dev_segments: null
8
+ dev_wav_scp: null
9
+ discriminator_adv_loss_params:
10
+ average_by_discriminators: false
11
+ discriminator_grad_norm: -1
12
+ discriminator_optimizer_params:
13
+ betas:
14
+ - 0.5
15
+ - 0.9
16
+ lr: 0.0002
17
+ weight_decay: 0.0
18
+ discriminator_optimizer_type: Adam
19
+ discriminator_params:
20
+ follow_official_norm: true
21
+ period_discriminator_params:
22
+ bias: true
23
+ channels: 32
24
+ downsample_scales:
25
+ - 3
26
+ - 3
27
+ - 3
28
+ - 3
29
+ - 1
30
+ in_channels: 1
31
+ kernel_sizes:
32
+ - 5
33
+ - 3
34
+ max_downsample_channels: 1024
35
+ nonlinear_activation: LeakyReLU
36
+ nonlinear_activation_params:
37
+ negative_slope: 0.1
38
+ out_channels: 1
39
+ use_spectral_norm: false
40
+ use_weight_norm: true
41
+ periods:
42
+ - 2
43
+ - 3
44
+ - 5
45
+ - 7
46
+ - 11
47
+ scale_discriminator_params:
48
+ bias: true
49
+ channels: 128
50
+ downsample_scales:
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 1
56
+ in_channels: 1
57
+ kernel_sizes:
58
+ - 15
59
+ - 41
60
+ - 5
61
+ - 3
62
+ max_downsample_channels: 1024
63
+ max_groups: 16
64
+ nonlinear_activation: LeakyReLU
65
+ nonlinear_activation_params:
66
+ negative_slope: 0.1
67
+ out_channels: 1
68
+ scale_downsample_pooling: AvgPool1d
69
+ scale_downsample_pooling_params:
70
+ kernel_size: 4
71
+ padding: 2
72
+ stride: 2
73
+ scales: 3
74
+ discriminator_scheduler_params:
75
+ gamma: 0.5
76
+ milestones:
77
+ - 200000
78
+ - 400000
79
+ - 600000
80
+ - 800000
81
+ discriminator_scheduler_type: MultiStepLR
82
+ discriminator_train_start_steps: 0
83
+ discriminator_type: HiFiGANMultiScaleMultiPeriodDiscriminator
84
+ distributed: false
85
+ duration_loss_params:
86
+ offset: 1.0
87
+ reduction: mean
88
+ eval_interval_steps: 10000
89
+ feat_match_loss_params:
90
+ average_by_discriminators: false
91
+ average_by_layers: false
92
+ include_final_outputs: true
93
+ fft_size: null
94
+ fmax: null
95
+ fmin: null
96
+ format: hdf5
97
+ generator_adv_loss_params:
98
+ average_by_discriminators: false
99
+ generator_grad_norm: -1
100
+ generator_optimizer_params:
101
+ betas:
102
+ - 0.5
103
+ - 0.9
104
+ lr: 0.0002
105
+ weight_decay: 0.0
106
+ generator_optimizer_type: Adam
107
+ generator_params:
108
+ bias: true
109
+ channels: 512
110
+ in_channels: 768
111
+ kernel_size: 7
112
+ layer_num: 3
113
+ linear_channel: 256
114
+ nonlinear_activation: LeakyReLU
115
+ nonlinear_activation_params:
116
+ negative_slope: 0.1
117
+ num_embs: 1025
118
+ num_spk_embs: 0
119
+ out_channels: 1
120
+ resblock_dilations:
121
+ - - 1
122
+ - 3
123
+ - 5
124
+ - - 1
125
+ - 3
126
+ - 5
127
+ - - 1
128
+ - 3
129
+ - 5
130
+ resblock_kernel_sizes:
131
+ - 3
132
+ - 7
133
+ - 11
134
+ upsample_kernel_sizes:
135
+ - 20
136
+ - 16
137
+ - 4
138
+ - 4
139
+ upsample_scales:
140
+ - 10
141
+ - 8
142
+ - 2
143
+ - 2
144
+ use_additional_convs: true
145
+ use_embedding_feats: false
146
+ use_f0: true
147
+ use_fix_weight: false
148
+ use_weight_norm: true
149
+ use_weight_sum: true
150
+ generator_scheduler_params:
151
+ gamma: 0.5
152
+ milestones:
153
+ - 200000
154
+ - 400000
155
+ - 600000
156
+ - 800000
157
+ generator_scheduler_type: MultiStepLR
158
+ generator_train_start_steps: 1
159
+ generator_type: DiscreteSymbolF0Generator
160
+ global_gain_scale: 1.0
161
+ hop_size: 320
162
+ lambda_adv: 1.0
163
+ lambda_aux: 45.0
164
+ lambda_feat_match: 2.0
165
+ log_interval_steps: 100
166
+ mel_loss_params:
167
+ fft_size: 1024
168
+ fmax: 8000
169
+ fmin: 0
170
+ fs: 16000
171
+ hop_size: 256
172
+ log_base: null
173
+ num_mels: 80
174
+ win_length: null
175
+ window: hann
176
+ num_mels: 1
177
+ num_save_intermediate_results: 4
178
+ num_workers: 0
179
+ outdir: exp/hb_6+wl_6+wl_23_f0/train_opencpop_hifigan_token_16k_nodp_f0.v1
180
+ pin_memory: true
181
+ pretrain: ''
182
+ rank: 0
183
+ remove_short_samples: false
184
+ resume: exp/hb_6+wl_6+wl_23_f0/train_opencpop_hifigan_token_16k_nodp_f0.v1/checkpoint-27steps.pkl
185
+ sampling_rate: 16000
186
+ save_interval_steps: 50000
187
+ train_dumpdir: dump_token/dump_hb_6+wl_6+wl_23_f0/train/raw
188
+ train_feats_scp: null
189
+ train_max_steps: 250000
190
+ train_segments: null
191
+ train_wav_scp: null
192
+ trim_frame_size: 1024
193
+ trim_hop_size: 256
194
+ trim_silence: false
195
+ trim_threshold_in_db: 20
196
+ use_duration_loss: true
197
+ use_f0: true
198
+ use_feat_match_loss: true
199
+ use_mel_loss: true
200
+ use_multi_resolution_token: false
201
+ use_stft_loss: false
202
+ verbose: 1
203
+ version: 0.6.2a
204
+ win_length: null
205
+ window: null