RedbeardNZ jackshu committed on
Commit
0027665
·
verified ·
0 Parent(s):

Duplicate from IndexTeam/IndexTTS-1.5

Browse files

Co-authored-by: sjc <[email protected]>

Files changed (10) hide show
  1. .gitattributes +35 -0
  2. README +5 -0
  3. README.md +3 -0
  4. bigvgan_discriminator.pth +3 -0
  5. bigvgan_generator.pth +3 -0
  6. bpe.model +3 -0
  7. config.yaml +113 -0
  8. dvae.pth +3 -0
  9. gpt.pth +3 -0
  10. unigram_12000.vocab +0 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ 大更新(效果很不错):
2
+ 1. 大幅增加了英文训练数据,提升英文及跨语种合成效果;
3
+ 2. 增大模型参数至0.5B左右;
4
+ 3. wer, ss 及 韵律都有明显的提升;
5
+ 4. gpt输出:text token 和 mel token 是连在一起的。
README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
bigvgan_discriminator.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46e1f6277f7239363d2393f2f9fe36902cf8995e4acc0ba67ed25a025dbd02f0
3
+ size 1651507545
bigvgan_generator.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2458834d8277e76eb8614c9751b5e8eaa0474eab706f0ecfafcb600023133ed
3
+ size 536176992
bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2a5ce8090d32da3642cc4f81fdc996376bc6dd3f4cd5e3d165f71120d9f2bc8
3
+ size 475997
config.yaml ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset:
2
+ bpe_model: bpe.model
3
+ sample_rate: 24000
4
+ squeeze: false
5
+ mel:
6
+ sample_rate: 24000
7
+ n_fft: 1024
8
+ hop_length: 256
9
+ win_length: 1024
10
+ n_mels: 100
11
+ mel_fmin: 0
12
+ normalize: false
13
+
14
+ gpt:
15
+ model_dim: 1280
16
+ max_mel_tokens: 800
17
+ max_text_tokens: 600
18
+ heads: 20
19
+ use_mel_codes_as_input: true
20
+ mel_length_compression: 1024
21
+ layers: 24
22
+ number_text_tokens: 12000
23
+ number_mel_codes: 8194
24
+ start_mel_token: 8192
25
+ stop_mel_token: 8193
26
+ start_text_token: 0
27
+ stop_text_token: 1
28
+ train_solo_embeddings: false
29
+ condition_type: "conformer_perceiver"
30
+ condition_module:
31
+ output_size: 512
32
+ linear_units: 2048
33
+ attention_heads: 8
34
+ num_blocks: 6
35
+ input_layer: "conv2d2"
36
+ perceiver_mult: 2
37
+
38
+ vqvae:
39
+ channels: 100
40
+ num_tokens: 8192
41
+ hidden_dim: 512
42
+ num_resnet_blocks: 3
43
+ codebook_dim: 512
44
+ num_layers: 2
45
+ positional_dims: 1
46
+ kernel_size: 3
47
+ smooth_l1_loss: true
48
+ use_transposed_convs: false
49
+
50
+ bigvgan:
51
+ adam_b1: 0.8
52
+ adam_b2: 0.99
53
+ lr_decay: 0.999998
54
+ seed: 1234
55
+
56
+ resblock: "1"
57
+ upsample_rates: [4,4,4,4,2,2]
58
+ upsample_kernel_sizes: [8,8,4,4,4,4]
59
+ upsample_initial_channel: 1536
60
+ resblock_kernel_sizes: [3,7,11]
61
+ resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
62
+ feat_upsample: false
63
+ speaker_embedding_dim: 512
64
+ cond_d_vector_in_each_upsampling_layer: true
65
+
66
+ gpt_dim: 1280
67
+
68
+ activation: "snakebeta"
69
+ snake_logscale: true
70
+
71
+ use_cqtd_instead_of_mrd: true
72
+ cqtd_filters: 128
73
+ cqtd_max_filters: 1024
74
+ cqtd_filters_scale: 1
75
+ cqtd_dilations: [1, 2, 4]
76
+ cqtd_hop_lengths: [512, 256, 256]
77
+ cqtd_n_octaves: [9, 9, 9]
78
+ cqtd_bins_per_octaves: [24, 36, 48]
79
+
80
+ resolutions: [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]]
81
+ mpd_reshapes: [2, 3, 5, 7, 11]
82
+ use_spectral_norm: false
83
+ discriminator_channel_mult: 1
84
+
85
+ use_multiscale_melloss: true
86
+ lambda_melloss: 15
87
+
88
+ clip_grad_norm: 1000
89
+
90
+ segment_size: 16384
91
+ num_mels: 100
92
+ num_freq: 1025
93
+ n_fft: 1024
94
+ hop_size: 256
95
+ win_size: 1024
96
+
97
+ sampling_rate: 24000
98
+
99
+ fmin: 0
100
+ fmax: null
101
+ fmax_for_loss: null
102
+ mel_type: "pytorch"
103
+
104
+ num_workers: 2
105
+ dist_config:
106
+ dist_backend: "nccl"
107
+ dist_url: "tcp://localhost:54321"
108
+ world_size: 1
109
+
110
+ dvae_checkpoint: dvae.pth
111
+ gpt_checkpoint: gpt.pth
112
+ bigvgan_checkpoint: bigvgan_generator.pth
113
+ version: 1.5
dvae.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69e841bf8cd97a32806ea8a439c50017c991ac9e8bb795db89ec47828cae4d5d
3
+ size 243316270
gpt.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44460b820a8afd58f68f3d3e69113e7900c8730bf519ecf158c081f2b8991240
3
+ size 1171228980
unigram_12000.vocab ADDED
The diff for this file is too large to render. See raw diff