Commit
·
0027665
verified
·
0
Parent(s):
Duplicate from IndexTeam/IndexTTS-1.5
Browse files
Co-authored-by: sjc <[email protected]>
- .gitattributes +35 -0
- README +5 -0
- README.md +3 -0
- bigvgan_discriminator.pth +3 -0
- bigvgan_generator.pth +3 -0
- bpe.model +3 -0
- config.yaml +113 -0
- dvae.pth +3 -0
- gpt.pth +3 -0
- unigram_12000.vocab +0 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
大更新(效果很不错):
|
2 |
+
1. 大幅增加了英文训练数据,提升英文及跨语种合成效果;
|
3 |
+
2. 增大模型参数至0.5B左右;
|
4 |
+
3. wer, ss 及 韵律都有明显的提升;
|
5 |
+
4. gpt输出:text token 和 mel token 是连在一起的。
|
README.md
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: apache-2.0
|
3 |
+
---
|
bigvgan_discriminator.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:46e1f6277f7239363d2393f2f9fe36902cf8995e4acc0ba67ed25a025dbd02f0
|
3 |
+
size 1651507545
|
bigvgan_generator.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a2458834d8277e76eb8614c9751b5e8eaa0474eab706f0ecfafcb600023133ed
|
3 |
+
size 536176992
|
bpe.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b2a5ce8090d32da3642cc4f81fdc996376bc6dd3f4cd5e3d165f71120d9f2bc8
|
3 |
+
size 475997
|
config.yaml
ADDED
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
dataset:
|
2 |
+
bpe_model: bpe.model
|
3 |
+
sample_rate: 24000
|
4 |
+
squeeze: false
|
5 |
+
mel:
|
6 |
+
sample_rate: 24000
|
7 |
+
n_fft: 1024
|
8 |
+
hop_length: 256
|
9 |
+
win_length: 1024
|
10 |
+
n_mels: 100
|
11 |
+
mel_fmin: 0
|
12 |
+
normalize: false
|
13 |
+
|
14 |
+
gpt:
|
15 |
+
model_dim: 1280
|
16 |
+
max_mel_tokens: 800
|
17 |
+
max_text_tokens: 600
|
18 |
+
heads: 20
|
19 |
+
use_mel_codes_as_input: true
|
20 |
+
mel_length_compression: 1024
|
21 |
+
layers: 24
|
22 |
+
number_text_tokens: 12000
|
23 |
+
number_mel_codes: 8194
|
24 |
+
start_mel_token: 8192
|
25 |
+
stop_mel_token: 8193
|
26 |
+
start_text_token: 0
|
27 |
+
stop_text_token: 1
|
28 |
+
train_solo_embeddings: false
|
29 |
+
condition_type: "conformer_perceiver"
|
30 |
+
condition_module:
|
31 |
+
output_size: 512
|
32 |
+
linear_units: 2048
|
33 |
+
attention_heads: 8
|
34 |
+
num_blocks: 6
|
35 |
+
input_layer: "conv2d2"
|
36 |
+
perceiver_mult: 2
|
37 |
+
|
38 |
+
vqvae:
|
39 |
+
channels: 100
|
40 |
+
num_tokens: 8192
|
41 |
+
hidden_dim: 512
|
42 |
+
num_resnet_blocks: 3
|
43 |
+
codebook_dim: 512
|
44 |
+
num_layers: 2
|
45 |
+
positional_dims: 1
|
46 |
+
kernel_size: 3
|
47 |
+
smooth_l1_loss: true
|
48 |
+
use_transposed_convs: false
|
49 |
+
|
50 |
+
bigvgan:
|
51 |
+
adam_b1: 0.8
|
52 |
+
adam_b2: 0.99
|
53 |
+
lr_decay: 0.999998
|
54 |
+
seed: 1234
|
55 |
+
|
56 |
+
resblock: "1"
|
57 |
+
upsample_rates: [4,4,4,4,2,2]
|
58 |
+
upsample_kernel_sizes: [8,8,4,4,4,4]
|
59 |
+
upsample_initial_channel: 1536
|
60 |
+
resblock_kernel_sizes: [3,7,11]
|
61 |
+
resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
|
62 |
+
feat_upsample: false
|
63 |
+
speaker_embedding_dim: 512
|
64 |
+
cond_d_vector_in_each_upsampling_layer: true
|
65 |
+
|
66 |
+
gpt_dim: 1280
|
67 |
+
|
68 |
+
activation: "snakebeta"
|
69 |
+
snake_logscale: true
|
70 |
+
|
71 |
+
use_cqtd_instead_of_mrd: true
|
72 |
+
cqtd_filters: 128
|
73 |
+
cqtd_max_filters: 1024
|
74 |
+
cqtd_filters_scale: 1
|
75 |
+
cqtd_dilations: [1, 2, 4]
|
76 |
+
cqtd_hop_lengths: [512, 256, 256]
|
77 |
+
cqtd_n_octaves: [9, 9, 9]
|
78 |
+
cqtd_bins_per_octaves: [24, 36, 48]
|
79 |
+
|
80 |
+
resolutions: [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]]
|
81 |
+
mpd_reshapes: [2, 3, 5, 7, 11]
|
82 |
+
use_spectral_norm: false
|
83 |
+
discriminator_channel_mult: 1
|
84 |
+
|
85 |
+
use_multiscale_melloss: true
|
86 |
+
lambda_melloss: 15
|
87 |
+
|
88 |
+
clip_grad_norm: 1000
|
89 |
+
|
90 |
+
segment_size: 16384
|
91 |
+
num_mels: 100
|
92 |
+
num_freq: 1025
|
93 |
+
n_fft: 1024
|
94 |
+
hop_size: 256
|
95 |
+
win_size: 1024
|
96 |
+
|
97 |
+
sampling_rate: 24000
|
98 |
+
|
99 |
+
fmin: 0
|
100 |
+
fmax: null
|
101 |
+
fmax_for_loss: null
|
102 |
+
mel_type: "pytorch"
|
103 |
+
|
104 |
+
num_workers: 2
|
105 |
+
dist_config:
|
106 |
+
dist_backend: "nccl"
|
107 |
+
dist_url: "tcp://localhost:54321"
|
108 |
+
world_size: 1
|
109 |
+
|
110 |
+
dvae_checkpoint: dvae.pth
|
111 |
+
gpt_checkpoint: gpt.pth
|
112 |
+
bigvgan_checkpoint: bigvgan_generator.pth
|
113 |
+
version: 1.5
|
dvae.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:69e841bf8cd97a32806ea8a439c50017c991ac9e8bb795db89ec47828cae4d5d
|
3 |
+
size 243316270
|
gpt.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:44460b820a8afd58f68f3d3e69113e7900c8730bf519ecf158c081f2b8991240
|
3 |
+
size 1171228980
|
unigram_12000.vocab
ADDED
The diff for this file is too large to render.
See raw diff
|
|