add 0528 model
Browse files- README.md +11 -0
- asset/DVAE.pt +3 -0
- asset/Decoder.pt +3 -0
- asset/GPT.pt +3 -0
- asset/Vocos.pt +3 -0
- asset/tokenizer.pt +3 -0
- config/decoder.yaml +12 -0
- config/dvae.yaml +14 -0
- config/gpt.yaml +20 -0
- config/path.yaml +11 -0
- config/vocos.yaml +24 -0
README.md
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: cc-by-nc-4.0
|
3 |
+
---
|
4 |
+
|
5 |
+
Disclaimer: For Academic Purposes Only
|
6 |
+
|
7 |
+
The information provided in this document is for academic purposes only. It is intended for educational and research use, and should not be used for any commercial or legal purposes. The authors do not guarantee the accuracy, completeness, or reliability of the information.
|
8 |
+
|
9 |
+
免责声明:仅供学术交流
|
10 |
+
|
11 |
+
本文件中的信息仅供学术交流使用。其目的是用于教育和研究,不得用于任何商业或法律目的。作者不保证信息的准确性、完整性或可靠性。
|
asset/DVAE.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:613cb128adf89188c93ea5880ea0b798e66b1fe6186d0c535d99bcd87bfd6976
|
3 |
+
size 27749823
|
asset/Decoder.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9964e36e840f0e3a748c5f716fe6de6490d2135a5f5155f4a642d51860e2ec38
|
3 |
+
size 103718156
|
asset/GPT.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d7d4ee6461ea097a2be23eb40d73fb94ad3b3d39cb64fbb50cb3357fd466cadb
|
3 |
+
size 900746442
|
asset/Vocos.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:09a670eda1c08b740013679c7a90ebb7f1a97646ea7673069a6838e6b51d6c58
|
3 |
+
size 54363119
|
asset/tokenizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e911ae7c6a7c27953433f35c44227a67838fe229a1f428503bdb6cd3d1bcc69c
|
3 |
+
size 336680
|
config/decoder.yaml
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
dim: 384
|
4 |
+
|
5 |
+
decoder_config:
|
6 |
+
  idim: ${dim}
|
7 |
+
  odim: ${dim}
|
8 |
+
  hidden: 512
|
9 |
+
  n_layer: 12
|
10 |
+
  bn_dim: 128
|
11 |
+
|
12 |
+
vq_config: null
|
config/dvae.yaml
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
dim: 512
|
4 |
+
decoder_config:
|
5 |
+
  idim: ${dim}
|
6 |
+
  odim: ${dim}
|
7 |
+
  n_layer: 12
|
8 |
+
  bn_dim: 128
|
9 |
+
|
10 |
+
vq_config:
|
11 |
+
  dim: 1024
|
12 |
+
  levels: [5,5,5,5]
|
13 |
+
  G: 2
|
14 |
+
  R: 2
|
config/gpt.yaml
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
num_audio_tokens: 626
|
4 |
+
num_text_tokens: 21178
|
5 |
+
|
6 |
+
gpt_config:
|
7 |
+
  hidden_size: 768
|
8 |
+
  intermediate_size: 3072
|
9 |
+
  num_attention_heads: 12
|
10 |
+
  num_hidden_layers: 20
|
11 |
+
  use_cache: False
|
12 |
+
  max_position_embeddings: 4096
|
13 |
+
  # attn_implementation: flash_attention_2
|
14 |
+
|
15 |
+
  spk_emb_dim: 192
|
16 |
+
  spk_KL: False
|
17 |
+
  num_audio_tokens: 626
|
18 |
+
  num_text_tokens: null
|
19 |
+
  num_vq: 4
|
20 |
+
|
config/path.yaml
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
vocos_config_path: config/vocos.yaml
|
4 |
+
vocos_ckpt_path: asset/Vocos.pt
|
5 |
+
dvae_config_path: config/dvae.yaml
|
6 |
+
dvae_ckpt_path: asset/DVAE.pt
|
7 |
+
gpt_config_path: config/gpt.yaml
|
8 |
+
gpt_ckpt_path: asset/GPT.pt
|
9 |
+
decoder_config_path: config/decoder.yaml
|
10 |
+
decoder_ckpt_path: asset/Decoder.pt
|
11 |
+
tokenizer_path: asset/tokenizer.pt
|
config/vocos.yaml
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
feature_extractor:
|
2 |
+
  class_path: vocos.feature_extractors.MelSpectrogramFeatures
|
3 |
+
  init_args:
|
4 |
+
    sample_rate: 24000
|
5 |
+
    n_fft: 1024
|
6 |
+
    hop_length: 256
|
7 |
+
    n_mels: 100
|
8 |
+
    padding: center
|
9 |
+
|
10 |
+
backbone:
|
11 |
+
  class_path: vocos.models.VocosBackbone
|
12 |
+
  init_args:
|
13 |
+
    input_channels: 100
|
14 |
+
    dim: 512
|
15 |
+
    intermediate_dim: 1536
|
16 |
+
    num_layers: 8
|
17 |
+
|
18 |
+
head:
|
19 |
+
  class_path: vocos.heads.ISTFTHead
|
20 |
+
  init_args:
|
21 |
+
    dim: 512
|
22 |
+
    n_fft: 1024
|
23 |
+
    hop_length: 256
|
24 |
+
    padding: center
|