Text-to-Speech
hexgrad committed on
Commit
8c61023
·
verified ·
1 Parent(s): 601807f

Upload 25 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ samples/HEARME_en.wav filter=lfs diff=lfs merge=lfs -text
37
+ samples/HEARME_zf_001.wav filter=lfs diff=lfs merge=lfs -text
38
+ samples/HEARME_zm_010.wav filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "istftnet": {
3
+ "upsample_kernel_sizes": [20, 12],
4
+ "upsample_rates": [10, 6],
5
+ "gen_istft_hop_size": 5,
6
+ "gen_istft_n_fft": 20,
7
+ "resblock_dilation_sizes": [
8
+ [1, 3, 5],
9
+ [1, 3, 5],
10
+ [1, 3, 5]
11
+ ],
12
+ "resblock_kernel_sizes": [3, 7, 11],
13
+ "upsample_initial_channel": 512
14
+ },
15
+ "dim_in": 64,
16
+ "dropout": 0.2,
17
+ "hidden_dim": 512,
18
+ "max_conv_dim": 512,
19
+ "max_dur": 50,
20
+ "multispeaker": true,
21
+ "n_layer": 3,
22
+ "n_mels": 80,
23
+ "n_token": 178,
24
+ "style_dim": 128,
25
+ "text_encoder_kernel_size": 5,
26
+ "plbert": {
27
+ "hidden_size": 768,
28
+ "num_attention_heads": 12,
29
+ "intermediate_size": 2048,
30
+ "max_position_embeddings": 512,
31
+ "num_hidden_layers": 12,
32
+ "dropout": 0.1
33
+ },
34
+ "vocab": {
35
+ ";": 1,
36
+ ":": 2,
37
+ ",": 3,
38
+ ".": 4,
39
+ "!": 5,
40
+ "?": 6,
41
+ "/": 7,
42
+ "—": 9,
43
+ "…": 10,
44
+ "\"": 11,
45
+ "(": 12,
46
+ ")": 13,
47
+ "“": 14,
48
+ "”": 15,
49
+ " ": 16,
50
+ "\u0303": 17,
51
+ "ʣ": 18,
52
+ "ʥ": 19,
53
+ "ʦ": 20,
54
+ "ʨ": 21,
55
+ "ᵝ": 22,
56
+ "ㄓ": 23,
57
+ "A": 24,
58
+ "I": 25,
59
+ "ㄅ": 30,
60
+ "O": 31,
61
+ "ㄆ": 32,
62
+ "Q": 33,
63
+ "R": 34,
64
+ "S": 35,
65
+ "T": 36,
66
+ "ㄇ": 37,
67
+ "ㄈ": 38,
68
+ "W": 39,
69
+ "ㄉ": 40,
70
+ "Y": 41,
71
+ "ᵊ": 42,
72
+ "a": 43,
73
+ "b": 44,
74
+ "c": 45,
75
+ "d": 46,
76
+ "e": 47,
77
+ "f": 48,
78
+ "ㄊ": 49,
79
+ "h": 50,
80
+ "i": 51,
81
+ "j": 52,
82
+ "k": 53,
83
+ "l": 54,
84
+ "m": 55,
85
+ "n": 56,
86
+ "o": 57,
87
+ "p": 58,
88
+ "q": 59,
89
+ "r": 60,
90
+ "s": 61,
91
+ "t": 62,
92
+ "u": 63,
93
+ "v": 64,
94
+ "w": 65,
95
+ "x": 66,
96
+ "y": 67,
97
+ "z": 68,
98
+ "ɑ": 69,
99
+ "ɐ": 70,
100
+ "ɒ": 71,
101
+ "æ": 72,
102
+ "ㄋ": 73,
103
+ "ㄌ": 74,
104
+ "β": 75,
105
+ "ɔ": 76,
106
+ "ɕ": 77,
107
+ "ç": 78,
108
+ "ㄍ": 79,
109
+ "ɖ": 80,
110
+ "ð": 81,
111
+ "ʤ": 82,
112
+ "ə": 83,
113
+ "ㄎ": 84,
114
+ "ㄦ": 85,
115
+ "ɛ": 86,
116
+ "ɜ": 87,
117
+ "ㄏ": 88,
118
+ "ㄐ": 89,
119
+ "ɟ": 90,
120
+ "ㄑ": 91,
121
+ "ɡ": 92,
122
+ "ㄒ": 93,
123
+ "ㄔ": 94,
124
+ "ㄕ": 95,
125
+ "ㄗ": 96,
126
+ "ㄘ": 97,
127
+ "ㄙ": 98,
128
+ "月": 99,
129
+ "ㄚ": 100,
130
+ "ɨ": 101,
131
+ "ɪ": 102,
132
+ "ʝ": 103,
133
+ "ㄛ": 104,
134
+ "ㄝ": 105,
135
+ "ㄞ": 106,
136
+ "ㄟ": 107,
137
+ "ㄠ": 108,
138
+ "ㄡ": 109,
139
+ "ɯ": 110,
140
+ "ɰ": 111,
141
+ "ŋ": 112,
142
+ "ɳ": 113,
143
+ "ɲ": 114,
144
+ "ɴ": 115,
145
+ "ø": 116,
146
+ "ㄢ": 117,
147
+ "ɸ": 118,
148
+ "θ": 119,
149
+ "œ": 120,
150
+ "ㄣ": 121,
151
+ "ㄤ": 122,
152
+ "ɹ": 123,
153
+ "ㄥ": 124,
154
+ "ɾ": 125,
155
+ "ㄖ": 126,
156
+ "ㄧ": 127,
157
+ "ʁ": 128,
158
+ "ɽ": 129,
159
+ "ʂ": 130,
160
+ "ʃ": 131,
161
+ "ʈ": 132,
162
+ "ʧ": 133,
163
+ "ㄨ": 134,
164
+ "ʊ": 135,
165
+ "ʋ": 136,
166
+ "ㄩ": 137,
167
+ "ʌ": 138,
168
+ "ɣ": 139,
169
+ "ㄜ": 140,
170
+ "ㄭ": 141,
171
+ "χ": 142,
172
+ "ʎ": 143,
173
+ "十": 144,
174
+ "压": 145,
175
+ "言": 146,
176
+ "ʒ": 147,
177
+ "ʔ": 148,
178
+ "阳": 149,
179
+ "要": 150,
180
+ "阴": 151,
181
+ "应": 152,
182
+ "用": 153,
183
+ "又": 154,
184
+ "中": 155,
185
+ "ˈ": 156,
186
+ "ˌ": 157,
187
+ "ː": 158,
188
+ "穵": 159,
189
+ "外": 160,
190
+ "万": 161,
191
+ "ʰ": 162,
192
+ "王": 163,
193
+ "ʲ": 164,
194
+ "为": 165,
195
+ "文": 166,
196
+ "瓮": 167,
197
+ "我": 168,
198
+ "3": 169,
199
+ "5": 170,
200
+ "1": 171,
201
+ "2": 172,
202
+ "4": 173,
203
+ "元": 175,
204
+ "云": 176,
205
+ "ᵻ": 177
206
+ }
207
+ }
kokoro-v1_1-zh.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1d8410fa44dfb5c15471fd6c4225ea6b4e9ac7fa03c98e8bea47a9928476e2b
3
+ size 327247856
samples/HEARME_en.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b759a65788991932d031d6fc8440f7a8efc402273fc1c2ca9d52ffd8a16a6666
3
+ size 4528044
samples/HEARME_zf_001.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c8685f06fd809ca2e892f8b71f3549d0640ab992b37648781f9138be33ef035
3
+ size 4267644
samples/HEARME_zm_010.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:915d93163e2e5319370b539b72a90c69c214c143206024c086c57e5fbdd67484
3
+ size 4253244
samples/make_en.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file is hardcoded to transparently reproduce HEARME_en.wav
2
+ # Therefore it may NOT generalize gracefully to other texts
3
+ # Refer to Usage in README.md for more general usage patterns
4
+
5
+ # pip install "kokoro>=0.8.1"
6
+ from kokoro import KModel, KPipeline
7
+ from pathlib import Path
8
+ import numpy as np
9
+ import soundfile as sf
10
+ import torch
11
+ import tqdm
12
+
13
REPO_ID = 'hexgrad/Kokoro-82M-v1.1-zh'
SAMPLE_RATE = 24000

# Number of zero samples prepended at each paragraph boundary:
# 5000 is about 0.2 seconds
N_ZEROS = 5000

# Whether to join sentences in paragraphs 1 and 3 (0-based indices into texts)
JOIN_SENTENCES = True

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# One tuple per paragraph, one string per sentence.
texts = [
    (
        "[Kokoro](/kˈQkəɹQ/) is an open-weight series of small but powerful TTS models.",
    ),
    (
        "This model is the result of a short training run that added 100 Chinese speakers from a professional dataset.",
        "The Chinese data was freely and permissively granted to us by LongMaoData, a professional dataset company. Thank you for making this model possible.",
    ),
    (
        "Separately, some crowdsourced synthetic English data also entered the training mix:",
        "1 hour of Maple, an American female.",
        "1 hour of [Sol](/sˈOl/), another American female.",
        "And 1 hour of Vale, an older British female.",
    ),
    (
        "This model is not a strict upgrade over its predecessor since it drops many voices, but it is released early to gather feedback on new voices and tokenization.",
        "Aside from the Chinese dataset and the 3 hours of English, the rest of the data was left behind for this training run.",
        "The goal is to push the model series forward and ultimately restore some of the voices that were left behind.",
    ),
    (
        "Current guidance from the U.S. Copyright Office indicates that synthetic data generally does not qualify for copyright protection.",
        "Since this synthetic data is crowdsourced, the model trainer is not bound by any Terms of Service.",
        "This Apache licensed model also aligns with OpenAI's stated mission of broadly distributing the benefits of AI.",
        "If you would like to help further that mission, consider contributing permissive audio data to the cause.",
    ),
]

if JOIN_SENTENCES:
    # Collapse each selected paragraph into a single space-joined sentence.
    for idx in (1, 3):
        texts[idx] = [' '.join(texts[idx])]

model = KModel(repo_id=REPO_ID).to(device).eval()

# Both pipelines share the one model. Position 0 uses lang_code 'a' and
# position 1 uses lang_code 'b', so the `british` bool below indexes the list.
en_pipelines = [
    KPipeline(lang_code=lang_code, repo_id=REPO_ID, model=model)
    for lang_code in ('a', 'b')
]

path = Path(__file__).parent

wavs = []
for paragraph in tqdm.tqdm(texts):
    for i, sentence in enumerate(paragraph):
        # Pick the voice from the speaker name mentioned in the sentence;
        # anything else falls back to bf_vale on the 'b' pipeline.
        if 'Maple' in sentence:
            voice, british = 'af_maple', False
        elif 'Sol' in sentence:
            voice, british = 'af_sol', False
        else:
            voice, british = 'bf_vale', True

        pipeline = en_pipelines[british]
        f = path / f'en{len(wavs):02}.wav'
        # Only the first result yielded by the pipeline is consumed.
        wav = next(pipeline(sentence, voice=voice)).audio
        # The per-sentence file is written without the leading silence.
        sf.write(f, wav, SAMPLE_RATE)

        # Pad the first sentence of every paragraph except the very first one.
        if i == 0 and wavs and N_ZEROS > 0:
            wav = np.concatenate([np.zeros(N_ZEROS), wav])
        wavs.append(wav)

sf.write(path / 'HEARME_en.wav', np.concatenate(wavs), SAMPLE_RATE)
samples/make_zh.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file is hardcoded to transparently reproduce HEARME_zf_001.wav (or HEARME_zm_010.wav, depending on VOICE)
2
+ # Therefore it may NOT generalize gracefully to other texts
3
+ # Refer to Usage in README.md for more general usage patterns
4
+
5
+ # pip install "kokoro>=0.8.1" "misaki[zh]>=0.8.1"
6
+ from kokoro import KModel, KPipeline
7
+ from pathlib import Path
8
+ import numpy as np
9
+ import soundfile as sf
10
+ import torch
11
+ import tqdm
12
+
13
REPO_ID = 'hexgrad/Kokoro-82M-v1.1-zh'
SAMPLE_RATE = 24000

# Number of zero samples prepended at each paragraph boundary:
# 5000 is about 0.2 seconds
N_ZEROS = 5000

# Whether to join sentences in paragraphs 1 and 3 (0-based indices into texts)
JOIN_SENTENCES = True

# Manual toggle: flip the literal to False to select 'zm_010' instead.
VOICE = 'zf_001' if True else 'zm_010'

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# One tuple per paragraph, one string per sentence.
texts = [
    (
        "Kokoro 是一系列体积虽小但功能强大的 TTS 模型。",
    ),
    (
        "该模型是经过短期训练的结果,从专业数据集中添加了100名中文使用者。",
        "中文数据由专业数据集公司「龙猫数据」免费且无偿地提供给我们。感谢你们让这个模型成为可能。",
    ),
    (
        "另外,一些众包合成英语数据也进入了训练组合:",
        "1小时的 Maple,美国女性。",
        "1小时的 Sol,另一位美国女性。",
        "和1小时的 Vale,一位年长的英国女性。",
    ),
    (
        "由于该模型删除了许多声音,因此它并不是对其前身的严格升级,但它提前发布以收集有关新声音和标记化的反馈。",
        "除了中文数据集和3小时的英语之外,其余数据都留在本次训练中。",
        "目标是推动模型系列的发展,并最终恢复一些被遗留的声音。",
    ),
    (
        "美国版权局目前的指导表明,合成数据通常不符合版权保护的资格。",
        "由于这些合成数据是众包的,因此模型训练师不受任何服务条款的约束。",
        "该 Apache 许可模式也符合 OpenAI 所宣称的广泛传播 AI 优势的使命。",
        "如果您愿意帮助进一步完成这一使命,请考虑为此贡献许可的音频数据。",
    ),
]

if JOIN_SENTENCES:
    # Collapse each selected paragraph into one sentence (no separator).
    for idx in (1, 3):
        texts[idx] = [''.join(texts[idx])]

# English pipeline created with model=False; in this script it is only read
# for its .phonemes output (see en_callable).
# NOTE(review): presumably model=False skips loading weights - confirm against kokoro docs.
en_pipeline = KPipeline(lang_code='a', repo_id=REPO_ID, model=False)
52
def en_callable(text):
    """Return the phoneme string for an English `text` fragment.

    'Kokoro' and 'Sol' map to fixed, hand-written phoneme strings; any other
    input is run through `en_pipeline` and the phonemes of its first result
    are returned.
    """
    overrides = (
        ('Kokoro', 'kˈOkəɹO'),
        ('Sol', 'sˈOl'),
    )
    for word, phonemes in overrides:
        if text == word:
            return phonemes
    first_result = next(en_pipeline(text))
    return first_result.phonemes
58
+
59
+ # HACK: Mitigate rushing caused by lack of training data beyond ~100 tokens
60
+ # Simple piecewise linear fn that decreases speed as len_ps increases
61
def speed_callable(len_ps):
    """Return the speed to use for an input of length `len_ps`.

    The base factor is 1 up to and including 83, falls linearly to 0.8
    between 83 and 183, and stays at 0.8 from 183 on. The returned value is
    that factor multiplied by 1.1.
    """
    if len_ps <= 83:
        factor = 1
    elif len_ps < 183:
        # Linear ramp: loses 1/500 of speed per unit of length past 83.
        factor = 1 - (len_ps - 83) / 500
    else:
        factor = 0.8
    return factor * 1.1
68
+
69
# The model must actually be constructed: it is passed to KPipeline just
# below, and leaving this line commented out raises NameError on `model`.
model = KModel(repo_id=REPO_ID).to(device).eval()
# en_callable is handed to the Chinese pipeline.
# NOTE(review): presumably it is invoked for English words embedded in the text - confirm.
zh_pipeline = KPipeline(lang_code='z', repo_id=REPO_ID, model=model, en_callable=en_callable)

path = Path(__file__).parent

wavs = []
for paragraph in tqdm.tqdm(texts):
    for i, sentence in enumerate(paragraph):
        # Use the module-level VOICE constant; a lowercase `voice` name is
        # never defined in this script.
        generator = zh_pipeline(sentence, voice=VOICE, speed=speed_callable)
        f = path / f'zh{len(wavs):02}.wav'
        # Only the first result yielded by the pipeline is consumed.
        result = next(generator)
        wav = result.audio
        # The per-sentence file is written without the leading silence.
        sf.write(f, wav, SAMPLE_RATE)
        # Pad the first sentence of every paragraph except the very first one.
        if i == 0 and wavs and N_ZEROS > 0:
            wav = np.concatenate([np.zeros(N_ZEROS), wav])
        wavs.append(wav)

# The output name follows the selected voice, e.g. HEARME_zf_001.wav.
sf.write(path / f'HEARME_{VOICE}.wav', np.concatenate(wavs), SAMPLE_RATE)
voices/af_maple.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1211a6b94795d843cb7957568ccf2208e6ce76d2fbb36c7279b24e1be9b862f
3
+ size 523425
voices/af_sol.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d24aad751d7f62618506264c1cf3436276901447d85f1209231e9be29da4261
3
+ size 523351
voices/bf_vale.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e66bc4578345d490985ce73c49464e6f6a9e7c58586b99a9ae14c988ae14e01f
3
+ size 523420
voices/zf_001.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9bdc9a87e13e9bb1ea3e7803259c2ecbfebaeeb2ff80b5d0c76df1a464c1c962
3
+ size 523331
voices/zf_002.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c8cf221ff2e0915fc807cac5f233f42798ee8e2bd58bc5ad0259fd95e405a26
3
+ size 523331
voices/zf_003.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac28a59eefaa7e37b2aabffc792d40081392aa89d679b579859debf5209441a1
3
+ size 523331
voices/zf_004.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d50c3a87071a11d703d9d4ff7dd1f77fe6b8c5c3a9e60e81bc848816c0e959f
3
+ size 523331
voices/zf_005.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64d656103a908954496676529f4e8dee783afd4c8dccd1a9042cd8dbe05e39f4
3
+ size 523331
voices/zf_006.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef341ad2c4ec5dab3bf32daa0a70b8779c5aba10a9e18f57e5b6b29c7ec93d37
3
+ size 523331
voices/zf_007.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52a966710a29b50d9d11df15b5572c28062d2edf89585fe2c14abe281e2e49a8
3
+ size 523331
voices/zf_008.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:361c1da6b087284a66c803d413225a09d57334ab515a93d5e16a2d553d9941f6
3
+ size 523331
voices/zm_009.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eff75f26089f9f986b547985a420f901661057a951088ac7c7d8473a8d6327bd
3
+ size 523331
voices/zm_010.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2eeba86192eee269f600ca6821038034abd017532a1fe68ff7b0e86c2983b2a
3
+ size 523331
voices/zm_011.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:978f00b0485869b2461249235429352767c661b0eeef65c37ae393a5c1531f46
3
+ size 523331
voices/zm_012.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a94386850fc6e115e298c50583ea8b38eabe415a138fca756cf4f14ca63c1b4
3
+ size 523331
voices/zm_013.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef528a5444ab0001bb32c8149ddf44a53412af615f81e050f95d5b05fd10c34a
3
+ size 523331
voices/zm_014.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9eacf15e6b9b073e44f7e62f7ed8582ad774da6cbaaa5dd707839e8af3ba6855
3
+ size 523331
voices/zm_015.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad69ea6143d656dec64997412c7cc60d3fd9e6f7bc27b5bdafebc6bcf3c70a68
3
+ size 523331