hexgrad committed 11 days ago

Commit

8c61023

verified ·

1 Parent(s): 601807f

Upload 25 files

Browse files

Files changed (26) hide show

.gitattributes +3 -0
config.json +207 -0
kokoro-v1_1-zh.pth +3 -0
samples/HEARME_en.wav +3 -0
samples/HEARME_zf_001.wav +3 -0
samples/HEARME_zm_010.wav +3 -0
samples/make_en.py +71 -0
samples/make_zh.py +86 -0
voices/af_maple.pt +3 -0
voices/af_sol.pt +3 -0
voices/bf_vale.pt +3 -0
voices/zf_001.pt +3 -0
voices/zf_002.pt +3 -0
voices/zf_003.pt +3 -0
voices/zf_004.pt +3 -0
voices/zf_005.pt +3 -0
voices/zf_006.pt +3 -0
voices/zf_007.pt +3 -0
voices/zf_008.pt +3 -0
voices/zm_009.pt +3 -0
voices/zm_010.pt +3 -0
voices/zm_011.pt +3 -0
voices/zm_012.pt +3 -0
voices/zm_013.pt +3 -0
voices/zm_014.pt +3 -0
voices/zm_015.pt +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+samples/HEARME_en.wav filter=lfs diff=lfs merge=lfs -text
+samples/HEARME_zf_001.wav filter=lfs diff=lfs merge=lfs -text
+samples/HEARME_zm_010.wav filter=lfs diff=lfs merge=lfs -text

config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "istftnet": {
+    "upsample_kernel_sizes": [20, 12],
+    "upsample_rates": [10, 6],
+    "gen_istft_hop_size": 5,
+    "gen_istft_n_fft": 20,
+    "resblock_dilation_sizes": [
+      [1, 3, 5],
+      [1, 3, 5],
+      [1, 3, 5]
+    ],
+    "resblock_kernel_sizes": [3, 7, 11],
+    "upsample_initial_channel": 512
+  },
+  "dim_in": 64,
+  "dropout": 0.2,
+  "hidden_dim": 512,
+  "max_conv_dim": 512,
+  "max_dur": 50,
+  "multispeaker": true,
+  "n_layer": 3,
+  "n_mels": 80,
+  "n_token": 178,
+  "style_dim": 128,
+  "text_encoder_kernel_size": 5,
+  "plbert": {
+    "hidden_size": 768,
+    "num_attention_heads": 12,
+    "intermediate_size": 2048,
+    "max_position_embeddings": 512,
+    "num_hidden_layers": 12,
+    "dropout": 0.1
+  },
+  "vocab": {
+    ";": 1,
+    ":": 2,
+    ",": 3,
+    ".": 4,
+    "!": 5,
+    "?": 6,
+    "/": 7,
+    "—": 9,
+    "…": 10,
+    "\"": 11,
+    "(": 12,
+    ")": 13,
+    "“": 14,
+    "”": 15,
+    " ": 16,
+    "\u0303": 17,
+    "ʣ": 18,
+    "ʥ": 19,
+    "ʦ": 20,
+    "ʨ": 21,
+    "ᵝ": 22,
+    "ㄓ": 23,
+    "A": 24,
+    "I": 25,
+    "ㄅ": 30,
+    "O": 31,
+    "ㄆ": 32,
+    "Q": 33,
+    "R": 34,
+    "S": 35,
+    "T": 36,
+    "ㄇ": 37,
+    "ㄈ": 38,
+    "W": 39,
+    "ㄉ": 40,
+    "Y": 41,
+    "ᵊ": 42,
+    "a": 43,
+    "b": 44,
+    "c": 45,
+    "d": 46,
+    "e": 47,
+    "f": 48,
+    "ㄊ": 49,
+    "h": 50,
+    "i": 51,
+    "j": 52,
+    "k": 53,
+    "l": 54,
+    "m": 55,
+    "n": 56,
+    "o": 57,
+    "p": 58,
+    "q": 59,
+    "r": 60,
+    "s": 61,
+    "t": 62,
+    "u": 63,
+    "v": 64,
+    "w": 65,
+    "x": 66,
+    "y": 67,
+    "z": 68,
+    "ɑ": 69,
+    "ɐ": 70,
+    "ɒ": 71,
+    "æ": 72,
+    "ㄋ": 73,
+    "ㄌ": 74,
+    "β": 75,
+    "ɔ": 76,
+    "ɕ": 77,
+    "ç": 78,
+    "ㄍ": 79,
+    "ɖ": 80,
+    "ð": 81,
+    "ʤ": 82,
+    "ə": 83,
+    "ㄎ": 84,
+    "ㄦ": 85,
+    "ɛ": 86,
+    "ɜ": 87,
+    "ㄏ": 88,
+    "ㄐ": 89,
+    "ɟ": 90,
+    "ㄑ": 91,
+    "ɡ": 92,
+    "ㄒ": 93,
+    "ㄔ": 94,
+    "ㄕ": 95,
+    "ㄗ": 96,
+    "ㄘ": 97,
+    "ㄙ": 98,
+    "月": 99,
+    "ㄚ": 100,
+    "ɨ": 101,
+    "ɪ": 102,
+    "ʝ": 103,
+    "ㄛ": 104,
+    "ㄝ": 105,
+    "ㄞ": 106,
+    "ㄟ": 107,
+    "ㄠ": 108,
+    "ㄡ": 109,
+    "ɯ": 110,
+    "ɰ": 111,
+    "ŋ": 112,
+    "ɳ": 113,
+    "ɲ": 114,
+    "ɴ": 115,
+    "ø": 116,
+    "ㄢ": 117,
+    "ɸ": 118,
+    "θ": 119,
+    "œ": 120,
+    "ㄣ": 121,
+    "ㄤ": 122,
+    "ɹ": 123,
+    "ㄥ": 124,
+    "ɾ": 125,
+    "ㄖ": 126,
+    "ㄧ": 127,
+    "ʁ": 128,
+    "ɽ": 129,
+    "ʂ": 130,
+    "ʃ": 131,
+    "ʈ": 132,
+    "ʧ": 133,
+    "ㄨ": 134,
+    "ʊ": 135,
+    "ʋ": 136,
+    "ㄩ": 137,
+    "ʌ": 138,
+    "ɣ": 139,
+    "ㄜ": 140,
+    "ㄭ": 141,
+    "χ": 142,
+    "ʎ": 143,
+    "十": 144,
+    "压": 145,
+    "言": 146,
+    "ʒ": 147,
+    "ʔ": 148,
+    "阳": 149,
+    "要": 150,
+    "阴": 151,
+    "应": 152,
+    "用": 153,
+    "又": 154,
+    "中": 155,
+    "ˈ": 156,
+    "ˌ": 157,
+    "ː": 158,
+    "穵": 159,
+    "外": 160,
+    "万": 161,
+    "ʰ": 162,
+    "王": 163,
+    "ʲ": 164,
+    "为": 165,
+    "文": 166,
+    "瓮": 167,
+    "我": 168,
+    "3": 169,
+    "5": 170,
+    "1": 171,
+    "2": 172,
+    "4": 173,
+    "元": 175,
+    "云": 176,
+    "ᵻ": 177
+  }
+}

kokoro-v1_1-zh.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b1d8410fa44dfb5c15471fd6c4225ea6b4e9ac7fa03c98e8bea47a9928476e2b
+size 327247856

samples/HEARME_en.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b759a65788991932d031d6fc8440f7a8efc402273fc1c2ca9d52ffd8a16a6666
+size 4528044

samples/HEARME_zf_001.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6c8685f06fd809ca2e892f8b71f3549d0640ab992b37648781f9138be33ef035
+size 4267644

samples/HEARME_zm_010.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:915d93163e2e5319370b539b72a90c69c214c143206024c086c57e5fbdd67484
+size 4253244

samples/make_en.py ADDED Viewed

	@@ -0,0 +1,71 @@

+# This file is hardcoded to transparently reproduce HEARME_en.wav
+# Therefore it may NOT generalize gracefully to other texts
+# Refer to Usage in README.md for more general usage patterns
+# pip install "kokoro>=0.8.1"
+from kokoro import KModel, KPipeline
+from pathlib import Path
+import numpy as np
+import soundfile as sf
+import torch
+import tqdm
+REPO_ID = 'hexgrad/Kokoro-82M-v1.1-zh'
+SAMPLE_RATE = 24000
+# How much silence to insert between paragraphs: 5000 is about 0.2 seconds
+N_ZEROS = 5000
+# Whether to join sentences in paragraphs 1 and 3
+JOIN_SENTENCES = True
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+texts = [(
+"[Kokoro](/kˈQkəɹQ/) is an open-weight series of small but powerful TTS models.",
+), (
+"This model is the result of a short training run that added 100 Chinese speakers from a professional dataset.",
+"The Chinese data was freely and permissively granted to us by LongMaoData, a professional dataset company. Thank you for making this model possible.",
+), (
+"Separately, some crowdsourced synthetic English data also entered the training mix:",
+"1 hour of Maple, an American female.",
+"1 hour of [Sol](/sˈOl/), another American female.",
+"And 1 hour of Vale, an older British female.",
+), (
+"This model is not a strict upgrade over its predecessor since it drops many voices, but it is released early to gather feedback on new voices and tokenization.",
+"Aside from the Chinese dataset and the 3 hours of English, the rest of the data was left behind for this training run.",
+"The goal is to push the model series forward and ultimately restore some of the voices that were left behind.",
+), (
+"Current guidance from the U.S. Copyright Office indicates that synthetic data generally does not qualify for copyright protection.",
+"Since this synthetic data is crowdsourced, the model trainer is not bound by any Terms of Service.",
+"This Apache licensed model also aligns with OpenAI's stated mission of broadly distributing the benefits of AI.",
+"If you would like to help further that mission, consider contributing permissive audio data to the cause.",
+)]
+if JOIN_SENTENCES:
+    # Merge the sentences of paragraphs 1 and 3 (0-based) into one
+    # space-separated utterance each.
+    for i in (1, 3):
+        texts[i] = [' '.join(texts[i])]
+# One shared model instance backs both English pipelines.
+model = KModel(repo_id=REPO_ID).to(device).eval()
+# Position 0 is the lang_code 'a' pipeline, position 1 the lang_code 'b' one,
+# so the list can be indexed directly with the `british` boolean below.
+en_pipelines = [KPipeline(lang_code=code, repo_id=REPO_ID, model=model) for code in ('a', 'b')]
+path = Path(__file__).parent
+wavs = []
+for paragraph in tqdm.tqdm(texts):
+    for i, sentence in enumerate(paragraph):
+        # Sentences that introduce Maple or Sol are spoken in that voice;
+        # every other sentence uses the British voice.
+        if 'Maple' in sentence:
+            voice, british = 'af_maple', False
+        elif 'Sol' in sentence:
+            voice, british = 'af_sol', False
+        else:
+            voice, british = 'bf_vale', True
+        # Only the first result yielded by the pipeline is used.
+        wav = next(en_pipelines[british](sentence, voice=voice)).audio
+        # Per-sentence clip, written before any silence is prepended.
+        sf.write(path / f'en{len(wavs):02}.wav', wav, SAMPLE_RATE)
+        # Prepend silence at the start of every paragraph except the first.
+        if i == 0 and wavs and N_ZEROS > 0:
+            wav = np.concatenate([np.zeros(N_ZEROS), wav])
+        wavs.append(wav)
+# Full sample: all (possibly padded) clips back to back.
+sf.write(path / 'HEARME_en.wav', np.concatenate(wavs), SAMPLE_RATE)

samples/make_zh.py ADDED Viewed

	@@ -0,0 +1,86 @@

+# This file is hardcoded to transparently reproduce HEARME_zf_001.wav (or HEARME_zm_010.wav, depending on VOICE)
+# Therefore it may NOT generalize gracefully to other texts
+# Refer to Usage in README.md for more general usage patterns
+# pip install "kokoro>=0.8.1" "misaki[zh]>=0.8.1"
+from kokoro import KModel, KPipeline
+from pathlib import Path
+import numpy as np
+import soundfile as sf
+import torch
+import tqdm
+REPO_ID = 'hexgrad/Kokoro-82M-v1.1-zh'
+SAMPLE_RATE = 24000
+# How much silence to insert between paragraphs: 5000 is about 0.2 seconds
+N_ZEROS = 5000
+# Whether to join sentences in paragraphs 1 and 3
+JOIN_SENTENCES = True
+VOICE = 'zf_001' if True else 'zm_010'
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+texts = [(
+"Kokoro 是一系列体积虽小但功能强大的 TTS 模型。",
+), (
+"该模型是经过短期训练的结果，从专业数据集中添加了100名中文使用者。",
+"中文数据由专业数据集公司「龙猫数据」免费且无偿地提供给我们。感谢你们让这个模型成为可能。",
+), (
+"另外，一些众包合成英语数据也进入了训练组合：",
+"1小时的 Maple，美国女性。",
+"1小时的 Sol，另一位美国女性。",
+"和1小时的 Vale，一位年长的英国女性。",
+), (
+"由于该模型删除了许多声音，因此它并不是对其前身的严格升级，但它提前发布以收集有关新声音和标记化的反馈。",
+"除了中文数据集和3小时的英语之外，其余数据都留在本次训练中。",
+"目标是推动模型系列的发展，并最终恢复一些被遗留的声音。",
+), (
+"美国版权局目前的指导表明，合成数据通常不符合版权保护的资格。",
+"由于这些合成数据是众包的，因此模型训练师不受任何服务条款的约束。",
+"该 Apache 许可模式也符合 OpenAI 所宣称的广泛传播 AI 优势的使命。",
+"如果您愿意帮助进一步完成这一使命，请考虑为此贡献许可的音频数据。",
+)]
+if JOIN_SENTENCES:
+    for i in (1, 3):
+        texts[i] = [''.join(texts[i])]
+en_pipeline = KPipeline(lang_code='a', repo_id=REPO_ID, model=False)
+def en_callable(text):
+    """Return the phoneme string for an English fragment.
+
+    'Kokoro' and 'Sol' have fixed pronunciations; any other text is run
+    through `en_pipeline` and the phonemes of its first result are returned.
+    """
+    # Hand-written pronunciations take priority over the pipeline.
+    for word, phonemes in (('Kokoro', 'kˈOkəɹO'), ('Sol', 'sˈOl')):
+        if text == word:
+            return phonemes
+    return next(en_pipeline(text)).phonemes
+# HACK: Mitigate rushing caused by lack of training data beyond ~100 tokens
+def speed_callable(len_ps):
+    """Map a phoneme-sequence length to a speed factor.
+
+    Piecewise linear and non-increasing in `len_ps`:
+      * up to 83        -> base 1
+      * 84 through 182  -> base falls linearly, 1 - (len_ps - 83) / 500
+      * 183 and beyond  -> base 0.8
+    The base value is then scaled by 1.1.
+    """
+    if len_ps <= 83:
+        base = 1
+    elif len_ps < 183:
+        base = 1 - (len_ps - 83) / 500
+    else:
+        base = 0.8
+    return base * 1.1
+# Load the model that is handed to the Chinese pipeline below. This
+# assignment must be live: `model` is passed to KPipeline on the next
+# statement and is defined nowhere else in this script.
+model = KModel(repo_id=REPO_ID).to(device).eval()
+zh_pipeline = KPipeline(lang_code='z', repo_id=REPO_ID, model=model, en_callable=en_callable)
+path = Path(__file__).parent
+wavs = []
+for paragraph in tqdm.tqdm(texts):
+    for i, sentence in enumerate(paragraph):
+        # Use the VOICE constant chosen at the top of the script; the speed
+        # is supplied as a callable (speed_callable) rather than a fixed number.
+        generator = zh_pipeline(sentence, voice=VOICE, speed=speed_callable)
+        f = path / f'zh{len(wavs):02}.wav'
+        # Only the first result yielded by the pipeline is used.
+        # NOTE(review): presumably each hardcoded sentence fits in one
+        # result; confirm before reusing this loop with other texts.
+        result = next(generator)
+        wav = result.audio
+        # Per-sentence clip, written before any silence is prepended.
+        sf.write(f, wav, SAMPLE_RATE)
+        # Prepend silence at the start of every paragraph except the first.
+        if i == 0 and wavs and N_ZEROS > 0:
+            wav = np.concatenate([np.zeros(N_ZEROS), wav])
+        wavs.append(wav)
+# Full sample, named after the selected voice (e.g. HEARME_zf_001.wav).
+sf.write(path / f'HEARME_{VOICE}.wav', np.concatenate(wavs), SAMPLE_RATE)

voices/af_maple.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f1211a6b94795d843cb7957568ccf2208e6ce76d2fbb36c7279b24e1be9b862f
+size 523425

voices/af_sol.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7d24aad751d7f62618506264c1cf3436276901447d85f1209231e9be29da4261
+size 523351

voices/bf_vale.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e66bc4578345d490985ce73c49464e6f6a9e7c58586b99a9ae14c988ae14e01f
+size 523420

voices/zf_001.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9bdc9a87e13e9bb1ea3e7803259c2ecbfebaeeb2ff80b5d0c76df1a464c1c962
+size 523331

voices/zf_002.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2c8cf221ff2e0915fc807cac5f233f42798ee8e2bd58bc5ad0259fd95e405a26
+size 523331

voices/zf_003.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ac28a59eefaa7e37b2aabffc792d40081392aa89d679b579859debf5209441a1
+size 523331

voices/zf_004.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6d50c3a87071a11d703d9d4ff7dd1f77fe6b8c5c3a9e60e81bc848816c0e959f
+size 523331

voices/zf_005.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:64d656103a908954496676529f4e8dee783afd4c8dccd1a9042cd8dbe05e39f4
+size 523331

voices/zf_006.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ef341ad2c4ec5dab3bf32daa0a70b8779c5aba10a9e18f57e5b6b29c7ec93d37
+size 523331

voices/zf_007.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:52a966710a29b50d9d11df15b5572c28062d2edf89585fe2c14abe281e2e49a8
+size 523331

voices/zf_008.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:361c1da6b087284a66c803d413225a09d57334ab515a93d5e16a2d553d9941f6
+size 523331

voices/zm_009.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eff75f26089f9f986b547985a420f901661057a951088ac7c7d8473a8d6327bd
+size 523331

voices/zm_010.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d2eeba86192eee269f600ca6821038034abd017532a1fe68ff7b0e86c2983b2a
+size 523331

voices/zm_011.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:978f00b0485869b2461249235429352767c661b0eeef65c37ae393a5c1531f46
+size 523331

voices/zm_012.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3a94386850fc6e115e298c50583ea8b38eabe415a138fca756cf4f14ca63c1b4
+size 523331

voices/zm_013.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ef528a5444ab0001bb32c8149ddf44a53412af615f81e050f95d5b05fd10c34a
+size 523331

voices/zm_014.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9eacf15e6b9b073e44f7e62f7ed8582ad774da6cbaaa5dd707839e8af3ba6855
+size 523331

voices/zm_015.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ad69ea6143d656dec64997412c7cc60d3fd9e6f7bc27b5bdafebc6bcf3c70a68
+size 523331