Spaces: Running on Zero
Upload 3 files
app.py CHANGED
@@ -7,6 +7,7 @@ import numpy as np
 import os
 import phonemizer
 import random
+import re
 import spaces
 import torch
 import yaml
@@ -32,17 +33,23 @@ def normalize(text):
     text = text.replace('Mr.', 'Mister')
     text = text.replace('Ms.', 'Miss')
     text = text.replace('Mrs.', 'Mrs')
-    …
+    text = text.replace(chr(8216), "'").replace(chr(8217), "'")
+    text = text.replace(chr(8220), '"').replace(chr(8221), '"')
+    text = re.sub(r'[^\S \n]', ' ', text)
+    text = re.sub(r' +', ' ', text)
+    text = re.sub(r'(?<=\n) +(?=\n)', '', text)
+    return parens_to_angles(text).strip()
 
 phonemizers = dict(
     a=phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True),
     b=phonemizer.backend.EspeakBackend(language='en-gb', preserve_punctuation=True, with_stress=True),
-    j=Katsu()
+    j=Katsu(),
 )
 
-def phonemize(text, voice):
+def phonemize(text, voice, norm=True):
     lang = voice[0]
-    …
+    if norm:
+        text = normalize(text)
     ps = phonemizers[lang].phonemize([text])
     ps = ps[0] if ps else ''
     # TODO: Custom phonemization rules?
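Note on the normalize() additions: chr(8216)/chr(8217) are the left and right single curly quotes and chr(8220)/chr(8221) the double ones, so the first two new lines fold smart quotes to ASCII before phonemization; the regexes then turn non-space whitespace (tabs etc.) into spaces and collapse runs of spaces. A minimal sanity check of that behavior (illustrative snippet, not part of the commit):

    import re
    s = '\u2018hi\u2019\t\u201cthere\u201d'
    s = s.replace(chr(8216), "'").replace(chr(8217), "'")  # fold single curly quotes
    s = s.replace(chr(8220), '"').replace(chr(8221), '"')  # fold double curly quotes
    s = re.sub(r'[^\S \n]', ' ', s)  # whitespace other than space/newline -> space
    s = re.sub(r' +', ' ', s)        # collapse runs of spaces
    assert s == '\'hi\' "there"'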
@@ -50,6 +57,8 @@ def phonemize(text, voice):
     # https://en.wiktionary.org/wiki/kokoro#English
     ps = ps.replace('kəkˈoːɹoʊ', 'kˈoʊkəɹoʊ').replace('kəkˈɔːɹəʊ', 'kˈəʊkəɹəʊ')
     ps = ''.join(filter(lambda p: p in VOCAB, ps))
+    if lang == 'j' and any(p in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' for p in ps):
+        gr.Warning('Japanese tokenizer does not handle English letters.')
     return ps.strip()
 
 def length_to_mask(lengths):
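For context, the unchanged filter line above keeps only symbols present in VOCAB; the new warning fires when uppercase Latin letters survive that filter under the Japanese backend, since Katsu passes English letters through rather than phonemizing them. The filter itself behaves like this (toy VOCAB, illustrative only):

    VOCAB = {'a': 1, 'b': 2}
    assert ''.join(filter(lambda p: p in VOCAB, 'aXb')) == 'ab'  # 'X' is dropped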
@@ -69,11 +78,19 @@ def get_vocab():
     return dicts
 
 VOCAB = get_vocab()
+
+def tokenize(ps):
+    return [i for i in map(VOCAB.get, ps) if i is not None]
+
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
 
 snapshot = snapshot_download(repo_id='hexgrad/kokoro', allow_patterns=['*.pt', '*.pth', '*.yml'], use_auth_token=os.environ['TOKEN'])
 config = yaml.safe_load(open(os.path.join(snapshot, 'config.yml')))
 model = build_model(config['model_params'])
+for key, value in model.items():
+    for module in value.children():
+        if isinstance(module, torch.nn.RNNBase):
+            module.flatten_parameters()
 _ = [model[key].eval() for key in model]
 _ = [model[key].to(device) for key in model]
 for key, state_dict in torch.load(os.path.join(snapshot, 'net.pth'), map_location='cpu', weights_only=True)['net'].items():
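Two additions worth flagging here. tokenize() is the phoneme-to-id mapping used throughout the rest of the diff: VOCAB.get returns None for unknown symbols and the comprehension drops them. A toy check (hypothetical VOCAB, not the real table):

    VOCAB = {'a': 5, 'b': 6}
    def tokenize(ps):
        return [i for i in map(VOCAB.get, ps) if i is not None]
    assert tokenize('ab?') == [5, 6]  # '?' is not in this toy VOCAB, so it is dropped

The flatten_parameters() loop compacts the LSTM weights into contiguous memory, which avoids the cuDNN non-contiguous-weights warning on GPU; on CPU it is effectively a no-op.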
@@ -113,42 +130,45 @@ def s_curve(p):
 
 SAMPLE_RATE = 24000
 
-@spaces.GPU(duration=…
+@spaces.GPU(duration=1)
 @torch.no_grad()
 def forward(tokens, voice, speed):
+    ref_s = VOICES[voice]
     tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
     input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
     text_mask = length_to_mask(input_lengths).to(device)
     bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
     d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
-    ref_s = VOICES[voice]
     s = ref_s[:, 128:]
     d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
     x, _ = model.predictor.lstm(d)
     duration = model.predictor.duration_proj(x)
     duration = torch.sigmoid(duration).sum(axis=-1) / speed
-    pred_dur = torch.round(duration…
-    pred_aln_trg = torch.zeros(input_lengths,…
+    pred_dur = torch.round(duration).clamp(min=1).long()
+    pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
     c_frame = 0
     for i in range(pred_aln_trg.size(0)):
-        pred_aln_trg[i, c_frame:c_frame +…
-        c_frame +=…
-    en =…
+        pred_aln_trg[i, c_frame:c_frame + pred_dur[0,i].item()] = 1
+        c_frame += pred_dur[0,i].item()
+    en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
     F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
     t_en = model.text_encoder(tokens, input_lengths, text_mask)
-    asr =…
-    …
-    return out.squeeze().cpu().numpy()
+    asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
+    return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
 
-def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=5000,…
+def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000):
     ps = ps or phonemize(text, voice)
-    tokens =…
+    tokens = tokenize(ps)
     if not tokens:
         return (None, '')
     elif len(tokens) > 510:
         tokens = tokens[:510]
         ps = ''.join(next(k for k, v in VOCAB.items() if i == v) for i in tokens)
-    …
+    try:
+        out = forward(tokens, voice, speed)
+    except gr.exceptions.Error as e:
+        raise gr.Error(e)
+        return (None, '')
     if reduce_noise > 0:
         out = nr.reduce_noise(y=out, sr=SAMPLE_RATE, prop_decrease=reduce_noise, n_fft=512)
     opening_cut = max(0, int(opening_cut / speed))
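The reconstructed middle of forward() is the usual duration-to-alignment expansion: pred_dur holds an integer frame count per token (clamped to at least 1), and pred_aln_trg is a hard 0/1 alignment matrix with one row per token and one contiguous run of ones per row. A toy illustration of the loop (assumed shapes, not the real model tensors):

    import torch
    pred_dur = torch.tensor([[2, 3]])  # token 0 -> 2 frames, token 1 -> 3 frames
    pred_aln_trg = torch.zeros(2, int(pred_dur.sum()))
    c_frame = 0
    for i in range(pred_aln_trg.size(0)):
        pred_aln_trg[i, c_frame:c_frame + pred_dur[0, i].item()] = 1
        c_frame += pred_dur[0, i].item()
    assert pred_aln_trg.tolist() == [[1, 1, 0, 0, 0], [0, 0, 1, 1, 1]]

One small observation on generate(): the return (None, '') after raise gr.Error(e) is unreachable as written.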
@@ -156,7 +176,7 @@ def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=5000
     out[:opening_cut] = 0
     closing_cut = max(0, int(closing_cut / speed))
     if closing_cut > 0:
-        out…
+        out[-closing_cut:] = 0
     ease_in = min(int(ease_in / speed), len(out)//2 - opening_cut)
     for i in range(ease_in):
         out[i+opening_cut] *= s_curve(i / ease_in)
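The restored line mirrors the opening cut at the tail of the clip: out[-closing_cut:] = 0 zeroes the final closing_cut samples of the NumPy buffer. For instance:

    import numpy as np
    out = np.ones(8)
    out[-3:] = 0
    assert out.tolist() == [1, 1, 1, 1, 1, 0, 0, 0]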
@@ -165,7 +185,7 @@ def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=5000
         out[-i-1-closing_cut] *= s_curve(i / ease_out)
     return ((SAMPLE_RATE, out), ps)
 
-with gr.Blocks() as demo:
+with gr.Blocks() as basic_tts:
     with gr.Row():
         with gr.Column():
             text = gr.Textbox(label='Input Text')
@@ -174,32 +194,196 @@ with gr.Blocks() as demo:
             random_btn = gr.Button('Random Text', variant='secondary')
             generate_btn = gr.Button('Generate', variant='primary')
             random_btn.click(get_random_text, inputs=[voice], outputs=[text])
-            with gr.Accordion('Input…
-                in_ps = gr.Textbox(show_label=False, info='Override the input text with custom…
+            with gr.Accordion('Input Tokens', open=False):
+                in_ps = gr.Textbox(show_label=False, info='Override the input text with custom phonemes. Leave this blank to automatically tokenize the input text instead.')
                 with gr.Row():
                     clear_btn = gr.ClearButton(in_ps)
-                    phonemize_btn = gr.Button('…
+                    phonemize_btn = gr.Button('Tokenize Input Text', variant='primary')
                 phonemize_btn.click(phonemize, inputs=[text, voice], outputs=[in_ps])
         with gr.Column():
             audio = gr.Audio(interactive=False, label='Output Audio')
-            with gr.Accordion('Tokens', open=True):
-                out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio. Same as input…
-            with gr.Accordion('…
+            with gr.Accordion('Output Tokens', open=True):
+                out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 allowed. Same as input tokens if supplied, excluding unknowns.')
+            with gr.Accordion('Audio Settings', open=False):
                 with gr.Row():
                     reduce_noise = gr.Slider(minimum=0, maximum=1, value=0.5, label='Reduce Noise', info='👻 Fix it in post: non-stationary noise reduction via spectral gating.')
                 with gr.Row():
-                    speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='⚡️ Adjust the speed of the audio. The…
+                    speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='⚡️ Adjust the speed of the audio. The settings below are auto-scaled by speed.')
                 with gr.Row():
                     with gr.Column():
-                        opening_cut = gr.Slider(minimum=0, maximum=24000, value=…
+                        opening_cut = gr.Slider(minimum=0, maximum=24000, value=4000, step=1000, label='Opening Cut', info='✂️ Zero out this many samples at the start.')
                     with gr.Column():
-                        closing_cut = gr.Slider(minimum=0, maximum=24000, value=…
+                        closing_cut = gr.Slider(minimum=0, maximum=24000, value=2000, step=1000, label='Closing Cut', info='✂️ Zero out this many samples at the end.')
                 with gr.Row():
                     with gr.Column():
                         ease_in = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='Ease In', info='🚀 Ease in for this many samples, after opening cut.')
                     with gr.Column():
-                        ease_out = gr.Slider(minimum=0, maximum=24000, value=…
+                        ease_out = gr.Slider(minimum=0, maximum=24000, value=1000, step=1000, label='Ease Out', info='📐 Ease out for this many samples, before closing cut.')
     generate_btn.click(generate, inputs=[text, voice, in_ps, speed, reduce_noise, opening_cut, closing_cut, ease_in, ease_out], outputs=[audio, out_ps])
 
+@spaces.GPU
+@torch.no_grad()
+def lf_forward(token_lists, voice, speed):
+    ref_s = VOICES[voice]
+    s = ref_s[:, 128:]
+    outs = []
+    for tokens in token_lists:
+        tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
+        input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
+        text_mask = length_to_mask(input_lengths).to(device)
+        bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
+        d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
+        d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
+        x, _ = model.predictor.lstm(d)
+        duration = model.predictor.duration_proj(x)
+        duration = torch.sigmoid(duration).sum(axis=-1) / speed
+        pred_dur = torch.round(duration).clamp(min=1).long()
+        pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
+        c_frame = 0
+        for i in range(pred_aln_trg.size(0)):
+            pred_aln_trg[i, c_frame:c_frame + pred_dur[0,i].item()] = 1
+            c_frame += pred_dur[0,i].item()
+        en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
+        F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
+        t_en = model.text_encoder(tokens, input_lengths, text_mask)
+        asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
+        outs.append(model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy())
+    return outs
+
+def resplit_strings(arr):
+    # Handle edge cases
+    if not arr:
+        return '', ''
+    if len(arr) == 1:
+        return arr[0], ''
+    # Try each possible split point
+    min_diff = float('inf')
+    best_split = 0
+    # Calculate lengths when joined with spaces
+    lengths = [len(s) for s in arr]
+    spaces = len(arr) - 1  # Total spaces needed
+    # Try each split point
+    left_len = 0
+    right_len = sum(lengths) + spaces
+    for i in range(1, len(arr)):
+        # Add current word and space to left side
+        left_len += lengths[i-1] + (1 if i > 1 else 0)
+        # Remove current word and space from right side
+        right_len -= lengths[i-1] + 1
+        diff = abs(left_len - right_len)
+        if diff < min_diff:
+            min_diff = diff
+            best_split = i
+    # Join the strings with the best split point
+    return ' '.join(arr[:best_split]), ' '.join(arr[best_split:])
+
+def recursive_split(text, voice):
+    if not text:
+        return []
+    tokens = phonemize(text, voice, norm=False)
+    if len(tokens) < 511:
+        return [(text, tokens, len(tokens))] if tokens else []
+    if ' ' not in text:
+        return []
+    for punctuation in ['!.?…', ':;', ',—']:
+        splits = re.split(f'(?:(?<=[{punctuation}])|(?<=[{punctuation}]["\'»])|(?<=[{punctuation}]["\'»]["\'»])) ', text)
+        if len(splits) > 1:
+            break
+    else:
+        splits = None
+    splits = splits or text.split(' ')
+    a, b = resplit_strings(splits)
+    return recursive_split(a, voice) + recursive_split(b, voice)
+
+def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2):
+    if skip_square_brackets:
+        text = re.sub(r'\[.*?\]', '', text)
+    texts = [t.strip() for t in re.split('\n{'+str(newline_split)+',}', normalize(text))] if newline_split > 0 else [normalize(text)]
+    segments = [row for t in texts for row in recursive_split(t, voice)]
+    return [(i, *row) for i, row in enumerate(segments)]
+
+def lf_generate(segments, voice, speed=1.0, reduce_noise=0.5, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000, pad=5000):
+    token_lists = list(map(tokenize, segments['Tokens']))
+    wavs = []
+    opening_cut = max(0, int(opening_cut / speed))
+    closing_cut = max(0, int(closing_cut / speed))
+    pad = max(0, int(pad / speed))
+    batch_size = 100
+    for i in range(0, len(token_lists), batch_size):
+        try:
+            outs = lf_forward(token_lists[i:i+batch_size], voice, speed)
+        except gr.exceptions.Error as e:
+            if wavs:
+                gr.Warning(e)
+            else:
+                raise gr.Error(e)
+            break
+        for out in outs:
+            if reduce_noise > 0:
+                out = nr.reduce_noise(y=out, sr=SAMPLE_RATE, prop_decrease=reduce_noise, n_fft=512)
+            if opening_cut > 0:
+                out[:opening_cut] = 0
+            if closing_cut > 0:
+                out[-closing_cut:] = 0
+            ease_in = min(int(ease_in / speed), len(out)//2 - opening_cut)
+            for i in range(ease_in):
+                out[i+opening_cut] *= s_curve(i / ease_in)
+            ease_out = min(int(ease_out / speed), len(out)//2 - closing_cut)
+            for i in range(ease_out):
+                out[-i-1-closing_cut] *= s_curve(i / ease_out)
+            if wavs and pad > 0:
+                wavs.append(np.zeros(pad))
+            wavs.append(out)
+    return (SAMPLE_RATE, np.concatenate(wavs)) if wavs else None
+
+def did_change_segments(segments):
+    x = len(segments) if segments['Length'].any() else 0
+    return [
+        gr.Button('Tokenize', variant='secondary' if x else 'primary'),
+        gr.Button(f'Generate x{x}', variant='primary' if x else 'secondary', interactive=x > 0),
+    ]
+
+with gr.Blocks() as lf_tts:
+    with gr.Row():
+        with gr.Column():
+            text = gr.Textbox(label='Input Text')
+            voice = gr.Dropdown(list(CHOICES.items()), label='Voice')
+            with gr.Accordion('Text Settings', open=False):
+                skip_square_brackets = gr.Checkbox(True, label='Skip [Square Brackets]', info='Recommended for academic papers, Wikipedia articles, or texts with citations.')
+                newline_split = gr.Number(2, label='Newline Split', info='Split the input text on this many newlines. Affects how the text is segmented.', precision=0, minimum=0)
+            with gr.Row():
+                segment_btn = gr.Button('Tokenize', variant='primary')
+                generate_btn = gr.Button('Generate x0', variant='secondary', interactive=False)
+        with gr.Column():
+            audio = gr.Audio(interactive=False, label='Output Audio')
+            with gr.Accordion('Audio Settings', open=False):
+                with gr.Row():
+                    reduce_noise = gr.Slider(minimum=0, maximum=1, value=0.5, label='Reduce Noise', info='👻 Fix it in post: non-stationary noise reduction via spectral gating.')
+                with gr.Row():
+                    speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='⚡️ Adjust the speed of the audio. The settings below are auto-scaled by speed.')
+                with gr.Row():
+                    with gr.Column():
+                        opening_cut = gr.Slider(minimum=0, maximum=24000, value=4000, step=1000, label='Opening Cut', info='✂️ Zero out this many samples at the start.')
+                    with gr.Column():
+                        closing_cut = gr.Slider(minimum=0, maximum=24000, value=2000, step=1000, label='Closing Cut', info='✂️ Zero out this many samples at the end.')
+                with gr.Row():
+                    with gr.Column():
+                        ease_in = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='Ease In', info='🚀 Ease in for this many samples, after opening cut.')
+                    with gr.Column():
+                        ease_out = gr.Slider(minimum=0, maximum=24000, value=1000, step=1000, label='Ease Out', info='📐 Ease out for this many samples, before closing cut.')
+                with gr.Row():
+                    pad = gr.Slider(minimum=0, maximum=24000, value=5000, step=1000, label='Pad', info='🔇 How many samples of silence to insert between segments.')
+    with gr.Row():
+        segments = gr.Dataframe(headers=['#', 'Text', 'Tokens', 'Length'], row_count=(1, 'dynamic'), col_count=(4, 'fixed'), label='Segments', interactive=False, wrap=True)
+    segments.change(fn=did_change_segments, inputs=[segments], outputs=[segment_btn, generate_btn])
+    segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
+    generate_btn.click(lf_generate, inputs=[segments, voice, speed, reduce_noise, opening_cut, closing_cut, ease_in, ease_out, pad], outputs=[audio])
+
+with gr.Blocks() as app:
+    gr.TabbedInterface(
+        [basic_tts, lf_tts],
+        ['Basic TTS', 'Long-Form'],
+    )
+
 if __name__ == '__main__':
-    …
+    app.launch()
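On the new long-form path: segment_and_tokenize() normalizes the text and splits it on runs of newline_split or more newlines (blank lines by default), recursive_split() keeps bisecting any chunk whose phoneme string reaches 511 characters (trying sentence punctuation first, then clause punctuation, then spaces), and resplit_strings() picks the split index that best balances the two halves by joined character count. A quick check of that balancing rule (illustrative usage of the committed function):

    # Split after 1 word: 'a' (1 char) vs 'bb ccc' (6 chars), gap 5.
    # Split after 2 words: 'a bb' (4) vs 'ccc' (3), gap 1 -> chosen.
    assert resplit_strings(['a', 'bb', 'ccc']) == ('a bb', 'ccc')

lf_generate() then runs the segments through lf_forward() in batches of 100, applies the same noise-reduction, cut, and ease post-processing per segment, and joins segments with pad samples of silence; if a batch fails mid-run it downgrades the error to a warning and returns whatever audio was already generated.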