Upload 3 files
Browse files
@@ -7,6 +7,7 @@ import numpy as np
7 |
import os
8 |
import phonemizer
9 |
import random
10 |
import spaces
11 |
import torch
12 |
import yaml
@@ -32,17 +33,23 @@ def normalize(text):
32 |
text = text.replace('Mr.', 'Mister')
33 |
text = text.replace('Ms.', 'Miss')
34 |
text = text.replace('Mrs.', 'Mrs')
35 |
36 |
37 |
phonemizers = dict(
38 |
a=phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True),
39 |
b=phonemizer.backend.EspeakBackend(language='en-gb', preserve_punctuation=True, with_stress=True),
40 |
41 |
42 |
43 |
def phonemize(text, voice):
44 |
lang = voice[0]
45 |
46 |
ps = phonemizers[lang].phonemize([text])
47 |
ps = ps[0] if ps else ''
48 |
# TODO: Custom phonemization rules?
@@ -50,6 +57,8 @@ def phonemize(text, voice):
50 |
51 |
ps = ps.replace('kəkˈoːɹoʊ', 'kˈoʊkəɹoʊ').replace('kəkˈɔːɹəʊ', 'kˈəʊkəɹəʊ')
52 |
ps = ''.join(filter(lambda p: p in VOCAB, ps))
53 |
return ps.strip()
54 |
55 |
def length_to_mask(lengths):
@@ -69,11 +78,19 @@ def get_vocab():
69 |
return dicts
70 |
71 |
VOCAB = get_vocab()
72 |
device = 'cuda' if torch.cuda.is_available() else 'cpu'
73 |
74 |
snapshot = snapshot_download(repo_id='hexgrad/kokoro', allow_patterns=['*.pt', '*.pth', '*.yml'], use_auth_token=os.environ['TOKEN'])
75 |
config = yaml.safe_load(open(os.path.join(snapshot, 'config.yml')))
76 |
model = build_model(config['model_params'])
77 |
_ = [model[key].eval() for key in model]
78 |
_ = [model[key].to(device) for key in model]
79 |
for key, state_dict in torch.load(os.path.join(snapshot, 'net.pth'), map_location='cpu', weights_only=True)['net'].items():
@@ -113,42 +130,45 @@ def s_curve(p):
113 |
114 |
115 |
116 |
117 |
118 |
def forward(tokens, voice, speed):
119 |
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
120 |
input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
121 |
text_mask = length_to_mask(input_lengths).to(device)
122 |
bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
123 |
d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
124 |
ref_s = VOICES[voice]
125 |
s = ref_s[:, 128:]
126 |
d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
127 |
x, _ = model.predictor.lstm(d)
128 |
duration = model.predictor.duration_proj(x)
129 |
duration = torch.sigmoid(duration).sum(axis=-1) / speed
130 |
pred_dur = torch.round(duration
131 |
pred_aln_trg = torch.zeros(input_lengths,
132 |
c_frame = 0
133 |
for i in range(pred_aln_trg.size(0)):
134 |
pred_aln_trg[i, c_frame:c_frame +
135 |
c_frame +=
136 |
en =
137 |
F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
138 |
t_en = model.text_encoder(tokens, input_lengths, text_mask)
139 |
asr =
140 |
141 |
return out.squeeze().cpu().numpy()
142 |
143 |
def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=
144 |
ps = ps or phonemize(text, voice)
145 |
tokens =
146 |
if not tokens:
147 |
return (None, '')
148 |
elif len(tokens) > 510:
149 |
tokens = tokens[:510]
150 |
ps = ''.join(next(k for k, v in VOCAB.items() if i == v) for i in tokens)
151 |
152 |
if reduce_noise > 0:
153 |
out = nr.reduce_noise(y=out, sr=SAMPLE_RATE, prop_decrease=reduce_noise, n_fft=512)
154 |
opening_cut = max(0, int(opening_cut / speed))
@@ -156,7 +176,7 @@ def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=5000
156 |
out[:opening_cut] = 0
157 |
closing_cut = max(0, int(closing_cut / speed))
158 |
if closing_cut > 0:
159 |
160 |
ease_in = min(int(ease_in / speed), len(out)//2 - opening_cut)
161 |
for i in range(ease_in):
162 |
out[i+opening_cut] *= s_curve(i / ease_in)
@@ -165,7 +185,7 @@ def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=5000
165 |
out[-i-1-closing_cut] *= s_curve(i / ease_out)
166 |
return ((SAMPLE_RATE, out), ps)
167 |
168 |
with gr.Blocks() as
169 |
with gr.Row():
170 |
with gr.Column():
171 |
text = gr.Textbox(label='Input Text')
@@ -174,32 +194,196 @@ with gr.Blocks() as demo:
174 |
random_btn = gr.Button('Random Text', variant='secondary')
175 |
generate_btn = gr.Button('Generate', variant='primary')
176 |, inputs=[voice], outputs=[text])
177 |
with gr.Accordion('Input
178 |
in_ps = gr.Textbox(show_label=False, info='Override the input text with custom
179 |
with gr.Row():
180 |
clear_btn = gr.ClearButton(in_ps)
181 |
phonemize_btn = gr.Button('
182 |, inputs=[text, voice], outputs=[in_ps])
183 |
with gr.Column():
184 |
audio = gr.Audio(interactive=False, label='Output Audio')
185 |
with gr.Accordion('Tokens', open=True):
186 |
out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio. Same as input
187 |
with gr.Accordion('
188 |
with gr.Row():
189 |
reduce_noise = gr.Slider(minimum=0, maximum=1, value=0.5, label='Reduce Noise', info='👻 Fix it in post: non-stationary noise reduction via spectral gating.')
190 |
with gr.Row():
191 |
speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='⚡️ Adjust the speed of the audio. The
192 |
with gr.Row():
193 |
with gr.Column():
194 |
opening_cut = gr.Slider(minimum=0, maximum=24000, value=
195 |
with gr.Column():
196 |
closing_cut = gr.Slider(minimum=0, maximum=24000, value=
197 |
with gr.Row():
198 |
with gr.Column():
199 |
ease_in = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='Ease In', info='🚀 Ease in for this many samples, after opening cut.')
200 |
with gr.Column():
201 |
ease_out = gr.Slider(minimum=0, maximum=24000, value=
202 |, inputs=[text, voice, in_ps, speed, reduce_noise, opening_cut, closing_cut, ease_in, ease_out], outputs=[audio, out_ps])
203 |
204 |
if __name__ == '__main__':
205 |
7 |
import os
8 |
import phonemizer
9 |
import random
10 |
import re
11 |
import spaces
12 |
import torch
13 |
import yaml
33 |
text = text.replace('Mr.', 'Mister')
34 |
text = text.replace('Ms.', 'Miss')
35 |
text = text.replace('Mrs.', 'Mrs')
36 |
text = text.replace(chr(8216), "'").replace(chr(8217), "'")
37 |
text = text.replace(chr(8220), '"').replace(chr(8221), '"')
38 |
text = re.sub(r'[^\S \n]', ' ', text)
39 |
text = re.sub(r' +', ' ', text)
40 |
text = re.sub(r'(?<=\n) +(?=\n)', '', text)
41 |
return parens_to_angles(text).strip()
42 |
43 |
phonemizers = dict(
44 |
a=phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True),
45 |
b=phonemizer.backend.EspeakBackend(language='en-gb', preserve_punctuation=True, with_stress=True),
46 |
47 |
48 |
49 |
def phonemize(text, voice, norm=True):
50 |
lang = voice[0]
51 |
if norm:
52 |
text = normalize(text)
53 |
ps = phonemizers[lang].phonemize([text])
54 |
ps = ps[0] if ps else ''
55 |
# TODO: Custom phonemization rules?
57 |
58 |
ps = ps.replace('kəkˈoːɹoʊ', 'kˈoʊkəɹoʊ').replace('kəkˈɔːɹəʊ', 'kˈəʊkəɹəʊ')
59 |
ps = ''.join(filter(lambda p: p in VOCAB, ps))
60 |
if lang == 'j' and any(p in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' for p in ps):
61 |
gr.Warning('Japanese tokenizer does not handle English letters.')
62 |
return ps.strip()
63 |
64 |
def length_to_mask(lengths):
78 |
return dicts
79 |
80 |
VOCAB = get_vocab()
81 |
82 |
def tokenize(ps):
    """Translate a phoneme string into a list of VOCAB ids.

    Each character of `ps` is looked up in VOCAB; characters that VOCAB does
    not contain are silently dropped. An id of 0 is kept, because only a
    missing lookup (None) is filtered out.
    """
    ids = []
    for symbol in ps:
        token_id = VOCAB.get(symbol)
        if token_id is not None:
            ids.append(token_id)
    return ids
84 |
85 |
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Download only the weight (*.pt, *.pth) and config (*.yml) files of the repo.
# The TOKEN environment variable is required; a KeyError is raised if it is unset.
snapshot = snapshot_download(repo_id='hexgrad/kokoro', allow_patterns=['*.pt', '*.pth', '*.yml'], use_auth_token=os.environ['TOKEN'])
# Read the config through a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(os.path.join(snapshot, 'config.yml')) as config_file:
    config = yaml.safe_load(config_file)
model = build_model(config['model_params'])
90 |
for key, value in model.items():
91 |
for module in value.children():
92 |
if isinstance(module, torch.nn.RNNBase):
93 |
94 |
# Switch every sub-module to inference mode, then move them all to the target
# device. Plain loops are used because the calls are made only for their side
# effects; the original ordering (all eval() calls first, then all to() calls)
# is preserved.
for key in model:
    model[key].eval()
for key in model:
    model[key].to(device)
96 |
for key, state_dict in torch.load(os.path.join(snapshot, 'net.pth'), map_location='cpu', weights_only=True)['net'].items():
130 |
131 |
132 |
133 |
134 |
135 |
def forward(tokens, voice, speed):
    """Run the model on one token sequence and return the decoded audio.

    `tokens` is an iterable of integer ids; it is wrapped with a 0 id at both
    ends before being fed to the model. `voice` is a key into VOICES. The
    predicted per-token durations are divided by `speed` before rounding.
    Returns the decoder output, squeezed and converted to a NumPy array on
    the CPU.
    """
    style = VOICES[voice]
    # The voice tensor is split at column 128: the tail conditions the
    # predictor, the head conditions the decoder.
    predictor_style = style[:, 128:]
    decoder_style = style[:, :128]

    token_ids = torch.LongTensor([[0, *tokens, 0]]).to(device)
    lengths = torch.LongTensor([token_ids.shape[-1]]).to(device)
    mask = length_to_mask(lengths).to(device)

    bert_out = model.bert(token_ids, attention_mask=(~mask).int())
    encoded = model.bert_encoder(bert_out).transpose(-1, -2)
    hidden = model.predictor.text_encoder(encoded, predictor_style, lengths, mask)
    lstm_out, _ = model.predictor.lstm(hidden)
    duration_logits = model.predictor.duration_proj(lstm_out)
    scaled = torch.sigmoid(duration_logits).sum(axis=-1) / speed
    # Clamp so that every token is assigned at least one frame.
    frames_per_token = torch.round(scaled).clamp(min=1).long()

    # 0/1 alignment matrix: row = token position, columns = the consecutive
    # frames assigned to that token.
    alignment = torch.zeros(lengths, frames_per_token.sum().item())
    offset = 0
    for row in range(alignment.size(0)):
        span = frames_per_token[0, row].item()
        alignment[row, offset:offset + span] = 1
        offset += span
    alignment = alignment.unsqueeze(0).to(device)

    expanded = hidden.transpose(-1, -2) @ alignment
    f0, noise = model.predictor.F0Ntrain(expanded, predictor_style)
    text_features = model.text_encoder(token_ids, lengths, mask)
    aligned = text_features @ alignment
    audio = model.decoder(aligned, f0, noise, decoder_style)
    return audio.squeeze().cpu().numpy()
158 |
159 |
def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000):
160 |
ps = ps or phonemize(text, voice)
161 |
tokens = tokenize(ps)
162 |
if not tokens:
163 |
return (None, '')
164 |
elif len(tokens) > 510:
165 |
tokens = tokens[:510]
166 |
ps = ''.join(next(k for k, v in VOCAB.items() if i == v) for i in tokens)
167 |
168 |
out = forward(tokens, voice, speed)
169 |
except gr.exceptions.Error as e:
170 |
raise gr.Error(e)
171 |
return (None, '')
172 |
if reduce_noise > 0:
173 |
out = nr.reduce_noise(y=out, sr=SAMPLE_RATE, prop_decrease=reduce_noise, n_fft=512)
174 |
opening_cut = max(0, int(opening_cut / speed))
176 |
out[:opening_cut] = 0
177 |
closing_cut = max(0, int(closing_cut / speed))
178 |
if closing_cut > 0:
179 |
out[-closing_cut:] = 0
180 |
ease_in = min(int(ease_in / speed), len(out)//2 - opening_cut)
181 |
for i in range(ease_in):
182 |
out[i+opening_cut] *= s_curve(i / ease_in)
185 |
out[-i-1-closing_cut] *= s_curve(i / ease_out)
186 |
return ((SAMPLE_RATE, out), ps)
187 |
188 |
with gr.Blocks() as basic_tts:
189 |
with gr.Row():
190 |
with gr.Column():
191 |
text = gr.Textbox(label='Input Text')
194 |
random_btn = gr.Button('Random Text', variant='secondary')
195 |
generate_btn = gr.Button('Generate', variant='primary')
196 |, inputs=[voice], outputs=[text])
197 |
with gr.Accordion('Input Tokens', open=False):
198 |
in_ps = gr.Textbox(show_label=False, info='Override the input text with custom phonemes. Leave this blank to automatically tokenize the input text instead.')
199 |
with gr.Row():
200 |
clear_btn = gr.ClearButton(in_ps)
201 |
phonemize_btn = gr.Button('Tokenize Input Text', variant='primary')
202 |, inputs=[text, voice], outputs=[in_ps])
203 |
with gr.Column():
204 |
audio = gr.Audio(interactive=False, label='Output Audio')
205 |
with gr.Accordion('Output Tokens', open=True):
206 |
out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 allowed. Same as input tokens if supplied, excluding unknowns.')
207 |
with gr.Accordion('Audio Settings', open=False):
208 |
with gr.Row():
209 |
reduce_noise = gr.Slider(minimum=0, maximum=1, value=0.5, label='Reduce Noise', info='👻 Fix it in post: non-stationary noise reduction via spectral gating.')
210 |
with gr.Row():
211 |
speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='⚡️ Adjust the speed of the audio. The settings below are auto-scaled by speed.')
212 |
with gr.Row():
213 |
with gr.Column():
214 |
opening_cut = gr.Slider(minimum=0, maximum=24000, value=4000, step=1000, label='Opening Cut', info='✂️ Zero out this many samples at the start.')
215 |
with gr.Column():
216 |
closing_cut = gr.Slider(minimum=0, maximum=24000, value=2000, step=1000, label='Closing Cut', info='✂️ Zero out this many samples at the end.')
217 |
with gr.Row():
218 |
with gr.Column():
219 |
ease_in = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='Ease In', info='🚀 Ease in for this many samples, after opening cut.')
220 |
with gr.Column():
221 |
ease_out = gr.Slider(minimum=0, maximum=24000, value=1000, step=1000, label='Ease Out', info='📐 Ease out for this many samples, before closing cut.')
222 |, inputs=[text, voice, in_ps, speed, reduce_noise, opening_cut, closing_cut, ease_in, ease_out], outputs=[audio, out_ps])
223 |
224 |
225 |
226 |
def lf_forward(token_lists, voice, speed):
    """Run the model on several token sequences that share one voice.

    `token_lists` is an iterable of token-id sequences; each one is wrapped
    with a 0 id at both ends and synthesized independently, in order.
    `voice` is a key into VOICES and `speed` divides the predicted durations.
    Returns a list with one NumPy array (on the CPU) per input sequence.
    """
    style = VOICES[voice]
    # Column 128 splits the voice tensor: the tail conditions the predictor,
    # the head conditions the decoder.
    predictor_style = style[:, 128:]
    results = []
    for segment_tokens in token_lists:
        token_ids = torch.LongTensor([[0, *segment_tokens, 0]]).to(device)
        lengths = torch.LongTensor([token_ids.shape[-1]]).to(device)
        mask = length_to_mask(lengths).to(device)

        bert_out = model.bert(token_ids, attention_mask=(~mask).int())
        encoded = model.bert_encoder(bert_out).transpose(-1, -2)
        hidden = model.predictor.text_encoder(encoded, predictor_style, lengths, mask)
        lstm_out, _ = model.predictor.lstm(hidden)
        duration_logits = model.predictor.duration_proj(lstm_out)
        scaled = torch.sigmoid(duration_logits).sum(axis=-1) / speed
        # Clamp so that every token is assigned at least one frame.
        frames_per_token = torch.round(scaled).clamp(min=1).long()

        # 0/1 alignment matrix: row = token position, columns = the
        # consecutive frames assigned to that token.
        alignment = torch.zeros(lengths, frames_per_token.sum().item())
        offset = 0
        for row in range(alignment.size(0)):
            span = frames_per_token[0, row].item()
            alignment[row, offset:offset + span] = 1
            offset += span
        alignment = alignment.unsqueeze(0).to(device)

        expanded = hidden.transpose(-1, -2) @ alignment
        f0, noise = model.predictor.F0Ntrain(expanded, predictor_style)
        text_features = model.text_encoder(token_ids, lengths, mask)
        aligned = text_features @ alignment
        audio = model.decoder(aligned, f0, noise, style[:, :128])
        results.append(audio.squeeze().cpu().numpy())
    return results
252 |
253 |
def resplit_strings(arr):
    """Regroup a list of strings into two space-joined halves of similar length.

    Every split point between adjacent items is considered. The split that
    minimizes the absolute difference between the joined length of the left
    part and the joined length of the right part wins; on ties the earliest
    split is kept.

    Args:
        arr: List of strings to regroup.

    Returns:
        A ``(left, right)`` tuple of strings. An empty input yields
        ``('', '')`` and a single item yields ``(item, '')``.
    """
    if not arr:
        return '', ''
    if len(arr) == 1:
        return arr[0], ''

    lengths = [len(s) for s in arr]
    # Length of the whole list once joined: all characters plus one space per
    # gap. (Deliberately not called `spaces`, which would shadow the module
    # imported at the top of the file.)
    total_len = sum(lengths) + len(arr) - 1

    best_split = 0
    min_diff = float('inf')
    # Start at -1 so that adding "item + separator" for the first item yields
    # just the item's length (the left part has no leading separator).
    left_len = -1
    for split, item_len in enumerate(lengths[:-1], start=1):
        left_len += item_len + 1
        # The space at the split point belongs to neither side.
        right_len = total_len - left_len - 1
        diff = abs(left_len - right_len)
        if diff < min_diff:
            min_diff = diff
            best_split = split
    return ' '.join(arr[:best_split]), ' '.join(arr[best_split:])
279 |
280 |
def recursive_split(text, voice):
    """Break `text` into pieces whose phonemized form is under 511 characters.

    The text is phonemized (without normalization). If the result is short
    enough it is returned as a single row; otherwise the text is split in two
    roughly balanced halves and each half is processed recursively.

    Args:
        text: Text to segment.
        voice: Voice identifier, forwarded to `phonemize`.

    Returns:
        A list of ``(text, tokens, len(tokens))`` tuples in reading order.
        Empty text, text that phonemizes to nothing, and over-long text that
        contains no space all contribute no rows.
    """
    if not text:
        return []
    tokens = phonemize(text, voice, norm=False)
    if len(tokens) < 511:
        return [(text, tokens, len(tokens))] if tokens else []
    if ' ' not in text:
        # Nothing to split on; the over-long piece is dropped.
        return []
    # Prefer the strongest punctuation boundary available: sentence enders
    # first, then colons/semicolons, then commas/dashes. A boundary is a space
    # preceded by the punctuation, optionally followed by up to two closing
    # quote characters.
    # NOTE(review): the statements under `if len(splits) > 1:` were missing in
    # the reviewed text; stopping at the first punctuation class that yields
    # more than one piece is the reconstruction - confirm against the
    # repository.
    splits = None
    for punctuation in ['!.?…', ':;', ',—']:
        candidate = re.split(f'(?:(?<=[{punctuation}])|(?<=[{punctuation}]["\'»])|(?<=[{punctuation}]["\'»]["\'»])) ', text)
        if len(candidate) > 1:
            splits = candidate
            break
    # Fall back to splitting on every space when no punctuation boundary exists.
    splits = splits or text.split(' ')
    a, b = resplit_strings(splits)
    return recursive_split(a, voice) + recursive_split(b, voice)
297 |
298 |
def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2):
    """Turn raw input text into numbered, phonemized segments.

    When `skip_square_brackets` is true, every ``[...]`` span is removed
    first. The text is then normalized and, if `newline_split` is positive,
    split wherever at least that many consecutive newlines occur. Each piece
    is passed through `recursive_split`, and the resulting rows are returned
    with their running index prepended.
    """
    if skip_square_brackets:
        text = re.sub(r'\[.*?\]', '', text)
    cleaned = normalize(text)
    if newline_split > 0:
        paragraphs = [part.strip() for part in re.split('\n{'+str(newline_split)+',}', cleaned)]
    else:
        paragraphs = [cleaned]
    rows = []
    for paragraph in paragraphs:
        rows.extend(recursive_split(paragraph, voice))
    return [(index, *row) for index, row in enumerate(rows)]
304 |
305 |
def lf_generate(segments, voice, speed=1.0, reduce_noise=0.5, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000, pad=5000):
    """Synthesize every segment and concatenate the results into one waveform.

    Args:
        segments: Table indexable by ``'Tokens'``; each entry is tokenized
            with `tokenize`. (Presumably the pandas DataFrame behind the
            Segments widget - confirm against the caller.)
        voice: Voice identifier, forwarded to `lf_forward`.
        speed: Speed factor; all sample counts below are divided by it.
        reduce_noise: ``prop_decrease`` for noise reduction; 0 disables it.
        opening_cut: Samples zeroed at the start of each segment.
        closing_cut: Samples zeroed at the end of each segment.
        ease_in: Samples to fade in after the opening cut.
        ease_out: Samples to fade out before the closing cut.
        pad: Samples of silence inserted between segments.

    Returns:
        ``(SAMPLE_RATE, waveform)`` or ``None`` when nothing was generated.

    Raises:
        gr.Error: If the first batch fails with a Gradio error.
    """
    token_lists = list(map(tokenize, segments['Tokens']))
    wavs = []
    opening_cut = max(0, int(opening_cut / speed))
    closing_cut = max(0, int(closing_cut / speed))
    pad = max(0, int(pad / speed))
    # Scale the ease lengths once, up front. Rebinding `ease_in`/`ease_out`
    # inside the segment loop would divide them by `speed` again on every
    # segment and let one short segment shrink the fade of all later ones.
    scaled_ease_in = int(ease_in / speed)
    scaled_ease_out = int(ease_out / speed)
    batch_size = 100
    for start in range(0, len(token_lists), batch_size):
        # NOTE(review): the `try:` line and the statements around
        # `raise gr.Error(e)` were missing in the reviewed text. The
        # reconstruction keeps audio that was already generated (warning the
        # user) and only raises when nothing has been produced - confirm
        # against the repository.
        try:
            outs = lf_forward(token_lists[start:start+batch_size], voice, speed)
        except gr.exceptions.Error as e:
            if wavs:
                gr.Warning(str(e))
            else:
                raise gr.Error(e)
            break
        for out in outs:
            if reduce_noise > 0:
                out = nr.reduce_noise(y=out, sr=SAMPLE_RATE, prop_decrease=reduce_noise, n_fft=512)
            if opening_cut > 0:
                out[:opening_cut] = 0
            if closing_cut > 0:
                out[-closing_cut:] = 0
            # Never fade across more than half the segment; a non-positive
            # length simply skips the fade.
            seg_ease_in = min(scaled_ease_in, len(out)//2 - opening_cut)
            for k in range(seg_ease_in):
                out[k+opening_cut] *= s_curve(k / seg_ease_in)
            seg_ease_out = min(scaled_ease_out, len(out)//2 - closing_cut)
            for k in range(seg_ease_out):
                out[-k-1-closing_cut] *= s_curve(k / seg_ease_out)
            # NOTE(review): the two append statements were missing in the
            # reviewed text; silence between segments followed by the segment
            # itself is the reconstruction - confirm.
            if wavs and pad > 0:
                wavs.append(np.zeros(pad))
            wavs.append(out)
    return (SAMPLE_RATE, np.concatenate(wavs)) if wavs else None
338 |
339 |
def did_change_segments(segments):
    """Refresh the two action buttons after the segments table changes.

    `x` is the number of rows in `segments` when any value in its
    ``'Length'`` column is truthy, otherwise 0.

    Returns:
        A two-item list: the 'Tokenize' button (primary only while there is
        nothing to generate) and the 'Generate x{x}' button (primary and
        interactive only when ``x > 0``).
    """
    x = len(segments) if segments['Length'].any() else 0
    return [
        gr.Button('Tokenize', variant='secondary' if x else 'primary'),
        gr.Button(f'Generate x{x}', variant='primary' if x else 'secondary', interactive=x > 0),
    ]
345 |
346 |
with gr.Blocks() as lf_tts:
347 |
with gr.Row():
348 |
with gr.Column():
349 |
text = gr.Textbox(label='Input Text')
350 |
voice = gr.Dropdown(list(CHOICES.items()), label='Voice')
351 |
with gr.Accordion('Text Settings', open=False):
352 |
skip_square_brackets = gr.Checkbox(True, label='Skip [Square Brackets]', info='Recommended for academic papers, Wikipedia articles, or texts with citations.')
353 |
newline_split = gr.Number(2, label='Newline Split', info='Split the input text on this many newlines. Affects how the text is segmented.', precision=0, minimum=0)
354 |
with gr.Row():
355 |
segment_btn = gr.Button('Tokenize', variant='primary')
356 |
generate_btn = gr.Button('Generate x0', variant='secondary', interactive=False)
357 |
with gr.Column():
358 |
audio = gr.Audio(interactive=False, label='Output Audio')
359 |
with gr.Accordion('Audio Settings', open=False):
360 |
with gr.Row():
361 |
reduce_noise = gr.Slider(minimum=0, maximum=1, value=0.5, label='Reduce Noise', info='👻 Fix it in post: non-stationary noise reduction via spectral gating.')
362 |
with gr.Row():
363 |
speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='⚡️ Adjust the speed of the audio. The settings below are auto-scaled by speed.')
364 |
with gr.Row():
365 |
with gr.Column():
366 |
opening_cut = gr.Slider(minimum=0, maximum=24000, value=4000, step=1000, label='Opening Cut', info='✂️ Zero out this many samples at the start.')
367 |
with gr.Column():
368 |
closing_cut = gr.Slider(minimum=0, maximum=24000, value=2000, step=1000, label='Closing Cut', info='✂️ Zero out this many samples at the end.')
369 |
with gr.Row():
370 |
with gr.Column():
371 |
ease_in = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='Ease In', info='🚀 Ease in for this many samples, after opening cut.')
372 |
with gr.Column():
373 |
ease_out = gr.Slider(minimum=0, maximum=24000, value=1000, step=1000, label='Ease Out', info='📐 Ease out for this many samples, before closing cut.')
374 |
with gr.Row():
375 |
pad = gr.Slider(minimum=0, maximum=24000, value=5000, step=1000, label='Pad', info='🔇 How many samples of silence to insert between segments.')
376 |
with gr.Row():
377 |
segments = gr.Dataframe(headers=['#', 'Text', 'Tokens', 'Length'], row_count=(1, 'dynamic'), col_count=(4, 'fixed'), label='Segments', interactive=False, wrap=True)
378 |
segments.change(fn=did_change_segments, inputs=[segments], outputs=[segment_btn, generate_btn])
379 |
+, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
380 |
+, inputs=[segments, voice, speed, reduce_noise, opening_cut, closing_cut, ease_in, ease_out, pad], outputs=[audio])
381 |
382 |
with gr.Blocks() as app:
383 |
384 |
[basic_tts, lf_tts],
385 |
['Basic TTS', 'Long-Form'],
386 |
387 |
388 |
if __name__ == '__main__':
389 |