hexgrad committed
Commit 7c600ad · verified · 1 Parent(s): 38d6820

Upload 3 files

Files changed (1):
  app.py (+214 -30)
app.py CHANGED
@@ -7,6 +7,7 @@ import numpy as np
 import os
 import phonemizer
 import random
+import re
 import spaces
 import torch
 import yaml
@@ -32,17 +33,23 @@ def normalize(text):
     text = text.replace('Mr.', 'Mister')
     text = text.replace('Ms.', 'Miss')
     text = text.replace('Mrs.', 'Mrs')
-    return parens_to_angles(text)
+    text = text.replace(chr(8216), "'").replace(chr(8217), "'")
+    text = text.replace(chr(8220), '"').replace(chr(8221), '"')
+    text = re.sub(r'[^\S \n]', ' ', text)
+    text = re.sub(r' +', ' ', text)
+    text = re.sub(r'(?<=\n) +(?=\n)', '', text)
+    return parens_to_angles(text).strip()
 
 phonemizers = dict(
     a=phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True),
     b=phonemizer.backend.EspeakBackend(language='en-gb', preserve_punctuation=True, with_stress=True),
-    j=Katsu()
+    j=Katsu(),
 )
 
-def phonemize(text, voice):
+def phonemize(text, voice, norm=True):
     lang = voice[0]
-    text = normalize(text)
+    if norm:
+        text = normalize(text)
     ps = phonemizers[lang].phonemize([text])
     ps = ps[0] if ps else ''
     # TODO: Custom phonemization rules?
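The reworked normalize() above folds curly quotes to ASCII and collapses stray whitespace before phonemization. A minimal runnable sketch of just this hunk's behavior, with parens_to_angles stubbed as identity because its body lies outside the diff (presumably it swaps parentheses for angle brackets):

import re

def parens_to_angles(text):
    return text  # stub for this sketch only; the real helper is defined elsewhere in app.py

def normalize(text):
    text = text.replace(chr(8216), "'").replace(chr(8217), "'")  # curly -> straight single quotes
    text = text.replace(chr(8220), '"').replace(chr(8221), '"')  # curly -> straight double quotes
    text = re.sub(r'[^\S \n]', ' ', text)        # any whitespace except space/newline becomes a space
    text = re.sub(r' +', ' ', text)              # collapse runs of spaces
    text = re.sub(r'(?<=\n) +(?=\n)', '', text)  # empty out space-only lines
    return parens_to_angles(text).strip()

print(normalize('\u201cHello,\u201d\tworld'))  # -> "Hello," world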
@@ -50,6 +57,8 @@ def phonemize(text, voice):
     # https://en.wiktionary.org/wiki/kokoro#English
     ps = ps.replace('kəkˈoːɹoʊ', 'kˈoʊkəɹoʊ').replace('kəkˈɔːɹəʊ', 'kˈəʊkəɹəʊ')
     ps = ''.join(filter(lambda p: p in VOCAB, ps))
+    if lang == 'j' and any(p in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' for p in ps):
+        gr.Warning('Japanese tokenizer does not handle English letters.')
     return ps.strip()
 
 def length_to_mask(lengths):
@@ -69,11 +78,19 @@ def get_vocab():
     return dicts
 
 VOCAB = get_vocab()
+
+def tokenize(ps):
+    return [i for i in map(VOCAB.get, ps) if i is not None]
+
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
 
 snapshot = snapshot_download(repo_id='hexgrad/kokoro', allow_patterns=['*.pt', '*.pth', '*.yml'], use_auth_token=os.environ['TOKEN'])
 config = yaml.safe_load(open(os.path.join(snapshot, 'config.yml')))
 model = build_model(config['model_params'])
+for key, value in model.items():
+    for module in value.children():
+        if isinstance(module, torch.nn.RNNBase):
+            module.flatten_parameters()
 _ = [model[key].eval() for key in model]
 _ = [model[key].to(device) for key in model]
 for key, state_dict in torch.load(os.path.join(snapshot, 'net.pth'), map_location='cpu', weights_only=True)['net'].items():
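The new tokenize() helper is a thin vocab lookup: any character missing from VOCAB is silently dropped, which is why out-of-vocabulary symbols vanish from the output tokens. A toy illustration with a hypothetical two-entry vocab (the real VOCAB comes from get_vocab() and the model config):

VOCAB = {'h': 1, 'i': 2}  # hypothetical toy mapping, not the real vocab

def tokenize(ps):
    return [i for i in map(VOCAB.get, ps) if i is not None]

print(tokenize('hi!'))  # -> [1, 2]; '!' is absent from the toy vocab and is dropped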
@@ -113,42 +130,45 @@ def s_curve(p):
 
 SAMPLE_RATE = 24000
 
-@spaces.GPU(duration=10)
+@spaces.GPU(duration=1)
 @torch.no_grad()
 def forward(tokens, voice, speed):
+    ref_s = VOICES[voice]
     tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
     input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
     text_mask = length_to_mask(input_lengths).to(device)
     bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
     d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
-    ref_s = VOICES[voice]
     s = ref_s[:, 128:]
     d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
     x, _ = model.predictor.lstm(d)
     duration = model.predictor.duration_proj(x)
     duration = torch.sigmoid(duration).sum(axis=-1) / speed
-    pred_dur = torch.round(duration.squeeze()).clamp(min=1)
-    pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
+    pred_dur = torch.round(duration).clamp(min=1).long()
+    pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
     c_frame = 0
     for i in range(pred_aln_trg.size(0)):
-        pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
-        c_frame += int(pred_dur[i].data)
-    en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))
+        pred_aln_trg[i, c_frame:c_frame + pred_dur[0,i].item()] = 1
+        c_frame += pred_dur[0,i].item()
+    en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
     F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
     t_en = model.text_encoder(tokens, input_lengths, text_mask)
-    asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))
-    out = model.decoder(asr, F0_pred, N_pred, ref_s[:, :128])
-    return out.squeeze().cpu().numpy()
+    asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
+    return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
 
-def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=5000, closing_cut=0, ease_in=3000, ease_out=0):
+def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000):
     ps = ps or phonemize(text, voice)
-    tokens = [i for i in map(VOCAB.get, ps) if i is not None]
+    tokens = tokenize(ps)
     if not tokens:
         return (None, '')
     elif len(tokens) > 510:
         tokens = tokens[:510]
         ps = ''.join(next(k for k, v in VOCAB.items() if i == v) for i in tokens)
-    out = forward(tokens, voice, speed)
+    try:
+        out = forward(tokens, voice, speed)
+    except gr.exceptions.Error as e:
+        raise gr.Error(e)
+        return (None, '')
     if reduce_noise > 0:
         out = nr.reduce_noise(y=out, sr=SAMPLE_RATE, prop_decrease=reduce_noise, n_fft=512)
     opening_cut = max(0, int(opening_cut / speed))
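The forward() pass above expands the predicted per-token durations into a hard alignment matrix (tokens x frames), with row i holding 1s across the frame span of token i; matrix products against it then upsample text features to frame rate. A self-contained sketch with made-up durations:

import torch

pred_dur = torch.tensor([2, 3, 1])  # hypothetical frame counts for 3 tokens
pred_aln_trg = torch.zeros(3, int(pred_dur.sum()))
c_frame = 0
for i in range(pred_aln_trg.size(0)):
    pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i])] = 1
    c_frame += int(pred_dur[i])
print(pred_aln_trg)
# tensor([[1., 1., 0., 0., 0., 0.],
#         [0., 0., 1., 1., 1., 0.],
#         [0., 0., 0., 0., 0., 1.]])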
@@ -156,7 +176,7 @@ def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=5000
         out[:opening_cut] = 0
     closing_cut = max(0, int(closing_cut / speed))
     if closing_cut > 0:
-        out = out[-closing_cut:] = 0
+        out[-closing_cut:] = 0
     ease_in = min(int(ease_in / speed), len(out)//2 - opening_cut)
     for i in range(ease_in):
         out[i+opening_cut] *= s_curve(i / ease_in)
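The one-line fix in this hunk corrects a real bug. Python chained assignment binds targets left to right, so the old statement first rebound the name out to the integer 0 and only then attempted the slice assignment on that integer, raising a TypeError; the branch simply never ran under the old default of closing_cut=0. A minimal reproduction, with a numpy array standing in for the audio buffer:

import numpy as np

out = np.ones(8)
try:
    out = out[-3:] = 0  # old form: rebinds out to 0, then 0[-3:] = 0 fails
except TypeError as e:
    print(e)  # 'int' object does not support item assignment

out = np.ones(8)
out[-3:] = 0  # fixed form: zeroes the closing samples in place
print(out)    # [1. 1. 1. 1. 1. 0. 0. 0.]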
@@ -165,7 +185,7 @@ def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=5000
         out[-i-1-closing_cut] *= s_curve(i / ease_out)
     return ((SAMPLE_RATE, out), ps)
 
-with gr.Blocks() as demo:
+with gr.Blocks() as basic_tts:
     with gr.Row():
         with gr.Column():
             text = gr.Textbox(label='Input Text')
@@ -174,32 +194,196 @@ with gr.Blocks() as demo:
                 random_btn = gr.Button('Random Text', variant='secondary')
                 generate_btn = gr.Button('Generate', variant='primary')
             random_btn.click(get_random_text, inputs=[voice], outputs=[text])
-            with gr.Accordion('Input Phonemes', open=False):
-                in_ps = gr.Textbox(show_label=False, info='Override the input text with custom pronunciation. Leave this blank to use the input text instead.')
+            with gr.Accordion('Input Tokens', open=False):
+                in_ps = gr.Textbox(show_label=False, info='Override the input text with custom phonemes. Leave this blank to automatically tokenize the input text instead.')
                 with gr.Row():
                     clear_btn = gr.ClearButton(in_ps)
-                    phonemize_btn = gr.Button('Phonemize Input Text', variant='primary')
+                    phonemize_btn = gr.Button('Tokenize Input Text', variant='primary')
                 phonemize_btn.click(phonemize, inputs=[text, voice], outputs=[in_ps])
         with gr.Column():
             audio = gr.Audio(interactive=False, label='Output Audio')
-            with gr.Accordion('Tokens', open=True):
-                out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio. Same as input phonemes if supplied, excluding unknown characters and truncated to 510 tokens.')
-            with gr.Accordion('Advanced Settings', open=False):
+            with gr.Accordion('Output Tokens', open=True):
+                out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 allowed. Same as input tokens if supplied, excluding unknowns.')
+            with gr.Accordion('Audio Settings', open=False):
                 with gr.Row():
                     reduce_noise = gr.Slider(minimum=0, maximum=1, value=0.5, label='Reduce Noise', info='👻 Fix it in post: non-stationary noise reduction via spectral gating.')
                 with gr.Row():
-                    speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='⚡️ Adjust the speed of the audio. The trim settings below are also auto-scaled by speed.')
+                    speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='⚡️ Adjust the speed of the audio. The settings below are auto-scaled by speed.')
                 with gr.Row():
                     with gr.Column():
-                        opening_cut = gr.Slider(minimum=0, maximum=24000, value=5000, step=1000, label='Opening Cut', info='✂️ Zero out this many samples at the start.')
+                        opening_cut = gr.Slider(minimum=0, maximum=24000, value=4000, step=1000, label='Opening Cut', info='✂️ Zero out this many samples at the start.')
                     with gr.Column():
-                        closing_cut = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='Closing Cut', info='✂️ Zero out this many samples at the end.')
+                        closing_cut = gr.Slider(minimum=0, maximum=24000, value=2000, step=1000, label='Closing Cut', info='✂️ Zero out this many samples at the end.')
                 with gr.Row():
                     with gr.Column():
                         ease_in = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='Ease In', info='🚀 Ease in for this many samples, after opening cut.')
                     with gr.Column():
-                        ease_out = gr.Slider(minimum=0, maximum=24000, value=0, step=1000, label='Ease Out', info='📐 Ease out for this many samples, before closing cut.')
+                        ease_out = gr.Slider(minimum=0, maximum=24000, value=1000, step=1000, label='Ease Out', info='📐 Ease out for this many samples, before closing cut.')
     generate_btn.click(generate, inputs=[text, voice, in_ps, speed, reduce_noise, opening_cut, closing_cut, ease_in, ease_out], outputs=[audio, out_ps])
 
+@spaces.GPU
+@torch.no_grad()
+def lf_forward(token_lists, voice, speed):
+    ref_s = VOICES[voice]
+    s = ref_s[:, 128:]
+    outs = []
+    for tokens in token_lists:
+        tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
+        input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
+        text_mask = length_to_mask(input_lengths).to(device)
+        bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
+        d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
+        d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
+        x, _ = model.predictor.lstm(d)
+        duration = model.predictor.duration_proj(x)
+        duration = torch.sigmoid(duration).sum(axis=-1) / speed
+        pred_dur = torch.round(duration).clamp(min=1).long()
+        pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
+        c_frame = 0
+        for i in range(pred_aln_trg.size(0)):
+            pred_aln_trg[i, c_frame:c_frame + pred_dur[0,i].item()] = 1
+            c_frame += pred_dur[0,i].item()
+        en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
+        F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
+        t_en = model.text_encoder(tokens, input_lengths, text_mask)
+        asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
+        outs.append(model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy())
+    return outs
+
+def resplit_strings(arr):
+    # Handle edge cases
+    if not arr:
+        return '', ''
+    if len(arr) == 1:
+        return arr[0], ''
+    # Try each possible split point
+    min_diff = float('inf')
+    best_split = 0
+    # Calculate lengths when joined with spaces
+    lengths = [len(s) for s in arr]
+    spaces = len(arr) - 1  # Total spaces needed
+    # Try each split point
+    left_len = 0
+    right_len = sum(lengths) + spaces
+    for i in range(1, len(arr)):
+        # Add current word and space to left side
+        left_len += lengths[i-1] + (1 if i > 1 else 0)
+        # Remove current word and space from right side
+        right_len -= lengths[i-1] + 1
+        diff = abs(left_len - right_len)
+        if diff < min_diff:
+            min_diff = diff
+            best_split = i
+    # Join the strings with the best split point
+    return ' '.join(arr[:best_split]), ' '.join(arr[best_split:])
+
+def recursive_split(text, voice):
+    if not text:
+        return []
+    tokens = phonemize(text, voice, norm=False)
+    if len(tokens) < 511:
+        return [(text, tokens, len(tokens))] if tokens else []
+    if ' ' not in text:
+        return []
+    for punctuation in ['!.?…', ':;', ',—']:
+        splits = re.split(f'(?:(?<=[{punctuation}])|(?<=[{punctuation}]["\'»])|(?<=[{punctuation}]["\'»]["\'»])) ', text)
+        if len(splits) > 1:
+            break
+    else:
+        splits = None
+    splits = splits or text.split(' ')
+    a, b = resplit_strings(splits)
+    return recursive_split(a, voice) + recursive_split(b, voice)
+
+def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2):
+    if skip_square_brackets:
+        text = re.sub(r'\[.*?\]', '', text)
+    texts = [t.strip() for t in re.split('\n{'+str(newline_split)+',}', normalize(text))] if newline_split > 0 else [normalize(text)]
+    segments = [row for t in texts for row in recursive_split(t, voice)]
+    return [(i, *row) for i, row in enumerate(segments)]
+
+def lf_generate(segments, voice, speed=1.0, reduce_noise=0.5, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000, pad=5000):
+    token_lists = list(map(tokenize, segments['Tokens']))
+    wavs = []
+    opening_cut = max(0, int(opening_cut / speed))
+    closing_cut = max(0, int(closing_cut / speed))
+    pad = max(0, int(pad / speed))
+    batch_size = 100
+    for i in range(0, len(token_lists), batch_size):
+        try:
+            outs = lf_forward(token_lists[i:i+batch_size], voice, speed)
+        except gr.exceptions.Error as e:
+            if wavs:
+                gr.Warning(e)
+            else:
+                raise gr.Error(e)
+            break
+        for out in outs:
+            if reduce_noise > 0:
+                out = nr.reduce_noise(y=out, sr=SAMPLE_RATE, prop_decrease=reduce_noise, n_fft=512)
+            if opening_cut > 0:
+                out[:opening_cut] = 0
+            if closing_cut > 0:
+                out[-closing_cut:] = 0
+            ease_in = min(int(ease_in / speed), len(out)//2 - opening_cut)
+            for i in range(ease_in):
+                out[i+opening_cut] *= s_curve(i / ease_in)
+            ease_out = min(int(ease_out / speed), len(out)//2 - closing_cut)
+            for i in range(ease_out):
+                out[-i-1-closing_cut] *= s_curve(i / ease_out)
+            if wavs and pad > 0:
+                wavs.append(np.zeros(pad))
+            wavs.append(out)
+    return (SAMPLE_RATE, np.concatenate(wavs)) if wavs else None
+
+def did_change_segments(segments):
+    x = len(segments) if segments['Length'].any() else 0
+    return [
+        gr.Button('Tokenize', variant='secondary' if x else 'primary'),
+        gr.Button(f'Generate x{x}', variant='primary' if x else 'secondary', interactive=x > 0),
+    ]
+
+with gr.Blocks() as lf_tts:
+    with gr.Row():
+        with gr.Column():
+            text = gr.Textbox(label='Input Text')
+            voice = gr.Dropdown(list(CHOICES.items()), label='Voice')
+            with gr.Accordion('Text Settings', open=False):
+                skip_square_brackets = gr.Checkbox(True, label='Skip [Square Brackets]', info='Recommended for academic papers, Wikipedia articles, or texts with citations.')
+                newline_split = gr.Number(2, label='Newline Split', info='Split the input text on this many newlines. Affects how the text is segmented.', precision=0, minimum=0)
+            with gr.Row():
+                segment_btn = gr.Button('Tokenize', variant='primary')
+                generate_btn = gr.Button('Generate x0', variant='secondary', interactive=False)
+        with gr.Column():
+            audio = gr.Audio(interactive=False, label='Output Audio')
+            with gr.Accordion('Audio Settings', open=False):
+                with gr.Row():
+                    reduce_noise = gr.Slider(minimum=0, maximum=1, value=0.5, label='Reduce Noise', info='👻 Fix it in post: non-stationary noise reduction via spectral gating.')
+                with gr.Row():
+                    speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='⚡️ Adjust the speed of the audio. The settings below are auto-scaled by speed.')
+                with gr.Row():
+                    with gr.Column():
+                        opening_cut = gr.Slider(minimum=0, maximum=24000, value=4000, step=1000, label='Opening Cut', info='✂️ Zero out this many samples at the start.')
+                    with gr.Column():
+                        closing_cut = gr.Slider(minimum=0, maximum=24000, value=2000, step=1000, label='Closing Cut', info='✂️ Zero out this many samples at the end.')
+                with gr.Row():
+                    with gr.Column():
+                        ease_in = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='Ease In', info='🚀 Ease in for this many samples, after opening cut.')
+                    with gr.Column():
+                        ease_out = gr.Slider(minimum=0, maximum=24000, value=1000, step=1000, label='Ease Out', info='📐 Ease out for this many samples, before closing cut.')
+                with gr.Row():
+                    pad = gr.Slider(minimum=0, maximum=24000, value=5000, step=1000, label='Pad', info='🔇 How many samples of silence to insert between segments.')
+    with gr.Row():
+        segments = gr.Dataframe(headers=['#', 'Text', 'Tokens', 'Length'], row_count=(1, 'dynamic'), col_count=(4, 'fixed'), label='Segments', interactive=False, wrap=True)
+    segments.change(fn=did_change_segments, inputs=[segments], outputs=[segment_btn, generate_btn])
+    segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
+    generate_btn.click(lf_generate, inputs=[segments, voice, speed, reduce_noise, opening_cut, closing_cut, ease_in, ease_out, pad], outputs=[audio])
+
+with gr.Blocks() as app:
+    gr.TabbedInterface(
+        [basic_tts, lf_tts],
+        ['Basic TTS', 'Long-Form'],
+    )
+
 if __name__ == '__main__':
-    demo.launch()
+    app.launch()
 
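A closing note on the long-form pipeline added above: recursive_split() keeps halving a segment until its phoneme string fits under 511 characters, preferring sentence-final punctuation (optionally followed by closing quotes) before weaker boundaries and, last of all, plain spaces; resplit_strings() then picks the word boundary that best balances the two halves by character count. The splitting regex can be exercised on its own:

import re

punctuation = '!.?…'  # strongest split class; ':;' and ',—' are tried next
pattern = f'(?:(?<=[{punctuation}])|(?<=[{punctuation}]["\'»])|(?<=[{punctuation}]["\'»]["\'»])) '
print(re.split(pattern, 'He said "Stop!" Then he left. Done.'))
# -> ['He said "Stop!"', 'Then he left.', 'Done.']

Likewise, resplit_strings(['The', 'quick', 'brown', 'fox', 'jumps']) returns ('The quick', 'brown fox jumps'): 9 versus 15 characters is the closest achievable balance, with ties going to the earlier split point.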