benjamin-paine committed on
Commit
31b7f65
·
verified ·
1 Parent(s): 3d5d0d0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -4
app.py CHANGED
@@ -126,6 +126,12 @@ def generate_audio(
126
  skip_speaking_rate: bool,
127
  skip_emotion: bool,
128
  skip_speaker: bool,
 
 
 
 
 
 
129
  progress=gr.Progress(),
130
  ) -> Tuple[Tuple[int, np.ndarray[Any, Any]], int]:
131
  """
@@ -142,9 +148,15 @@ def generate_audio(
142
  try:
143
  wav_out = selected_pipeline(
144
  text=text,
 
145
  language=language,
146
  reference_audio=speaker_audio,
 
 
 
147
  prefix_audio=prefix_audio,
 
 
148
  seed=seed,
149
  max_chunk_length=max_chunk_length,
150
  cross_fade_duration=cross_fade_duration,
@@ -176,7 +188,13 @@ def generate_audio(
176
  output_format="float",
177
  )
178
 
179
- return (44100, wav_out.squeeze().numpy()), seed
 
 
 
 
 
 
180
  finally:
181
  selected_pipeline.off_progress()
182
 
@@ -186,6 +204,7 @@ if __name__ == "__main__":
186
  with gr.Row():
187
  with gr.Column(scale=3):
188
  gr.Markdown(header_markdown)
 
189
  gr.Image(
190
  value="https://raw.githubusercontent.com/Zyphra/Zonos/refs/heads/main/assets/ZonosHeader.png",
191
  container=False,
@@ -207,6 +226,10 @@ if __name__ == "__main__":
207
  value="en-us",
208
  label="Language",
209
  )
 
 
 
 
210
 
211
  with gr.Row():
212
  if not is_hf_spaces:
@@ -260,6 +283,8 @@ if __name__ == "__main__":
260
  label="Optional Prefix Audio (continue from this audio)",
261
  type="filepath",
262
  )
 
 
263
  with gr.Column(scale=3):
264
  cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="CFG Scale")
265
  min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P")
@@ -274,8 +299,29 @@ if __name__ == "__main__":
274
  with gr.Row(variant="panel", equal_height=True) as speaker_row:
275
  with gr.Column():
276
  speaker_uncond = gr.Checkbox(label="Skip Speaker")
277
- speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker", value=False)
278
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
  speaker_audio = gr.Audio(
280
  label="Optional Speaker Audio (for cloning)",
281
  type="filepath",
@@ -398,8 +444,14 @@ if __name__ == "__main__":
398
  speaking_rate_uncond,
399
  emotion_uncond,
400
  speaker_uncond,
 
 
 
 
 
 
401
  ],
402
  outputs=[output_audio, seed_number],
403
  )
404
-
405
  demo.launch()
 
126
  skip_speaking_rate: bool,
127
  skip_emotion: bool,
128
  skip_speaker: bool,
129
+ speaker_pitch_shift: float,
130
+ speaker_equalize: bool,
131
+ speaker_enhance: bool,
132
+ prefix_equalize: bool,
133
+ prefix_enhance: bool,
134
+ enhance: bool,
135
  progress=gr.Progress(),
136
  ) -> Tuple[Tuple[int, np.ndarray[Any, Any]], int]:
137
  """
 
148
  try:
149
  wav_out = selected_pipeline(
150
  text=text,
151
+ enhance=enhance,
152
  language=language,
153
  reference_audio=speaker_audio,
154
+ reference_audio_pitch_shift=speaker_pitch_shift,
155
+ equalize_reference_audio=speaker_equalize,
156
+ enhance_reference_audio=speaker_enhance,
157
  prefix_audio=prefix_audio,
158
+ equalize_prefix_audio=prefix_equalize,
159
+ enhance_prefix_audio=prefix_enhance,
160
  seed=seed,
161
  max_chunk_length=max_chunk_length,
162
  cross_fade_duration=cross_fade_duration,
 
188
  output_format="float",
189
  )
190
 
191
+ return (
192
+ (
193
+ 48000 if enhance else 44100,
194
+ wav_out.squeeze().numpy()
195
+ ),
196
+ seed
197
+ )
198
  finally:
199
  selected_pipeline.off_progress()
200
 
 
204
  with gr.Row():
205
  with gr.Column(scale=3):
206
  gr.Markdown(header_markdown)
207
+
208
  gr.Image(
209
  value="https://raw.githubusercontent.com/Zyphra/Zonos/refs/heads/main/assets/ZonosHeader.png",
210
  container=False,
 
226
  value="en-us",
227
  label="Language",
228
  )
229
+ enhanced_checkbox = gr.Checkbox(
230
+ value=True,
231
+ label="Enhance Output with DeepFilterNet"
232
+ )
233
 
234
  with gr.Row():
235
  if not is_hf_spaces:
 
283
  label="Optional Prefix Audio (continue from this audio)",
284
  type="filepath",
285
  )
286
+ prefix_equalized_checkbox = gr.Checkbox(label="Equalize Prefix Audio", value=True)
287
+ prefix_enhanced_checkbox = gr.Checkbox(label="Enhance Prefix Audio with DeepFilterNet", value=True)
288
  with gr.Column(scale=3):
289
  cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="CFG Scale")
290
  min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P")
 
299
  with gr.Row(variant="panel", equal_height=True) as speaker_row:
300
  with gr.Column():
301
  speaker_uncond = gr.Checkbox(label="Skip Speaker")
302
+ speaker_noised_checkbox = gr.Checkbox(
303
+ label="Denoise Speaker",
304
+ value=False,
305
+ info="Note; this is not a pre-processing step, it is a conditioning value that the model understands. Check this box if your input audio is noisy."
306
+ )
307
+ speaker_equalized_checkbox = gr.Checkbox(label="Equalize Speaker Audio", value=True)
308
+ speaker_enhanced_checkbox = gr.Checkbox(label="Enhance Speaker Audio with DeepFilterNet", value=True)
309
+
310
def on_enhanced_change(use_enhance: bool) -> Dict[str, Any]:
    """
    Build the Gradio update applied to the target checkbox when the
    speaker-enhancement checkbox changes.

    When enhancement is switched on, the target checkbox is unchecked and
    made non-interactive. When it is switched off, the target becomes
    interactive again and its current value is left untouched.

    :param use_enhance: Current state of the speaker-enhancement checkbox.
    :return: The update payload produced by ``gr.update``.
    """
    # Gradio locks/unlocks a component through its `interactive` property;
    # `enabled` is not a Checkbox property, so it cannot disable the control.
    update_dict: Dict[str, Any] = {"interactive": not use_enhance}
    if use_enhance:
        # Enhancement supersedes the checkbox, so clear it as well as lock it.
        update_dict["value"] = False
    return gr.update(**update_dict)
315
+
316
# Re-evaluate the "Denoise Speaker" checkbox whenever the speaker-enhancement
# checkbox changes. The name must match the one the checkbox is created
# under (`speaker_enhanced_checkbox`); `speaker_enhance_checkbox` is not
# defined in the visible code and would raise NameError when the UI is built.
speaker_enhanced_checkbox.change(
    fn=on_enhanced_change,
    inputs=[speaker_enhanced_checkbox],
    outputs=[speaker_noised_checkbox],
)
321
+ speaker_pitch_shift = gr.Slider(
322
+ -1200, 1200, -44.99, 0.01, label="Speaker Pitch Shift (Cents)",
323
+ info="A pitch shift to apply to speaker audio before extracting embeddings. A slight down-shift of ~45 cents tends to produce a more accurate voice cloning."
324
+ )
325
  speaker_audio = gr.Audio(
326
  label="Optional Speaker Audio (for cloning)",
327
  type="filepath",
 
444
  speaking_rate_uncond,
445
  emotion_uncond,
446
  speaker_uncond,
447
+ speaker_pitch_shift,
448
+ speaker_equalized_checkbox,
449
+ speaker_enhanced_checkbox,
450
+ prefix_equalized_checkbox,
451
+ prefix_enhanced_checkbox,
452
+ enhanced_checkbox,
453
  ],
454
  outputs=[output_audio, seed_number],
455
  )
456
+
457
  demo.launch()