Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -126,6 +126,12 @@ def generate_audio(
|
|
126 |
skip_speaking_rate: bool,
|
127 |
skip_emotion: bool,
|
128 |
skip_speaker: bool,
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
progress=gr.Progress(),
|
130 |
) -> Tuple[Tuple[int, np.ndarray[Any, Any]], int]:
|
131 |
"""
|
@@ -142,9 +148,15 @@ def generate_audio(
|
|
142 |
try:
|
143 |
wav_out = selected_pipeline(
|
144 |
text=text,
|
|
|
145 |
language=language,
|
146 |
reference_audio=speaker_audio,
|
|
|
|
|
|
|
147 |
prefix_audio=prefix_audio,
|
|
|
|
|
148 |
seed=seed,
|
149 |
max_chunk_length=max_chunk_length,
|
150 |
cross_fade_duration=cross_fade_duration,
|
@@ -176,7 +188,13 @@ def generate_audio(
|
|
176 |
output_format="float",
|
177 |
)
|
178 |
|
179 |
-
return (
|
|
|
|
|
|
|
|
|
|
|
|
|
180 |
finally:
|
181 |
selected_pipeline.off_progress()
|
182 |
|
@@ -186,6 +204,7 @@ if __name__ == "__main__":
|
|
186 |
with gr.Row():
|
187 |
with gr.Column(scale=3):
|
188 |
gr.Markdown(header_markdown)
|
|
|
189 |
gr.Image(
|
190 |
value="https://raw.githubusercontent.com/Zyphra/Zonos/refs/heads/main/assets/ZonosHeader.png",
|
191 |
container=False,
|
@@ -207,6 +226,10 @@ if __name__ == "__main__":
|
|
207 |
value="en-us",
|
208 |
label="Language",
|
209 |
)
|
|
|
|
|
|
|
|
|
210 |
|
211 |
with gr.Row():
|
212 |
if not is_hf_spaces:
|
@@ -260,6 +283,8 @@ if __name__ == "__main__":
|
|
260 |
label="Optional Prefix Audio (continue from this audio)",
|
261 |
type="filepath",
|
262 |
)
|
|
|
|
|
263 |
with gr.Column(scale=3):
|
264 |
cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="CFG Scale")
|
265 |
min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P")
|
@@ -274,8 +299,29 @@ if __name__ == "__main__":
|
|
274 |
with gr.Row(variant="panel", equal_height=True) as speaker_row:
|
275 |
with gr.Column():
|
276 |
speaker_uncond = gr.Checkbox(label="Skip Speaker")
|
277 |
-
speaker_noised_checkbox = gr.Checkbox(
|
278 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
279 |
speaker_audio = gr.Audio(
|
280 |
label="Optional Speaker Audio (for cloning)",
|
281 |
type="filepath",
|
@@ -398,8 +444,14 @@ if __name__ == "__main__":
|
|
398 |
speaking_rate_uncond,
|
399 |
emotion_uncond,
|
400 |
speaker_uncond,
|
|
|
|
|
|
|
|
|
|
|
|
|
401 |
],
|
402 |
outputs=[output_audio, seed_number],
|
403 |
)
|
404 |
-
|
405 |
demo.launch()
|
|
|
126 |
skip_speaking_rate: bool,
|
127 |
skip_emotion: bool,
|
128 |
skip_speaker: bool,
|
129 |
+
speaker_pitch_shift: float,
|
130 |
+
speaker_equalize: bool,
|
131 |
+
speaker_enhance: bool,
|
132 |
+
prefix_equalize: bool,
|
133 |
+
prefix_enhance: bool,
|
134 |
+
enhance: bool,
|
135 |
progress=gr.Progress(),
|
136 |
) -> Tuple[Tuple[int, np.ndarray[Any, Any]], int]:
|
137 |
"""
|
|
|
148 |
try:
|
149 |
wav_out = selected_pipeline(
|
150 |
text=text,
|
151 |
+
enhance=enhance,
|
152 |
language=language,
|
153 |
reference_audio=speaker_audio,
|
154 |
+
reference_audio_pitch_shift=speaker_pitch_shift,
|
155 |
+
equalize_reference_audio=speaker_equalize,
|
156 |
+
enhance_reference_audio=speaker_enhance,
|
157 |
prefix_audio=prefix_audio,
|
158 |
+
equalize_prefix_audio=prefix_equalize,
|
159 |
+
enhance_prefix_audio=prefix_enhance,
|
160 |
seed=seed,
|
161 |
max_chunk_length=max_chunk_length,
|
162 |
cross_fade_duration=cross_fade_duration,
|
|
|
188 |
output_format="float",
|
189 |
)
|
190 |
|
191 |
+
return (
|
192 |
+
(
|
193 |
+
48000 if enhance else 44100,
|
194 |
+
wav_out.squeeze().numpy()
|
195 |
+
),
|
196 |
+
seed
|
197 |
+
)
|
198 |
finally:
|
199 |
selected_pipeline.off_progress()
|
200 |
|
|
|
204 |
with gr.Row():
|
205 |
with gr.Column(scale=3):
|
206 |
gr.Markdown(header_markdown)
|
207 |
+
|
208 |
gr.Image(
|
209 |
value="https://raw.githubusercontent.com/Zyphra/Zonos/refs/heads/main/assets/ZonosHeader.png",
|
210 |
container=False,
|
|
|
226 |
value="en-us",
|
227 |
label="Language",
|
228 |
)
|
229 |
+
enhanced_checkbox = gr.Checkbox(
|
230 |
+
value=True,
|
231 |
+
label="Enhance Output with DeepFilterNet"
|
232 |
+
)
|
233 |
|
234 |
with gr.Row():
|
235 |
if not is_hf_spaces:
|
|
|
283 |
label="Optional Prefix Audio (continue from this audio)",
|
284 |
type="filepath",
|
285 |
)
|
286 |
+
prefix_equalized_checkbox = gr.Checkbox(label="Equalize Prefix Audio", value=True)
|
287 |
+
prefix_enhanced_checkbox = gr.Checkbox(label="Enhance Prefix Audio with DeepFilterNet", value=True)
|
288 |
with gr.Column(scale=3):
|
289 |
cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="CFG Scale")
|
290 |
min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P")
|
|
|
299 |
with gr.Row(variant="panel", equal_height=True) as speaker_row:
|
300 |
with gr.Column():
|
301 |
speaker_uncond = gr.Checkbox(label="Skip Speaker")
|
302 |
+
speaker_noised_checkbox = gr.Checkbox(
|
303 |
+
label="Denoise Speaker",
|
304 |
+
value=False,
|
305 |
+
info="Note; this is not a pre-processing step, it is a conditioning value that the model understands. Check this box if your input audio is noisy."
|
306 |
+
)
|
307 |
+
speaker_equalized_checkbox = gr.Checkbox(label="Equalize Speaker Audio", value=True)
|
308 |
+
speaker_enhanced_checkbox = gr.Checkbox(label="Enhance Speaker Audio with DeepFilterNet", value=True)
|
309 |
+
|
310 |
+
def on_enhanced_change(use_enhance: bool) -> Dict[str, Any]:
    """Build the component update emitted when the speaker-enhance box toggles.

    The returned update is routed to the checkbox listed in the ``outputs`` of
    the ``.change(...)`` registration that follows this function
    (``speaker_noised_checkbox``, labelled "Denoise Speaker").

    Args:
        use_enhance: Current value of the speaker-enhancement checkbox.

    Returns:
        The dict produced by ``gr.update``: the target is made non-interactive
        and forced to ``False`` while enhancement is on, and made interactive
        again (value left untouched) when enhancement is off.
    """
    # Gradio components expose `interactive`, not `enabled`; with the old key
    # the target checkbox was never actually locked.
    update_dict: Dict[str, Any] = {"interactive": not use_enhance}
    if use_enhance:
        # Enhancement and the denoise conditioning flag are treated as
        # mutually exclusive here, so clear the flag when enhancement is on.
        update_dict["value"] = False
    return gr.update(**update_dict)
|
315 |
+
|
316 |
+
# Keep "Denoise Speaker" in sync with the speaker-enhance toggle.
# The checkbox is created as `speaker_enhanced_checkbox`; the previous
# spelling `speaker_enhance_checkbox` was never defined and raised NameError
# while the UI was being built.
speaker_enhanced_checkbox.change(
    fn=on_enhanced_change,
    inputs=[speaker_enhanced_checkbox],
    outputs=[speaker_noised_checkbox],
)
|
321 |
+
speaker_pitch_shift = gr.Slider(
|
322 |
+
-1200, 1200, -44.99, 0.01, label="Speaker Pitch Shift (Cents)",
|
323 |
+
info="A pitch shift to apply to speaker audio before extracting embeddings. A slight down-shift of ~45 cents tends to produce a more accurate voice cloning."
|
324 |
+
)
|
325 |
speaker_audio = gr.Audio(
|
326 |
label="Optional Speaker Audio (for cloning)",
|
327 |
type="filepath",
|
|
|
444 |
speaking_rate_uncond,
|
445 |
emotion_uncond,
|
446 |
speaker_uncond,
|
447 |
+
speaker_pitch_shift,
|
448 |
+
speaker_equalized_checkbox,
|
449 |
+
speaker_enhanced_checkbox,
|
450 |
+
prefix_equalized_checkbox,
|
451 |
+
prefix_enhanced_checkbox,
|
452 |
+
enhanced_checkbox,
|
453 |
],
|
454 |
outputs=[output_audio, seed_number],
|
455 |
)
|
456 |
+
|
457 |
demo.launch()
|