Spaces:
Running
on
Zero
Running
on
Zero
File size: 2,048 Bytes
caf2559 1c1b97a 39d9406 caf2559 02c9b64 caf2559 1c1b97a 5bebd85 1c1b97a 5bebd85 4c71b8b 1c1b97a 4c71b8b caf2559 39d9406 55d79e2 5bebd85 55d79e2 5bebd85 39d9406 55d79e2 39d9406 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
import gc
from functools import partial
import gradio as gr
from transformers import pipeline
from utils import spaces_gpu
@spaces_gpu
def text_to_speech(model: str, text: str) -> tuple[int, bytes]:
    """Synthesize speech for ``text`` with a Hugging Face text-to-speech pipeline.

    A fresh pipeline is built on every call, run once on the input text, and
    then dropped and garbage-collected so the model does not stay referenced
    between requests.

    Args:
        model: Hugging Face model ID to use for text-to-speech.
        text: Input text string to convert to speech.

    Returns:
        A ``(sampling_rate, audio)`` tuple taken from the pipeline result:
        the ``"sampling_rate"`` entry and the first element of the
        ``"audio"`` entry.

    Note:
        - The model is loaded with ``use_safetensors=True``.
        - No ``device`` argument is passed here; device placement is left to
          the pipeline defaults and, presumably, the ``spaces_gpu``
          decorator -- verify against ``utils.spaces_gpu``.
        - Cleanup runs in a ``finally`` block, so the pipeline reference is
          released and ``gc.collect()`` is invoked even when inference
          raises. No explicit accelerator cache flush is performed here.
        - NOTE(review): the return annotation says ``bytes``, but the
          transformers text-to-audio pipeline documents its audio output as
          a NumPy array -- confirm before tightening the annotation.
    """
    narrator = pipeline(
        "text-to-speech",
        model,
        model_kwargs={"use_safetensors": True},  # Use safetensors to avoid torch.load restriction.
    )
    try:
        result = narrator(text)
    finally:
        # Drop the only local reference to the pipeline before collecting, so
        # a failed inference does not keep the loaded model alive through the
        # propagating exception's frame.
        del narrator
        gc.collect()
    return (result["sampling_rate"], result["audio"][0])
def create_text_to_speech_tab(model: str):
    """Build the text-to-speech UI: prompt textbox, button, and audio output.

    The components are created in display order (intro text, text input,
    "Generate" button, audio player) and the button's click event is wired
    to ``text_to_speech`` with ``model`` pre-bound, so the handler receives
    only the textbox value.

    NOTE(review): presumably called inside an active Gradio layout context
    (e.g. a tab within ``gr.Blocks``) -- confirm at the call site.

    Args:
        model: Hugging Face model ID to use for text-to-speech.
    """
    gr.Markdown("Generate speech from text.")
    prompt_box = gr.Textbox(label="Text")
    generate_button = gr.Button("Generate")
    speech_player = gr.Audio(label="Speech")

    # Bind the model ID up front; Gradio then supplies the textbox value as
    # the single remaining positional argument.
    synthesize = partial(text_to_speech, model)
    generate_button.click(
        fn=synthesize,
        inputs=prompt_box,
        outputs=speech_player,
    )
|