ai-building-blocks / text_to_speech.py
LiKenun's picture
Add documentation
5bebd85
raw
history blame
1.97 kB
import gc
from os import getenv
import gradio as gr
from transformers import pipeline
from utils import spaces_gpu
@spaces_gpu
def text_to_speech(text: str) -> tuple[int, bytes]:
    """Synthesize speech audio for ``text`` with a transformers TTS pipeline.

    A fresh ``"text-to-speech"`` pipeline is built on every call, run once on
    ``text``, and then released: the local reference is deleted and a garbage
    collection pass is forced, whether or not inference succeeded.

    Args:
        text: Input text string to convert to speech.

    Returns:
        A ``(sampling_rate, audio)`` tuple, where ``sampling_rate`` is the
        pipeline result's ``"sampling_rate"`` entry and ``audio`` is the first
        element of its ``"audio"`` entry.

    Note:
        - The model ID is read from the TEXT_TO_SPEECH_MODEL environment
          variable. If it is unset, ``getenv`` returns ``None`` and that value
          is passed to ``pipeline`` as the model.
          NOTE(review): transformers presumably falls back to its default
          model for the task in that case -- confirm this is intended.
        - ``use_safetensors=True`` is forwarded to the model loader.
        - No device is passed explicitly; placement is left to ``pipeline``
          and to the ``spaces_gpu`` decorator imported from ``utils``.
        - NOTE(review): the return annotation says ``bytes``, but the
          transformers text-to-audio pipeline documents ``"audio"`` as a NumPy
          array -- confirm and adjust the annotation if so.
    """
    narrator = pipeline(
        "text-to-speech",
        getenv("TEXT_TO_SPEECH_MODEL"),
        model_kwargs={"use_safetensors": True},  # Use safetensors to avoid torch.load restriction.
    )
    try:
        result = narrator(text)
    finally:
        # Release the pipeline even when inference raises, so a failed request
        # does not skip the cleanup that a successful one performs.
        del narrator
        gc.collect()
    return (result["sampling_rate"], result["audio"][0])
def create_text_to_speech_tab():
    """Build the text-to-speech section of the Gradio interface.

    Creates, in order, a short Markdown description, a textbox for the text
    to be spoken, a "Generate" button, and an audio output component. The
    button's click event is then wired to ``text_to_speech``, taking the
    textbox as its input and the audio component as its output.

    Returns nothing; the components are created for their side effect on the
    surrounding Gradio layout.
    NOTE(review): presumably called inside an active ``gr.Blocks``/tab
    context -- confirm against the caller.
    """
    gr.Markdown("Generate speech from text.")

    # Components are instantiated in display order: input, trigger, output.
    prompt_box = gr.Textbox(label="Text")
    generate_button = gr.Button("Generate")
    speech_player = gr.Audio(label="Speech")

    # Clicking the button runs synthesis on the textbox contents and hands
    # the result to the audio component.
    generate_button.click(
        fn=text_to_speech,
        inputs=prompt_box,
        outputs=speech_player,
    )