# ai-building-blocks/text_to_speech.py
import gc
from functools import partial
import gradio as gr
import numpy as np
import torch
from transformers import pipeline
from utils import get_pytorch_device, spaces_gpu, get_torch_dtype


@spaces_gpu
def text_to_speech(model: str, text: str) -> tuple[int, np.ndarray]:
"""Convert text to speech audio using a TTS (Text-to-Speech) model.
This function uses a transformer pipeline to generate speech audio from
text input. The model is loaded, inference is performed, and then cleaned
up to free GPU memory.
Args:
model: Hugging Face model ID to use for text-to-speech.
text: Input text string to convert to speech.
Returns:
Tuple containing:
- int: Sampling rate of the generated audio (e.g., 22050 Hz)
- bytes: Raw audio data as bytes
Note:
- Uses safetensors for secure model loading.
- Automatically selects the best available device (CUDA/XPU/MPS/CPU).
- Cleans up model and GPU memory after inference.
- Returns audio in format compatible with Gradio Audio component.
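
    Example:
        A minimal usage sketch; the model ID below is a placeholder, not a
        model referenced anywhere in this repository::

            sampling_rate, waveform = text_to_speech(
                "your-org/your-tts-model", "Hello, world!"
            )
            # The (sampling_rate, waveform) tuple can be passed straight to gr.Audio.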
"""
pytorch_device = get_pytorch_device()
dtype = get_torch_dtype()
    model_kwargs = {"use_safetensors": True}  # Use safetensors to avoid the torch.load restriction.
if dtype is not None:
model_kwargs["dtype"] = dtype
narrator = pipeline(
"text-to-speech",
model,
        device=pytorch_device,  # pipeline() accepts a device string such as "cuda", "mps", or "cpu".
model_kwargs=model_kwargs
)
    # Gradient calculations are unnecessary during inference. torch.no_grad() avoids storing
    # activations for backpropagation, which significantly reduces memory use while generating.
    with torch.no_grad():
result = narrator(text)
# Clean up GPU memory
del narrator
if pytorch_device == "cuda":
torch.cuda.empty_cache()
gc.collect()
    return (result["sampling_rate"], result["audio"][0])


def create_text_to_speech_tab(model: str):
"""Create the text-to-speech tab in the Gradio interface.
This function sets up all UI components for text-to-speech generation,
including input textbox, generate button, and output audio player.
Args:
model: Hugging Face model ID to use for text-to-speech.
"""
gr.Markdown("Generate speech from text.")
text_to_speech_text = gr.Textbox(label="Text")
text_to_speech_generate_button = gr.Button("Generate")
text_to_speech_output = gr.Audio(label="Speech")
text_to_speech_generate_button.click(
fn=partial(text_to_speech, model),
inputs=text_to_speech_text,
outputs=text_to_speech_output
)
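

# A minimal launch sketch, assuming this module is run directly. The gr.Blocks/gr.Tab
# composition and the model ID below are illustrative placeholders; in the actual app,
# this tab is presumably assembled by a separate entry-point module.
if __name__ == "__main__":
    with gr.Blocks() as demo:
        with gr.Tab("Text to Speech"):
            create_text_to_speech_tab("your-org/your-tts-model")  # placeholder model ID
    demo.launch()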