Spaces:

LiKenun
/

ai-building-blocks

Running on Zero

App Files Files Community

ai-building-blocks / text_to_speech.py

LiKenun

Add documentation

5bebd85 about 1 month ago

raw

history blame

1.97 kB

	import gc
	from os import getenv
	import gradio as gr
	from transformers import pipeline
	from utils import spaces_gpu


	@spaces_gpu
	def text_to_speech(text: str) -> tuple[int, bytes]:
	    """Synthesize speech audio for ``text`` with a text-to-speech pipeline.

	    A fresh ``transformers`` pipeline is built on every call, run once on
	    ``text``, and then released. The release step sits in a ``finally``
	    clause so the pipeline reference is dropped and a garbage-collection
	    pass runs even when inference raises.

	    Args:
	        text: Input text to convert to speech.

	    Returns:
	        A ``(sampling_rate, audio)`` tuple built from the pipeline result:
	        ``result["sampling_rate"]`` and the first element of
	        ``result["audio"]``. ``create_text_to_speech_tab`` routes this
	        tuple into a ``gr.Audio`` output.

	    Raises:
	        Exception: Anything raised while building or running the pipeline
	            propagates unchanged (after cleanup, for inference errors).

	    Note:
	        - The model ID is read from the ``TEXT_TO_SPEECH_MODEL``
	          environment variable; when it is unset, ``None`` is passed to
	          ``pipeline`` as the model argument.
	        - ``use_safetensors=True`` is forwarded through ``model_kwargs``.
	        - NOTE(review): the ``bytes`` annotation is kept for interface
	          stability; the pipeline presumably yields a NumPy array rather
	          than ``bytes`` -- confirm against the transformers docs.
	        - NOTE(review): no device is passed here, so device placement is
	          left to ``pipeline`` / the ``spaces_gpu`` decorator -- confirm.
	        - Cleanup here is ``del`` + ``gc.collect()`` only; no explicit
	          accelerator cache flush is performed in this function.
	    """
	    narrator = pipeline(
	        "text-to-speech",
	        getenv("TEXT_TO_SPEECH_MODEL"),
	        model_kwargs={"use_safetensors": True},  # Use safetensors to avoid torch.load restriction.
	    )
	    try:
	        result = narrator(text)
	    finally:
	        # Drop the local reference and collect on success and failure alike,
	        # so the pipeline is eligible for collection before we return or
	        # re-raise (a propagating traceback would otherwise keep it alive).
	        del narrator
	        gc.collect()
	    return (result["sampling_rate"], result["audio"][0])


	def create_text_to_speech_tab():
	    """Build the widgets that make up the text-to-speech tab.

	    Components are created in display order: a short Markdown description,
	    a textbox for the input text, a "Generate" button, and an audio output.
	    The button's click event is then bound to ``text_to_speech`` so the
	    textbox value is passed in and the result is shown in the audio
	    component.
	    """
	    gr.Markdown("Generate speech from text.")
	    prompt_box = gr.Textbox(label="Text")
	    generate_button = gr.Button("Generate")
	    speech_player = gr.Audio(label="Speech")

	    # Textbox value in, synthesized audio out.
	    generate_button.click(text_to_speech, inputs=prompt_box, outputs=speech_player)