Spaces:

LiKenun
/

ai-building-blocks

Running on Zero

File size: 2,048 Bytes

caf2559
1c1b97a
39d9406
caf2559
 
 
02c9b64
caf2559
1c1b97a
5bebd85
 
 
 
 
 
 
1c1b97a
5bebd85
 
 
 
 
 
 
 
 
 
 
 
 
4c71b8b
 
1c1b97a
4c71b8b
 
 
caf2559
 
 
39d9406
 
55d79e2
5bebd85
 
 
 
55d79e2
 
 
5bebd85
39d9406
 
 
 
 
55d79e2
39d9406

import gc
from functools import partial
import gradio as gr
from transformers import pipeline
from utils import spaces_gpu


@spaces_gpu
def text_to_speech(model: str, text: str) -> tuple[int, bytes]:
    """Convert text to speech audio using a TTS (Text-to-Speech) model.

    A "text-to-speech" transformers pipeline is built for ``model`` on every
    call, run once on ``text``, and then released so the model does not stay
    referenced between requests.

    Args:
        model: Hugging Face model ID to use for text-to-speech.
        text: Input text string to convert to speech.

    Returns:
        Tuple ``(sampling_rate, audio)`` built from the pipeline result's
        ``"sampling_rate"`` and ``"audio"`` entries. When the audio value has
        more than one dimension, only its first row is returned.
        NOTE(review): the annotation says ``bytes``, but the value is passed
        through from the pipeline unchanged (presumably a NumPy waveform
        array); confirm and tighten the annotation.

    Raises:
        Whatever the pipeline raises while loading the model or running
        inference; nothing is caught here.

    Note:
        - Requests safetensors weights via ``model_kwargs``.
        - No device is passed, so placement is left to the pipeline default.
        - Cleanup is limited to dropping the pipeline reference and running
          ``gc.collect()``; it happens even when inference fails.
    """
    narrator = pipeline(
        "text-to-speech",
        model,
        model_kwargs={"use_safetensors": True},  # Use safetensors to avoid torch.load restriction.
    )
    try:
        result = narrator(text)
    finally:
        # Release the model whether or not inference succeeded; otherwise a
        # failing request keeps the pipeline referenced by this frame.
        del narrator
        gc.collect()

    audio = result["audio"]
    # Only strip a leading batch/channel axis when there is one. Indexing a
    # flat (samples,) waveform with [0] would return a single sample.
    # Values without an ``ndim`` attribute keep the original [0] behavior.
    if getattr(audio, "ndim", 2) > 1:
        audio = audio[0]
    return (result["sampling_rate"], audio)


def create_text_to_speech_tab(model: str):
    """Build the text-to-speech tab's widgets and wire up its button.

    Adds, in order, a short description, a textbox for the input text, a
    "Generate" button and an audio output. Clicking the button calls
    ``text_to_speech`` with ``model`` pre-bound and the textbox value as the
    text, and sends the result to the audio output.

    Args:
        model: Hugging Face model ID to use for text-to-speech.
    """
    # Bind the model up front so the click handler only takes the text.
    synthesize = partial(text_to_speech, model)

    # Components are created in display order.
    gr.Markdown("Generate speech from text.")
    text_input = gr.Textbox(label="Text")
    generate_button = gr.Button("Generate")
    speech_output = gr.Audio(label="Speech")

    generate_button.click(
        fn=synthesize,
        inputs=text_input,
        outputs=speech_output,
    )