Usage

Install Libraries

torch
soundfile
transformers
datasets>=3.5.0,<4.0.0
numpy==1.26.4
sentencepiece>=0.2.0

Use the Python script below for inference

import torch
import soundfile as sf
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import numpy as np
import json
import os

# Load the fine-tuned processor and acoustic model, plus the HiFi-GAN vocoder
# that generate_speech() passes to model.generate_speech().
processor = SpeechT5Processor.from_pretrained("danhtran2mind/Viet-SpeechT5-TTS-finetuning")
model = SpeechT5ForTextToSpeech.from_pretrained("danhtran2mind/Viet-SpeechT5-TTS-finetuning")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Speaker embeddings: generate_speech() indexes `embeddings_dataset` by row and
# reads the "xvector" field, so it must exist at module level before the call.
# NOTE(review): assumes the standard SpeechT5 x-vector set; confirm that rows
# 2000 and 7000 are the intended male/female speakers for this model.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

def generate_speech(text, voice, output_path="tests/test_output/tts_output.wav"):
    """Synthesize ``text`` to a WAV file using the selected speaker voice.

    Relies on the module-level ``processor``, ``model``, ``vocoder`` and
    ``embeddings_dataset`` objects being defined before the call.

    Args:
        text: Text to synthesize; must be non-empty.
        voice: Speaker selection, ``"male"`` or ``"female"``
            (case-insensitive, surrounding whitespace ignored).
        output_path: Destination WAV path. Missing parent directories are
            created before writing.

    Returns:
        ``(output_path, None)`` on success, or ``(None, error_message)`` when
        the inputs are invalid or synthesis/writing fails. Exceptions raised
        during synthesis or writing are caught and reported through the
        error message instead of propagating.
    """
    print(f"Generating speech for text: {text}, voice: {voice}, output: {output_path}")
    if not text or not voice:
        return None, "Please provide both text and voice selection."

    # Row index into `embeddings_dataset` used for each supported voice.
    speaker_dict = {"male": 2000, "female": 7000}
    # Validate up front so an unknown voice yields a clear message rather than
    # a bare KeyError repr from the generic handler below.
    speaker_id = speaker_dict.get(str(voice).strip().lower())
    if speaker_id is None:
        return None, f"Unsupported voice '{voice}'. Choose 'Male' or 'Female'."

    try:
        # unsqueeze(0) adds a leading dimension to the stored x-vector.
        speaker_embedding = torch.tensor(embeddings_dataset[speaker_id]["xvector"]).unsqueeze(0)
        inputs = processor(text=text, return_tensors="pt")

        with torch.no_grad():
            speech = model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings=speaker_embedding,
                vocoder=vocoder,
                attention_mask=inputs.get("attention_mask"),
            )

        # The default path points into a directory that may not exist yet;
        # create it so the write does not fail.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        sf.write(output_path, speech.numpy(), samplerate=16000)
        print(f"Audio saved to {output_path}")
        return output_path, None
    except Exception as e:
        # Boundary handler: callers expect an error tuple, not an exception.
        print(f"Error generating speech: {str(e)}")
        return None, f"Error generating speech: {str(e)}"

# Example run: replace the placeholder with the sentence to synthesize.
text = "<input_text>"
voice = "Female"  # choose "Male" or "Female"
generate_speech(
    text=text,
    voice=voice,
    output_path="tests/test_output/tts_output.wav",
)

Show the Output TTS Audio

# Build an IPython audio widget for the WAV file at the same path that the
# example call to generate_speech() writes to.
# NOTE(review): presumably meant to be the last expression of a notebook cell
# so the player is rendered; confirm when running outside a notebook.
from IPython.display import Audio
Audio("tests/test_output/tts_output.wav")

Downloads last month: 127

Safetensors

Model size

144M params

Tensor type

F32

Model tree for danhtran2mind/Viet-SpeechT5-TTS-finetuning

Base model

microsoft/speecht5_tts

Finetuned

(1240)

this model

Space using danhtran2mind/Viet-SpeechT5-TTS-finetuning 1

Collection including danhtran2mind/Viet-SpeechT5-TTS-finetuning

DanhTran2Mind's TTS

Collection

3 items • Updated Jul 22