DanhTran2Mind's TTS
Collection
3 items
•
Updated
Install Libraries
torch
soundfile
transformers
datasets>=3.5.0,<4.0.0
numpy==1.26.4
sentencepiece>=0.2.0
Use the Python script below for inference
import torch
import soundfile as sf
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import numpy as np
import json
import os
# Load processor, model, and vocoder
processor = SpeechT5Processor.from_pretrained("danhtran2mind/Viet-SpeechT5-TTS-finetuning")
model = SpeechT5ForTextToSpeech.from_pretrained("danhtran2mind/Viet-SpeechT5-TTS-finetuning")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
# Load the speaker x-vectors that generate_speech() looks up by row index.
# Without this assignment `embeddings_dataset` is undefined and every call to
# generate_speech() fails with a NameError.
# NOTE(review): this is the x-vector set used in the SpeechT5 documentation;
# confirm it is the one the fine-tuned checkpoint expects.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
def generate_speech(text, voice, output_path="tests/test_output/tts_output.wav"):
    """Synthesize ``text`` with the selected voice and save it as a WAV file.

    Uses the module-level ``processor``, ``model``, ``vocoder`` and
    ``embeddings_dataset`` objects.

    Args:
        text: Sentence to synthesize.
        voice: ``"male"`` or ``"female"`` (case-insensitive).
        output_path: Destination WAV path. Missing parent directories are
            created before writing.

    Returns:
        ``(output_path, None)`` on success, or ``(None, error_message)`` when
        the input is invalid or synthesis fails.
    """
    print(f"Generating speech for text: {text}, voice: {voice}, output: {output_path}")
    if not text or not voice:
        return None, "Please provide both text and voice selection."

    # Row indices into ``embeddings_dataset`` used as reference speakers.
    speaker_dict = {"male": 2000, "female": 7000}

    # Validate the voice up front so a typo yields a readable message instead
    # of the bare KeyError text the generic handler below would produce.
    speaker_id = speaker_dict.get(str(voice).strip().lower())
    if speaker_id is None:
        choices = ", ".join(speaker_dict)
        message = (
            f"Error generating speech: unknown voice {voice!r}; "
            f"choose one of: {choices}"
        )
        print(message)
        return None, message

    try:
        speaker_embedding = torch.tensor(
            embeddings_dataset[speaker_id]["xvector"]
        ).unsqueeze(0)
        inputs = processor(text=text, return_tensors="pt")
        with torch.no_grad():
            speech = model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings=speaker_embedding,
                vocoder=vocoder,
                attention_mask=inputs.get("attention_mask"),
            )

        # The default path lives in a directory that may not exist yet;
        # create it so the write does not fail on a fresh checkout.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        sf.write(output_path, speech.numpy(), samplerate=16000)
        print(f"Audio saved to {output_path}")
        return output_path, None
    except Exception as e:
        # Boundary handler: callers expect an (result, error) tuple rather
        # than an exception, so report the failure and return it.
        print(f"Error generating speech: {str(e)}")
        return None, f"Error generating speech: {str(e)}"
# Example run: replace the placeholder with the sentence to synthesize.
text = "<input_text>"
# Voice selection: choose "Male" or "Female".
voice = "Female"
generate_speech(
    text,
    voice,
    output_path="tests/test_output/tts_output.wav",
)
Show the Output TTS Audio
from IPython.display import Audio
# Build an audio widget for the file written by the example call above.
# NOTE(review): this is left as a bare expression, presumably so a notebook
# renders the player as the cell's result; assigning it to a name would
# suppress that display.
Audio("tests/test_output/tts_output.wav")
Base model
microsoft/speecht5_tts