|
import gradio as gr |
|
import torch |
|
from TTS.api import TTS |
|
import os |
|
import spaces |
|
import tempfile |
|
from pymongo import MongoClient |
|
from dotenv import load_dotenv |
|
from huggingface_hub import hf_hub_download |
|
from transformers import AutoTokenizer |
|
|
|
|
|
load_dotenv() |
|
|
|
|
|
mongodb_uri = os.getenv('MONGODB_URI') |
|
hf_token = os.getenv('HF_TOKEN') |
|
|
|
|
|
client = MongoClient(mongodb_uri) |
|
db = client['mitra'] |
|
voices_collection = db['voices'] |
|
|
|
os.environ["COQUI_TOS_AGREED"] = "1" |
|
|
|
device = "cuda" if torch.cuda.is_available() else "cpu" |
|
|
|
|
|
def load_tts_model(): |
|
return TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device) |
|
|
|
tts = load_tts_model() |
|
|
|
|
|
def get_celebrity_voices(): |
|
voices = {} |
|
for category in voices_collection.find(): |
|
for voice in category['voices']: |
|
voices[voice['name']] = f"voices/{voice['name']}.mp3" |
|
return voices |
|
|
|
celebrity_voices = get_celebrity_voices() |
|
|
|
def check_voice_files(): |
|
""" |
|
Checks if all voice files exist in the Hugging Face repository. |
|
Returns a message listing missing files or confirming all files are present. |
|
""" |
|
missing = [] |
|
for voice, path in celebrity_voices.items(): |
|
try: |
|
hf_hub_download(repo_id="nikkmitra/clone", filename=path, repo_type="space", token=hf_token) |
|
except Exception: |
|
missing.append(f"{voice}: {path}") |
|
if missing: |
|
return "**Missing Voice Files:**\n" + "\n".join(missing) |
|
else: |
|
return "**All voice files are present.** 🎉" |
|
|
|
|
|
|
|
|
|
def split_text_into_chunks(text, max_tokens=100, language="en"): |
|
""" |
|
Splits the input text into chunks with a maximum of `max_tokens` tokens each. |
|
Inserts a newline after each chunk. |
|
Uses a specialized tokenizer for Hindi language. |
|
""" |
|
|
|
chunks = [] |
|
for i in range(0, len(tokens), max_tokens): |
|
chunk = ' '.join(tokens[i:i + max_tokens]) |
|
chunks.append(chunk) |
|
return '\n'.join(chunks) |
|
|
|
@spaces.GPU(duration=120) |
|
def tts_generate(text, voice, language): |
|
|
|
|
|
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio: |
|
temp_audio_path = temp_audio.name |
|
|
|
try: |
|
voice_file = hf_hub_download(repo_id="nikkmitra/clone", filename=celebrity_voices[voice], repo_type="space", token=hf_token) |
|
except Exception as e: |
|
return f"Error downloading voice file: {e}" |
|
|
|
try: |
|
tts.tts_to_file( |
|
text=text, |
|
speaker_wav=voice_file, |
|
language=language, |
|
file_path=temp_audio_path |
|
) |
|
except AssertionError as ae: |
|
return f"Error: {ae}" |
|
except Exception as e: |
|
return f"An unexpected error occurred: {e}" |
|
|
|
return temp_audio_path |
|
|
|
@spaces.GPU(duration=120) |
|
def clone_voice(text, audio_file, language): |
|
print("cloning") |
|
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio: |
|
temp_audio_path = temp_audio.name |
|
|
|
try: |
|
tts.tts_to_file( |
|
text=text, |
|
speaker_wav=audio_file, |
|
language=language, |
|
file_path=temp_audio_path |
|
) |
|
except AssertionError as ae: |
|
return f"Error: {ae}" |
|
except Exception as e: |
|
return f"An unexpected error occurred: {e}" |
|
|
|
return temp_audio_path |
|
|
|
|
|
with gr.Blocks() as demo: |
|
gr.Markdown("# Advanced Voice Synthesis") |
|
|
|
|
|
voice_status = check_voice_files() |
|
gr.Markdown(voice_status) |
|
|
|
with gr.Tabs(): |
|
with gr.TabItem("TTS"): |
|
with gr.Row(): |
|
tts_text = gr.Textbox(label="Text to speak") |
|
tts_voice = gr.Dropdown(choices=list(celebrity_voices.keys()), label="Celebrity Voice") |
|
tts_language = gr.Dropdown(["en", "es", "fr", "de", "it", "ar","hi"], label="Language", value="en") |
|
tts_generate_btn = gr.Button("Generate") |
|
tts_output = gr.Audio(label="Generated Audio") |
|
|
|
tts_generate_btn.click( |
|
tts_generate, |
|
inputs=[tts_text, tts_voice, tts_language], |
|
outputs=tts_output |
|
) |
|
|
|
with gr.TabItem("Clone Voice"): |
|
with gr.Row(): |
|
clone_text = gr.Textbox(label="Text to speak") |
|
clone_audio = gr.Audio(label="Voice reference audio file", type="filepath") |
|
clone_language = gr.Dropdown(["en", "es", "fr", "de", "it", "ar", "hi"], label="Language", value="en") |
|
clone_generate_btn = gr.Button("Generate") |
|
clone_output = gr.Audio(label="Generated Audio") |
|
|
|
clone_generate_btn.click( |
|
clone_voice, |
|
inputs=[clone_text, clone_audio, clone_language], |
|
outputs=clone_output |
|
) |
|
|
|
|
|
demo.launch() |
|
|
|
|
|
for file in os.listdir(): |
|
if file.endswith('.wav') and file.startswith('tmp'): |
|
os.remove(file) |