import os
# Enable the MCP server
os.environ['GRADIO_MCP_SERVER'] = 'True'
import gradio as gr
import torchaudio
import torch
from pydub import AudioSegment, effects
import uuid
import subprocess
import time
import nltk
from nltk.tokenize import sent_tokenize
from pathlib import Path
import sys
from pydub.silence import split_on_silence
import re
from unicodedata import normalize
import numpy as np
import spaces
from huggingface_hub import snapshot_download
import threading
import requests
import tempfile
# Download the NLTK resources
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
# Definition of problematic characters by language
PROBLEMATIC_CHARS = {
    'global': ['&', '%', '@', '#', '$', '*', '+', '=', '(', ')', '[', ']', '{', '}', '<', '>', '|', '/', '\\', '"', '…', '«', '»', '\u201c', '\u201d', '\u2018', '\u2019'],
    'fr': ['&', '%', '@', '#', '$', '*', '+', '=', 'etc.'],
    'en': ['&', '%', '@', '#', '$', '*', '+', '=', 'etc.'],
    # Add specific characters for each language as needed
}
# Replacement rules by language
REPLACEMENT_RULES = {
    'global': {
        '&': {'fr': ' et ', 'en': ' and ', 'es': ' y ', 'de': ' und ', 'it': ' e ', 'pt': ' e ', 'default': ' and '},
        '%': {'fr': ' pourcent ', 'en': ' percent ', 'de': ' prozent ', 'default': ' percent '},
        '@': {'fr': ' arobase ', 'en': ' at ', 'default': ' at '},
        '#': {'fr': ' hashtag ', 'en': ' hashtag ', 'default': ' hashtag '},
        '...': {'default': ', '},
        '…': {'default': ', '},
        '"': {'default': ''},
        "'": {'default': ''},
        '«': {'default': ''},
        '»': {'default': ''},
        '\u201c': {'default': ''},  # left curly double quote
        '\u201d': {'default': ''},  # right curly double quote
        '\u2018': {'default': ''},  # left curly single quote (curly apostrophe)
        '\u2019': {'default': ''},  # right curly single quote (curly apostrophe)
    },
    # You can add language-specific rules
}
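# Illustrative lookup, mirroring the logic used in preprocess_text below
# (not executed at import): the language-specific entry wins, otherwise 'default':
#   REPLACEMENT_RULES['global']['&'].get('fr', REPLACEMENT_RULES['global']['&']['default'])  -> ' et '
#   REPLACEMENT_RULES['global']['&'].get('ru', REPLACEMENT_RULES['global']['&']['default'])  -> ' and '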
def analyze_text(text, language_code):
    """Analyze text to detect potential pronunciation issues for voice synthesis.
    This function examines text for problematic characters, special symbols, URLs,
    numbers, and other elements that might affect speech quality in voice cloning.
    Args:
        text: The text to analyze for speech synthesis compatibility
        language_code: Language code (en, fr, es, de, it, pt, pl, tr, ru, nl, cs, ar, zh, hu, ko, ja, hi)
    Returns:
        Dictionary containing detected issues and suggestions for improvement
    """
    issues = []
    # Basic unicode normalization
    normalized_text = normalize('NFC', text)
    # Emoji detection ('re' is already imported at module level)
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE
    )
    emojis = emoji_pattern.findall(text)
    if emojis:
        issues.append({
            'type': 'emojis',
            'description': 'Emojis that will be removed during preprocessing',
            'instances': emojis,
            'suggestion': 'Emojis are replaced with spaces for better pronunciation'
        })
    # URL detection
    urls = re.findall(r'https?://\S+|www\.\S+', text)
    if urls:
        issues.append({
            'type': 'url',
            'description': 'Detected URLs that may be mispronounced',
            'instances': urls,
            'suggestion': 'Replace URLs with textual descriptions'
        })
    # Email detection
    emails = re.findall(r'\S+@\S+\.\S+', text)
    if emails:
        issues.append({
            'type': 'email',
            'description': 'Detected email addresses that may be mispronounced',
            'instances': emails,
            'suggestion': 'Replace emails with descriptive text'
        })
    # Detection of quote and citation characters (apostrophes are excluded entirely)
    quote_chars = ['"', '«', '»', '\u201c', '\u201d', '\u2018', '\u2019']
    found_quotes = []
    # For English, don't report quotes or apostrophes as problematic at all
    if language_code == 'en':
        pass
    else:
        # Look only for quote characters, not apostrophes
        for char in quote_chars:
            if char in text:
                found_quotes.append(char)
    if found_quotes:
        issues.append({
            'type': 'quotes',
            'description': 'Quotes and citation characters that may affect pronunciation',
            'instances': found_quotes,
            'suggestion': 'Remove quotes and citation characters for better pronunciation'
        })
    # Detection of problematic characters (apostrophes excluded)
    global_chars = [c for c in PROBLEMATIC_CHARS.get('global', []) if c != "'"]
    lang_specific_chars = PROBLEMATIC_CHARS.get(language_code, [])
    all_problematic_chars = set(global_chars + lang_specific_chars) - set(quote_chars)  # Exclude quotes already treated
    found_chars = []
    for char in all_problematic_chars:
        if char in text:
            found_chars.append(char)
    if found_chars:
        issues.append({
            'type': 'special_chars',
            'description': 'Special characters that may cause pronunciation problems',
            'instances': found_chars,
            'suggestion': 'Replace special characters with their textual equivalent'
        })
    # Detection of long numbers (more than 3 digits)
    numbers = re.findall(r'\b\d{4,}\b', text)
    if numbers:
        suggestion = "Write numbers in full"
        if language_code == 'fr':
            suggestion += " or add spaces between thousands (e.g., 10 000)"
        elif language_code == 'en':
            suggestion += " or use commas for thousands (e.g., 10,000)"
        issues.append({
            'type': 'numbers',
            'description': 'Long numbers that may be mispronounced',
            'instances': numbers,
            'suggestion': suggestion
        })
    # Detection of Roman numerals, with an exception for the pronoun "I" in English
    if language_code == 'en':
        # In English, exclude "I" as a Roman numeral because it's a personal pronoun
        roman_pattern = r'\b(?!I\b)[IVXLCDM]+\b'
    else:
        # For other languages, keep normal detection
        roman_pattern = r'\b[IVXLCDM]+\b'
    roman_numerals = re.findall(roman_pattern, text)
    if roman_numerals:
        issues.append({
            'type': 'roman_numerals',
            'description': 'Roman numerals that may be mispronounced',
            'instances': roman_numerals,
            'suggestion': 'Replace Roman numerals with Arabic numbers'
        })
    # Detection of abbreviations by language
    abbreviation_patterns = {
        'fr': [r'\bM\.\s', r'\bMme\.\s', r'\bMlle\.\s', r'\bDr\.\s', r'\bProf\.\s', r'\betc\.\s', r'\bex\.\s'],
        'en': [r'\bMr\.\s', r'\bMrs\.\s', r'\bDr\.\s', r'\bProf\.\s', r'\betc\.\s', r'\be\.g\.\s', r'\bi\.e\.\s'],
        'es': [r'\bSr\.\s', r'\bSra\.\s', r'\bDr\.\s', r'\betc\.\s'],
        'default': [r'\b[A-Z]\.\s', r'\b[A-Z][a-z]+\.\s']
    }
    patterns = abbreviation_patterns.get(language_code, abbreviation_patterns['default'])
    found_abbrevs = []
    for pattern in patterns:
        matches = re.findall(pattern, text)
        found_abbrevs.extend(matches)
    if found_abbrevs:
        issues.append({
            'type': 'abbreviations',
            'description': 'Detected abbreviations that may be mispronounced',
            'instances': found_abbrevs,
            'suggestion': 'Write abbreviations in full'
        })
    # Detection of repeated punctuation
    repeated_punct = re.findall(r'([!?.,;:]{2,})', text)
    if repeated_punct:
        issues.append({
            'type': 'repeated_punct',
            'description': 'Repeated punctuation that may cause incorrect pauses',
            'instances': repeated_punct,
            'suggestion': 'Simplify punctuation (use only one character)'
        })
    # Detection of missing spaces around punctuation, excluding decimal numbers
    missing_spaces = []
    # Specific patterns to look for
    patterns = [
        r'[a-zA-ZÀ-ÿ][,.;:!?][a-zA-ZÀ-ÿ]'  # letter + punctuation + letter
    ]
    # Skip this check for English to avoid flagging contractions (I'm, don't, isn't, etc.)
    if language_code != 'en':
        for pattern in patterns:
            matches = re.findall(pattern, text)
            if matches:
                missing_spaces.extend(matches)
    if missing_spaces:
        issues.append({
            'type': 'missing_spaces',
            'description': 'Punctuation without spaces that may affect pronunciation',
            'instances': missing_spaces,
            'suggestion': 'Add appropriate spaces around punctuation (except for decimal numbers)'
        })
    # Detection of language-specific issues
    if language_code == 'fr':
        # Poorly formatted ordinal numbers in French
        ordinals = re.findall(r'\b\d+(eme|ème|er|ere|ère)\b', text)
        if ordinals:
            issues.append({
                'type': 'fr_ordinals',
                'description': 'Ordinal numbers that may be mispronounced',
                'instances': ordinals,
                'suggestion': 'Write ordinals in letters (premier, deuxième, etc.)'
            })
    elif language_code == 'en':
        # English-specific issues
        dates = re.findall(r'\b\d{1,2}/\d{1,2}/\d{2,4}\b', text)
        if dates:
            issues.append({
                'type': 'en_dates',
                'description': 'Dates in numeric format that may be misinterpreted',
                'instances': dates,
                'suggestion': 'Write dates in full (e.g., January 1st, 2022)'
            })
    return {
        'issues': issues,
        'has_issues': len(issues) > 0,
        'normalized_text': normalized_text
    }
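# Illustrative usage (not executed at import):
#   result = analyze_text("Visit https://example.com!!", "en")
#   result['has_issues']                   -> True
#   [i['type'] for i in result['issues']]  -> includes 'url' and 'repeated_punct'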
# Function to convert numbers to text
def number_to_text_fr(number_str):
    """
    Converts a number (integer or decimal) to French text.
    Args:
        number_str (str): The number to convert to text format
    Returns:
        str: The number written out in words
    """
    parts = number_str.replace(',', '.').split('.')
    # Convert an integer (given as a string) to French words.
    # Reliable for 0-999; larger values fall back to a "hundreds" reading
    # (e.g. 1234 -> "douze cent trente-quatre"), which is colloquial French.
    def int_to_text(n):
        if n == '0':
            return 'zéro'
        units = ['', 'un', 'deux', 'trois', 'quatre', 'cinq', 'six', 'sept', 'huit', 'neuf']
        teens = ['dix', 'onze', 'douze', 'treize', 'quatorze', 'quinze', 'seize', 'dix-sept', 'dix-huit', 'dix-neuf']
        # 'soixante' and 'quatre-vingt' are repeated on purpose: French 70-79
        # and 90-99 are built as 60+teens and 80+teens respectively
        tens = ['', 'dix', 'vingt', 'trente', 'quarante', 'cinquante', 'soixante', 'soixante', 'quatre-vingt', 'quatre-vingt']
        n = int(n)
        if n < 10:
            return units[n]
        elif n < 20:
            return teens[n - 10]
        elif n < 70:
            div, mod = divmod(n, 10)
            return tens[div] + ('-et-un' if mod == 1 else ('-' + units[mod] if mod else ''))
        elif n < 80:
            div, mod = divmod(n, 10)
            # 70-79 use "soixante" + teens; teens[-10] is 'dix', so 70 -> "soixante-dix"
            return tens[div] + '-' + teens[mod - 10]
        elif n < 90:
            div, mod = divmod(n, 10)
            return tens[div] + (('-' + units[mod]) if mod else 's')
        elif n < 100:
            div, mod = divmod(n, 10)
            # 90-99 use "quatre-vingt" + teens, so 90 -> "quatre-vingt-dix"
            return tens[div] + '-' + teens[mod - 10]
        else:
            if n < 200:
                return 'cent' + (' ' + int_to_text(n % 100) if n % 100 else '')
            else:
                div, mod = divmod(n, 100)
                return int_to_text(div) + ' cent' + ('s' if div > 1 and mod == 0 else '') + (' ' + int_to_text(mod) if mod else '')
    # Process the integer part
    integer_part = int_to_text(parts[0])
    # If there's a decimal part
    if len(parts) > 1 and parts[1]:
        decimal_part = parts[1]
        # A decimal part of 1 or 2 digits is read as a whole number
        if len(decimal_part) <= 2:
            decimal_text = int_to_text(decimal_part)
            # For 01, 02, etc., read only the significant digit ("un", "deux")
            # rather than "zéro un", "zéro deux"
            if len(decimal_part) == 2 and decimal_part[0] == '0':
                decimal_text = int_to_text(decimal_part[1])
            return f"{integer_part} virgule {decimal_text}"
        else:
            # For more than 2 digits, pronounce each digit separately
            decimal_text = ' '.join(int_to_text(d) for d in decimal_part)
            return f"{integer_part} virgule {decimal_text}"
    return integer_part
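# Illustrative outputs (not executed at import):
#   number_to_text_fr("42")    -> "quarante-deux"
#   number_to_text_fr("90")    -> "quatre-vingt-dix"
#   number_to_text_fr("2,95")  -> "deux virgule quatre-vingt-quinze"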
def preprocess_text(text, language_code, apply_replacements=True):
    """Preprocess and clean text for optimal voice synthesis results.
    This function automatically fixes common text issues like special characters,
    numbers, URLs, and language-specific elements to improve speech quality.
    Args:
        text: The text to preprocess for voice synthesis
        language_code: Language code (en, fr, es, de, it, pt, pl, tr, ru, nl, cs, ar, zh, hu, ko, ja, hi)
        apply_replacements: If True, applies automatic character replacements for better pronunciation
    Returns:
        The preprocessed text ready for high-quality voice synthesis
    """
    # Unicode normalization
    text = normalize('NFC', text)
    if apply_replacements:
        # Detection and removal of emojis and special Unicode symbols
        # ('re' is already imported at module level)
        emoji_pattern = re.compile(
            "["
            "\U0001F600-\U0001F64F"  # emoticons
            "\U0001F300-\U0001F5FF"  # symbols & pictographs
            "\U0001F680-\U0001F6FF"  # transport & map symbols
            "\U0001F700-\U0001F77F"  # alchemical symbols
            "\U0001F780-\U0001F7FF"  # Geometric Shapes
            "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
            "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
            "\U0001FA00-\U0001FA6F"  # Chess Symbols
            "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
            "\U00002702-\U000027B0"  # Dingbats
            "\U000024C2-\U0001F251"
            "]+", flags=re.UNICODE
        )
        # Replace emojis with a space
        text = emoji_pattern.sub(' ', text)
        # Apply global replacement rules
        for char, replacements in REPLACEMENT_RULES.get('global', {}).items():
            if char in text:
                # Use the language-specific rule if available, otherwise the default rule
                replacement = replacements.get(language_code, replacements.get('default', char))
                text = text.replace(char, replacement)
        # Transform URLs and emails
        text = re.sub(r'https?://\S+|www\.\S+', ' URL link ', text)
        text = re.sub(r'\S+@\S+\.\S+', ' email address ', text)
        # Process quotes (removal or replacement)
        # Straight quotes " and '
        text = text.replace('"', '')
        text = text.replace("'", '')
        # French quotes « and »
        text = text.replace('«', '')
        text = text.replace('»', '')
        # Smart typographic quotes (curly quotes)
        text = text.replace('\u201c', '')  # opening double quote
        text = text.replace('\u201d', '')  # closing double quote
        text = text.replace('\u2018', '')  # opening curly apostrophe
        text = text.replace('\u2019', '')  # closing curly apostrophe
        # Replace Roman numerals with their equivalent (if needed)
        if language_code in ['fr', 'en', 'es', 'it', 'pt']:
            roman_numerals = {
                'I': '1', 'II': '2', 'III': '3', 'IV': '4', 'V': '5',
                'VI': '6', 'VII': '7', 'VIII': '8', 'IX': '9', 'X': '10',
                'XI': '11', 'XII': '12', 'XIII': '13', 'XIV': '14', 'XV': '15',
                'XVI': '16', 'XVII': '17', 'XVIII': '18', 'XIX': '19', 'XX': '20'
            }
            # Exception for the personal pronoun "I" in English
            if language_code == 'en':
                for roman, arabic in roman_numerals.items():
                    if roman == 'I':
                        # A standalone "I" is almost always the pronoun, so it is
                        # left unchanged; only "I." (a numeric context) becomes "1."
                        text = re.sub(r'\b(I)\.', arabic + '.', text)  # I. => 1.
                    else:
                        # For other Roman numerals, standard behavior
                        text = re.sub(fr'\b{roman}\b', arabic, text)
            else:
                # For other languages, replace all Roman numerals
                for roman, arabic in roman_numerals.items():
                    text = re.sub(fr'\b{roman}\b', arabic, text)
        # Language-specific processing for French
        if language_code == 'fr':
            # Replace common ordinals
            text = re.sub(r'\b1er\b', 'premier', text)
            text = re.sub(r'\b1ère\b', 'première', text)
            text = re.sub(r'\b(\d+)(ème)\b', r'\1 ième', text)
            # Improved processing of decimal numbers and percentages in French
            # Look for patterns like "2,95 %" or "2,95%"
            def replace_decimal_percent(match):
                num = match.group(1)
                return number_to_text_fr(num) + " pour cent"
            # Decimal numbers followed by % (with or without a space)
            text = re.sub(r'(\d+,\d+)\s*%', replace_decimal_percent, text)
            # Process decimal numbers without a percentage
            def replace_decimal(match):
                return number_to_text_fr(match.group(0))
            # Decimal numbers (with a comma)
            text = re.sub(r'\b\d+,\d+\b', replace_decimal, text)
            # Process simple percentages
            text = re.sub(r'(\d+)\s*%', lambda m: number_to_text_fr(m.group(1)) + " pour cent", text)
            # Apply French typographical rules for punctuation:
            # - No space before, space after: . , )
            # - Space before and after: : ; ! ? « »
            # First, normalize by removing all spaces around punctuation
            text = re.sub(r'\s*([.,;:!?\[\]\(\)\{\}])\s*', r'\1', text)
            # Then add spaces according to French rules
            # Punctuation with a space after only
            text = re.sub(r'([.,)])', r'\1 ', text)
            # Punctuation with a space before and after
            text = re.sub(r'([;:!?])', r' \1 ', text)
            # Special case for French quotes
            text = re.sub(r'«', r'« ', text)
            text = re.sub(r'»', r' »', text)
        # Language-specific processing for English
        elif language_code == 'en':
            # Replace the most common ordinals; other "Nth" ordinals are left as digits
            text = re.sub(r'\b1st\b', 'first', text)
            text = re.sub(r'\b2nd\b', 'second', text)
            text = re.sub(r'\b3rd\b', 'third', text)
            # Process percentages in English (decimals with a point)
            text = re.sub(r'(\d+\.\d+)%', r'\1 percent', text)
            text = re.sub(r'(\d+)%', r'\1 percent', text)
            # English typographical rules: no space before punctuation, space after
            text = re.sub(r'\s*([.,;:!?])\s*', r'\1 ', text)
        # For other languages, general rule: no space before, space after punctuation
        else:
            text = re.sub(r'\s*([.,;:!?])\s*', r'\1 ', text)
    # Clean up multiple spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text
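# Illustrative behaviour (assuming the rules defined above; not executed at import):
#   preprocess_text("Tom & Jerry", "en")  -> "Tom and Jerry"
#   preprocess_text("Prix : 2,95", "fr")  -> "Prix : deux virgule quatre-vingt-quinze"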
def format_issues_for_display(analysis_result, language_code, tokenizer_analysis=None):
    """
    Formats detected issues for display in the interface.
    Args:
        analysis_result (dict): Result of the text analysis
        language_code (str): Language code
        tokenizer_analysis (dict): Result of tokenizer analysis (optional)
    Returns:
        str: Formatted text for display
    """
    if not analysis_result['has_issues'] and (tokenizer_analysis is None or not tokenizer_analysis['has_issues']):
        return "✅ No issues detected in the text."
    formatted_text = "⚠️ Potential issues detected:\n\n"
    # Format standard text analysis issues
    if analysis_result['has_issues']:
        formatted_text += "📊 Text analysis results:\n"
        for issue in analysis_result['issues']:
            formatted_text += f"- {issue['description']}:\n"
            formatted_text += f"  • Detected: {', '.join(repr(i) for i in issue['instances'])}\n"
            formatted_text += f"  • Suggestion: {issue['suggestion']}\n\n"
    # Format tokenizer analysis issues (if available)
    if tokenizer_analysis and tokenizer_analysis['has_issues']:
        formatted_text += "\n🔍 Tokenizer analysis results:\n"
        for issue in tokenizer_analysis['issues']:
            formatted_text += f"- {issue['description']}:\n"
            formatted_text += f"  • Detected: {', '.join(repr(i) for i in issue['instances'])}\n"
            formatted_text += f"  • Suggestion: {issue['suggestion']}\n\n"
        if 'cleaned_text' in tokenizer_analysis:
            formatted_text += "\n📝 Cleaned text by the XTTS tokenizer:\n"
            formatted_text += f"{tokenizer_analysis['cleaned_text']}\n\n"
    formatted_text += "\nEnable text preprocessing to automatically fix some of these issues."
    return formatted_text
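# Illustrative usage (not executed at import):
#   format_issues_for_display({'issues': [], 'has_issues': False, 'normalized_text': ''}, 'en')
#   -> "✅ No issues detected in the text."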
repo_id = "XTTS-v2"
# Download the model only if it is not already present
if not os.path.exists(repo_id) or not os.path.exists(os.path.join(repo_id, "config.json")):
    try:
        print("Downloading the XTTS-v2 model...")
        snapshot_download(
            repo_id="coqui/XTTS-v2",
            local_dir=repo_id,
            allow_patterns=["*.safetensors", "*.wav", "*.json", "*.pth"]
        )
        print("Model downloaded successfully!")
    except Exception as e:
        print(f"Error during download: {e}")
        print("Trying with git clone...")
        try:
            result = subprocess.run(
                ["git", "clone", "https://huggingface.co/coqui/XTTS-v2", repo_id],
                capture_output=True,
                text=True
            )
            if result.returncode == 0:
                print("Model downloaded with git clone!")
            else:
                print(f"git clone error: {result.stderr}")
                raise Exception("Unable to download the model")
        except Exception as git_error:
            print(f"git clone error: {git_error}")
            raise Exception("Please download the model manually with: git clone https://huggingface.co/coqui/XTTS-v2")
else:
    print("XTTS-v2 model already present.")
# Relative path management
BASE_DIR = Path(os.path.dirname(os.path.abspath(__file__)))
MODELS_DIR = repo_id  # BASE_DIR / "XTTS-v2"
REF_AUDIO_DIR = BASE_DIR / "ref_audio_files"
OUTPUT_DIR = BASE_DIR / "outputs"
TEMP_DIR = OUTPUT_DIR / "temp"
# Create necessary folders
REF_AUDIO_DIR.mkdir(exist_ok=True)
OUTPUT_DIR.mkdir(exist_ok=True)
TEMP_DIR.mkdir(exist_ok=True)
# Languages supported by XTTS
SUPPORTED_LANGUAGES = {
    "English": "en",
    "French": "fr",
    "Spanish": "es",
    "German": "de",
    "Italian": "it",
    "Portuguese": "pt",
    "Polish": "pl",
    "Turkish": "tr",
    "Russian": "ru",
    "Dutch": "nl",
    "Czech": "cs",
    "Arabic": "ar",
    "Chinese": "zh-cn",
    "Japanese": "ja",
    "Korean": "ko",
    "Hungarian": "hu",
    "Hindi": "hi"
}
print(f"Initializing model from: {MODELS_DIR}") | |
# Clean temporary files | |
def cleanup_temp_files(): | |
"""Cleans temporary files in the TEMP_DIR folder""" | |
try: | |
for file in TEMP_DIR.glob("*"): | |
if file.is_file(): | |
os.remove(file) | |
except Exception as e: | |
print(f"Error while cleaning temporary files: {e}") | |
# Clean old generated MP3 files (optional) | |
def cleanup_old_outputs(max_age_days=7): | |
"""Deletes MP3 files older than max_age_days in the OUTPUT_DIR folder""" | |
try: | |
now = time.time() | |
for file in OUTPUT_DIR.glob("*.mp3"): | |
if file.is_file(): | |
# If the file is older than max_age_days | |
if os.path.getmtime(file) < now - (max_age_days * 86400): | |
os.remove(file) | |
except Exception as e: | |
print("error cleanup old outputs") | |
# Import XTTS modules
try:
    from TTS.tts.configs.xtts_config import XttsConfig
    from TTS.tts.models.xtts import Xtts
except ImportError as e:
    print(f"TTS import error: {e}")
    print("Please install dependencies with: pip install coqui-tts")
    sys.exit(1)
# Install language-specific dependencies
def install_language_dependencies():
    """Check and install required dependencies for Asian languages"""
    try:
        # For Chinese (zh-cn)
        try:
            import pypinyin
        except ImportError:
            subprocess.check_call([sys.executable, "-m", "pip", "install", "pypinyin"])
        # For Japanese (ja)
        try:
            import cutlet
            # Check whether fugashi and MeCab are also installed
            try:
                import fugashi
            except ImportError:
                subprocess.check_call([sys.executable, "-m", "pip", "install", "fugashi", "mecab-python3", "unidic-lite"])
        except ImportError:
            subprocess.check_call([sys.executable, "-m", "pip", "install", "cutlet", "fugashi", "mecab-python3", "unidic-lite"])
        # For Korean (ko)
        try:
            import hangul_romanize
        except ImportError:
            subprocess.check_call([sys.executable, "-m", "pip", "install", "hangul-romanize"])
        return True
    except Exception as e:
        print(f"Failed to install language dependencies: {e}")
        return False
# Model initialization and configuration
try:
    # Try to install language dependencies
    install_language_dependencies()
    config = XttsConfig()
    config.load_json(os.path.join(MODELS_DIR, "config.json"))
    model = Xtts.init_from_config(config)
    # model.load_safetensors_checkpoint(
    #     config, checkpoint_dir=MODELS_DIR, use_deepspeed=False
    # )
    model.load_checkpoint(config, checkpoint_dir=str(MODELS_DIR), eval=True)
    if torch.cuda.is_available():
        model.cuda()
        print("Model loaded on GPU")
    else:
        print("GPU not available, using CPU")
except Exception as e:
    print(f"Error loading model: {e}")
    print(f"Make sure the XTTS-v2 model is present in: {MODELS_DIR}")
    sys.exit(1)
def remove_silence(
    audio_segment,
    silence_thresh=-45,
    min_silence_len=300,
    keep_silence=100
):
    """
    Optimized: splits audio_segment around silences, then rebuilds the audio
    with the silences removed. Adjust silence_thresh and min_silence_len
    according to the loudness of your audio.
    """
    # Skip audio that is too short, to avoid problems
    if len(audio_segment) < 1000:  # less than one second
        return audio_segment
    # First attempt with the provided parameters
    chunks = split_on_silence(
        audio_segment,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
        keep_silence=keep_silence
    )
    # If no segments (or very few) are detected, relax the parameters
    if not chunks or len(chunks) < 2:
        chunks = split_on_silence(
            audio_segment,
            min_silence_len=200,  # shorter, to detect briefer silences
            silence_thresh=silence_thresh + 5,  # raise the threshold (less negative) to detect more silences
            keep_silence=keep_silence
        )
    # Recombine all the non-silent pieces
    if chunks:
        processed_audio = AudioSegment.empty()
        for chunk in chunks:
            processed_audio += chunk
        # Check that the audio has not been shortened too much
        length_ratio = len(processed_audio) / len(audio_segment)
        if length_ratio < 0.7:  # more than 30% was removed
            # Fall back to a less aggressive pass
            chunks = split_on_silence(
                audio_segment,
                min_silence_len=min_silence_len * 2,  # longer, detects fewer silences
                silence_thresh=silence_thresh - 5,  # stricter (more negative)
                keep_silence=keep_silence * 2  # keep more silence
            )
            if chunks:
                processed_audio = AudioSegment.empty()
                for chunk in chunks:
                    processed_audio += chunk
            else:
                return audio_segment
        return processed_audio
    else:
        # If the whole audio is considered silence, return the original
        return audio_segment
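# Illustrative usage (the file name is hypothetical; not executed at import):
#   seg = AudioSegment.from_wav("speech.wav")
#   trimmed = remove_silence(seg, silence_thresh=-45, min_silence_len=300, keep_silence=100)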
def chunk_sentence_by_words(sentence, max_length=200):
    """
    Splits a sentence into sub-chunks (max. max_length characters)
    without cutting in the middle of a word.
    Optimized for performance.
    """
    # If the sentence is already short enough, return it directly
    if len(sentence) <= max_length:
        return [sentence]
    words = sentence.split()  # split by words
    sub_chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        # If adding this word would exceed the max length, start a new chunk
        word_len = len(word) + (1 if current_length > 0 else 0)  # +1 for the space
        if current_length + word_len > max_length:
            if current_chunk:  # make sure there is something to add
                sub_chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_length = 0
            # Handle individual words that are longer than max_length
            if len(word) > max_length:
                sub_chunks.append(word)
                continue
        # Add the word to the current chunk
        current_chunk.append(word)
        current_length += word_len
    # Add the last chunk if anything remains
    if current_chunk:
        sub_chunks.append(" ".join(current_chunk))
    return sub_chunks
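# Illustrative behaviour (not executed at import):
#   chunk_sentence_by_words("one two three four", max_length=9)
#   -> ["one two", "three", "four"]   (words are never split mid-word)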
def split_text(text, max_length=150):
    """
    - Splits 'text' into sentences (via sent_tokenize).
    - If a sentence exceeds max_length, splits it word by word
      using chunk_sentence_by_words.
    - Returns a list of chunks, each <= max_length characters.
    Optimized for performance.
    """
    # Check that the text is not empty
    if not text.strip():
        return []
    # Sentence splitting with improved error handling
    try:
        raw_sentences = sent_tokenize(text)
        if not raw_sentences:
            raw_sentences = [text]
    except Exception:
        # On error, fall back to a simple split on periods
        raw_sentences = [s.strip() + '.' for s in text.split('.') if s.strip()]
        if not raw_sentences:
            raw_sentences = [text]
    # Initialize the final list of chunks
    final_chunks = []
    # Process each sentence
    for sentence in raw_sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        # If the whole sentence is short, add it directly
        if len(sentence) <= max_length:
            final_chunks.append(sentence)
        else:
            # Otherwise, split it into sub-chunks
            sub_chunks = chunk_sentence_by_words(sentence, max_length)
            final_chunks.extend(sub_chunks)
    # Make sure we have chunks; as a last resort, cut the raw text into fixed-size pieces
    if not final_chunks:
        for i in range(0, len(text), max_length):
            chunk = text[i:i + max_length]
            if chunk.strip():  # do not add empty segments
                final_chunks.append(chunk)
    return final_chunks
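# Illustrative behaviour (assumes the NLTK 'punkt' data downloaded above; not executed at import):
#   split_text("Short sentence. " + "word " * 60, max_length=150)
#   -> ["Short sentence.", <one or more chunks of at most 150 characters>]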
def check_language_dependencies(language):
    """
    Checks the dependencies required for a given language.
    This function runs on CPU.
    Args:
        language (str): Language code to check
    Returns:
        tuple: (None, None) if everything is fine, or (None, error_message) if there is a problem
    """
    # Language-specific dependencies
    language_dependencies = {
        "zh-cn": "pypinyin",
        "ja": "cutlet,fugashi,unidic-lite",
        "ko": "hangul-romanize",
    }
    if language in language_dependencies:
        try:
            # Try to import the dependency dynamically
            import importlib
            if language == "zh-cn":
                importlib.import_module("pypinyin")
            elif language == "ja":
                importlib.import_module("cutlet")
                # Check the additional dependencies for Japanese
                try:
                    importlib.import_module("fugashi")
                except ImportError:
                    raise ImportError("Japanese requires: fugashi and unidic-lite")
                # Check whether unidic-lite is installed
                try:
                    import unidic_lite
                except ImportError:
                    raise ImportError("Japanese requires: unidic-lite")
            elif language == "ko":
                importlib.import_module("hangul_romanize")
        except ImportError as e:
            dependency = language_dependencies[language]
            language_name = {
                "zh-cn": "Chinese",
                "ja": "Japanese",
                "ko": "Korean"
            }[language]
            # Custom message for the Japanese dependencies
            if language == "ja" and "fugashi" in str(e):
                install_command = "pip install fugashi mecab-python3 unidic-lite"
                error_message = f"""
Error: Missing dependencies for {language_name} language.
Please run the following command to install the required packages:
{install_command}
Then restart the application.
"""
            else:
                install_command = f"pip install {dependency}"
                error_message = f"""
Error: Missing dependency for {language_name} language.
Please run the following command to install the required package:
{install_command}
Then restart the application.
"""
            return None, error_message
    return None, None
def synthesize_speech(
    text,
    language,
    temperature,
    speed,
    reference_audio,
    do_sample=True,
    repetition_penalty=1.0,
    length_penalty=1.0,
    gpt_cond_len=30,
    top_k=50,
    top_p=0.85,
    remove_silence_enabled=True,
    silence_threshold=-45,
    min_silence_len=300,
    keep_silence=100,
    text_splitting_method="Native XTTS splitting",
    max_chars_per_segment=250,
    enable_preprocessing=True
):
    """Generate speech from text by orchestrating preprocessing, synthesis, and post-processing.
    This function acts as the main pipeline for TTS generation. It takes raw text and parameters,
    handles dependencies, preprocesses text, generates a raw audio waveform using the XTTS model,
    and then post-processes the audio (normalization, silence removal) to produce a final MP3 file.
    Args:
        text (str): The text to convert to speech.
        language (str): Language code for synthesis (e.g., 'en', 'fr').
        temperature (float): Controls randomness in generation (0.1-1.5, recommended: 0.75).
        speed (float): Speech speed multiplier (0.5-2.0, 1.0 = normal speed).
        reference_audio (str): File path or URL to reference audio for voice cloning.
        do_sample (bool): Enable sampling for more natural speech variation.
        repetition_penalty (float): Penalty for repetitive speech (1.0-5.0, recommended: 5.0).
        length_penalty (float): Penalty affecting speech length (1.0-2.0, recommended: 1.0).
        gpt_cond_len (int): Conditioning length for GPT model (10-50, recommended: 30).
        top_k (int): Top-K sampling parameter (0-50, 0 = disabled).
        top_p (float): Top-P sampling parameter (0.0-1.0, 0 = disabled).
        remove_silence_enabled (bool): Remove silent parts from generated audio.
        silence_threshold (int): dB threshold for silence detection (-60 to -20).
        min_silence_len (int): Minimum silence length in ms to detect (300-1000).
        keep_silence (int): Amount of silence to keep in ms (100-500).
        text_splitting_method (str): Method for splitting long text.
        max_chars_per_segment (int): Maximum characters per segment for custom splitting.
        enable_preprocessing (bool): Automatically preprocess text for better pronunciation.
    Returns:
        tuple: (audio_file_path, error_message, preprocessed_text)
            - audio_file_path (str): Path to the generated MP3 audio file, or None on error.
            - error_message (str): A description of the error if one occurred, otherwise None.
            - preprocessed_text (str): The text after preprocessing has been applied.
    """
    # Part 1: Validation and Parameter Setup
    if not text.strip():
        return None, "Error: Text cannot be empty", text
    _, error_message = check_language_dependencies(language)
    if error_message:
        return None, error_message, text
    if top_k == 0:
        top_k = None
    if top_p == 0:
        top_p = None
    if temperature <= 0:
        temperature = 0.75
    if repetition_penalty <= 0:
        repetition_penalty = 5.0
    if length_penalty <= 0:
        length_penalty = 1.0
    reference_audio_path = reference_audio
    # Part 2: Text Preprocessing
    preprocessed_text = text
    if enable_preprocessing:
        preprocessed_text = preprocess_text(text, language)
        print(f"Preprocessed text: {preprocessed_text}")
    # Part 3: Waveform Generation (Core Synthesis)
    try:
        if text_splitting_method == "Custom splitting":
            text_chunks = split_text(preprocessed_text, max_length=max_chars_per_segment)
            print(f"Text split into {len(text_chunks)} segments (max {max_chars_per_segment} characters per segment)")
            if not text_chunks:
                return None, "Error: The text could not be split into segments", preprocessed_text
            outputs_wav_list = []
            for i, chunk in enumerate(text_chunks):
                print(f"Processing segment {i + 1}/{len(text_chunks)}: {chunk}")
                chunk_output = model.synthesize(
                    chunk, config, speaker_wav=reference_audio_path, language=language,
                    temperature=temperature, do_sample=do_sample, speed=speed,
                    enable_text_splitting=True, repetition_penalty=repetition_penalty,
                    length_penalty=length_penalty, gpt_cond_len=gpt_cond_len, top_k=top_k, top_p=top_p
                )
                outputs_wav_list.append(chunk_output["wav"])
            if outputs_wav_list:
                outputs_wav = np.concatenate(outputs_wav_list)
            else:
                return None, "Error: No audio segment could be generated", preprocessed_text
        else:
            # Native XTTS splitting is enabled by default for better AI agent compatibility
            use_native_splitting = True
            if text_splitting_method == "No splitting":
                use_native_splitting = False
                print("Native XTTS splitting disabled by user request")
            elif len(preprocessed_text) > 150:
                print("Long text detected: native XTTS splitting is enabled")
                use_native_splitting = True
            print(f"Generating with parameters: temperature={temperature}, do_sample={do_sample}, repetition_penalty={repetition_penalty}, length_penalty={length_penalty}, top_k={top_k}, top_p={top_p}, enable_text_splitting={use_native_splitting}")
            outputs = model.synthesize(
                preprocessed_text, config, speaker_wav=reference_audio_path, language=language,
                temperature=temperature, do_sample=do_sample, speed=speed,
                enable_text_splitting=use_native_splitting, repetition_penalty=repetition_penalty,
                length_penalty=length_penalty, gpt_cond_len=gpt_cond_len, top_k=top_k, top_p=top_p
            )
            outputs_wav = outputs["wav"]
    except Exception as e:
        error_message = f"Error during audio generation: {str(e)}"
        print(error_message)
        error_str = str(e)
        if "Chinese requires: pypinyin" in error_str:
            error_message = "Error: Missing pypinyin package for Chinese language support.\n\nPlease run: pip install pypinyin"
        elif "No module named 'cutlet'" in error_str:
            error_message = "Error: Missing cutlet package for Japanese language support.\n\nPlease run: pip install cutlet"
        elif "Japanese requires: fugashi" in error_str:
            error_message = "Error: Missing fugashi package for Japanese language support.\n\nPlease run: pip install fugashi mecab-python3 unidic-lite"
        elif "Japanese requires: unidic-lite" in error_str:
            error_message = "Error: Missing unidic-lite package for Japanese language support.\n\nPlease run: pip install unidic-lite"
        elif "Failed initializing MeCab" in error_str or "no such file or directory: /usr/local/etc/mecabrc" in error_str:
            error_message = """Error: MeCab initialization failed for Japanese language support.
Please run: pip install fugashi mecab-python3 unidic-lite
If the error persists, you may need to install MeCab dictionaries:
- For Ubuntu/Debian: sudo apt-get install mecab mecab-ipadic
- For macOS with Homebrew: brew install mecab mecab-ipadic
"""
        elif "Korean requires: hangul_romanize" in error_str:
            error_message = "Error: Missing hangul-romanize package for Korean language support.\n\nPlease run: pip install hangul-romanize"
        return None, error_message, preprocessed_text
    # Part 4: Audio Post-Processing
    try:
        temp_audio_path = str(TEMP_DIR / f"temp_chunk_audio_{uuid.uuid4()}.wav")
        torchaudio.save(temp_audio_path, torch.tensor(outputs_wav).unsqueeze(0), 24000)
        audio_segment = AudioSegment.from_wav(temp_audio_path)
        # Volume normalization, kept deliberately gentle
        target_dbfs = -18.0
        current_dbfs = audio_segment.dBFS
        if current_dbfs < -50:
            # Very quiet audio: boost towards the target, but cap the gain at +20 dB
            delta_db = min(target_dbfs - current_dbfs, 20.0)
            audio_segment = audio_segment.apply_gain(delta_db)
        else:
            delta_db = target_dbfs - current_dbfs
            audio_segment = audio_segment.apply_gain(delta_db)
        combined_audio = audio_segment
        # Silence removal, if enabled
        if remove_silence_enabled:
            padding = AudioSegment.silent(duration=500, frame_rate=combined_audio.frame_rate)
            padded_audio = padding + combined_audio + padding
            processed_audio = remove_silence(
                padded_audio,
                silence_thresh=silence_threshold,
                min_silence_len=min_silence_len,
                keep_silence=keep_silence
            )
            # If almost none of the added 1000 ms of padding was removed,
            # trim the remaining padding from both ends
            if len(processed_audio) > len(combined_audio) + 900:
                trim_length = min(500, len(processed_audio) // 10)
                combined_audio = processed_audio[trim_length:-trim_length]
            else:
                combined_audio = processed_audio
        timestamp = time.strftime("%Y%m%d-%H%M%S")
        final_output_path = str(TEMP_DIR / f"temp_output_{timestamp}_{uuid.uuid4()}.mp3")
        combined_audio.export(final_output_path, format="mp3", bitrate="192k")
        try:
            os.remove(temp_audio_path)
        except OSError:
            pass
        return final_output_path, None, preprocessed_text
    except Exception as e:
        error_message = f"Error during audio processing: {str(e)}"
        print(error_message)
        return None, error_message, preprocessed_text
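# Illustrative call (the reference path is hypothetical; not executed at import):
#   path, err, cleaned = synthesize_speech(
#       text="Hello world", language="en", temperature=0.75, speed=1.0,
#       reference_audio="ref_audio_files/audio_1.wav",
#   )
#   if err is None:
#       print(f"MP3 written to {path}")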
def download_audio_from_url(url):
    """Downloads an audio file from a URL and saves it to a temporary file."""
    try:
        if not url.startswith(('http://', 'https://')):
            raise ValueError("URL must start with http:// or https://")
        response = requests.get(url, stream=True, timeout=20)  # 20-second timeout
        response.raise_for_status()
        # Use a temporary file to store the audio
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
            for chunk in response.iter_content(chunk_size=8192):
                temp_audio.write(chunk)
            print(f"Audio downloaded from {url} to {temp_audio.name}")
            return temp_audio.name
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Failed to download audio from {url}: {e}")
        return None
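# Illustrative usage (the URL is hypothetical; not executed at import):
#   local_path = download_audio_from_url("https://example.com/voice.wav")
#   if local_path:
#       ...  # use it as reference audio, then os.remove(local_path)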
def voice_clone_synthesis(
    text: str,
    reference_audio_url: str = None,
    example_audio_name: str = None,
    language: str = "English",
    temperature: float = 0.75,
    speed: float = 1.0,
    do_sample: bool = True,
    repetition_penalty: float = 5.0,
    length_penalty: float = 1.0,
    gpt_cond_len: int = 30,
    top_k: int = 50,
    top_p: float = 0.85,
    remove_silence_enabled: bool = True,
    silence_threshold: int = -45,
    min_silence_len: int = 300,
    keep_silence: int = 100,
    text_splitting_method: str = "Native XTTS splitting",
    max_chars_per_segment: int = 250,
    enable_preprocessing: bool = False
):
    """
    🎤 Generates speech by cloning a voice from a reference audio URL.
    This tool takes text and a URL to a reference audio file, and synthesizes
    the text in the voice from the reference audio. It supports 17 languages
    and offers advanced control over the generation process.
    Args:
        text (str): The text to be synthesized. Required.
        reference_audio_url (str, optional): A public URL pointing to a reference audio file (WAV or MP3).
            Provide this OR example_audio_name, but not both.
        example_audio_name (str, optional): The name of a pre-defined example audio file.
            Valid choices: 'audio_1.wav', 'audio_2.wav', 'audio_3.wav', 'audio_4.wav', 'audio_5.wav',
            'guzel_ses.wav', 'guzel_ses_rapide.wav'. Provide this OR reference_audio_url, but not both.
        language (str): The language of the text. Defaults to "English".
            Supported languages: English, French, Spanish, German, Italian, Portuguese, Polish, Turkish,
            Russian, Dutch, Czech, Arabic, Chinese, Japanese, Korean, Hungarian, Hindi.
        temperature (float): Controls the randomness of the output. Higher values make it more random.
            Range: 0.1-1.5. Default: 0.75. Recommended: 0.75 for balanced output.
        speed (float): The speed of the generated speech.
            Range: 0.5-2.0. Default: 1.0. Example: 0.8 = slower, 1.2 = faster.
        do_sample (bool): Whether to use sampling for generation. Recommended: True. Default: True.
        repetition_penalty (float): Penalty for repeating words or phrases. IMPORTANT: Must be >= 1.0.
            Range: 1.0-5.0. Default: 5.0. Higher values reduce repetition. AI agents should use values like 1.1, 1.5, 2.0, 3.0, 4.0, 5.0.
        length_penalty (float): Penalty for sentence length. IMPORTANT: Must be >= 1.0.
            Range: 1.0-2.0. Default: 1.0. Higher values encourage shorter sentences.
        gpt_cond_len (int): Conditioning length for the GPT model.
            Range: 10-50. Default: 30. Higher values use more context.
        top_k (int): Top-K sampling parameter. 0 to disable top-k.
            Range: 0-50. Default: 50. Lower values make output more focused.
        top_p (float): Top-P (nucleus) sampling parameter. 0.0 to disable top-p.
            Range: 0.0-1.0. Default: 0.85. Lower values make output more focused.
        remove_silence_enabled (bool): Enable/disable automatic silence removal. Default: True.
        silence_threshold (int): Silence threshold in dB for silence detection.
            Range: -60 to -20. Default: -45. More negative = more sensitive to silence.
        min_silence_len (int): Minimum length of silence to be removed in milliseconds.
            Range: 300-1000. Default: 300.
        keep_silence (int): Amount of silence to keep at the beginning/end in milliseconds.
            Range: 100-500. Default: 100.
        text_splitting_method (str): Method for splitting text.
            Valid choices: 'Native XTTS splitting', 'Custom splitting', 'No splitting'.
            Default: 'Native XTTS splitting'. Recommended for most use cases.
        max_chars_per_segment (int): Max characters per segment when using 'Custom splitting'.
            Range: 50-400. Default: 250. Only relevant when text_splitting_method = 'Custom splitting'.
        enable_preprocessing (bool): Enable automatic text preprocessing to clean problematic characters.
            Default: False. Recommended: True for better pronunciation.
    Returns:
        str: A URL to the generated MP3 audio file.
    Examples:
        Basic usage with example audio:
            voice_clone_synthesis(
                text="Hello world!",
                example_audio_name="audio_1.wav",
                language="English"
            )
        Advanced usage with custom parameters:
            voice_clone_synthesis(
                text="Bonjour le monde!",
                example_audio_name="audio_2.wav",
                language="French",
                temperature=0.8,
                speed=1.1,
                repetition_penalty=2.0,  # Note: >= 1.0 required
                length_penalty=1.2,      # Note: >= 1.0 required
                enable_preprocessing=True
            )
    Raises:
        gr.Error: If parameters are out of range or invalid combinations are used.
    """
    # Validate and convert parameter types early for better AI agent feedback
    temperature = float(temperature)
    speed = float(speed)
    repetition_penalty = float(repetition_penalty)
    length_penalty = float(length_penalty)
    gpt_cond_len = int(gpt_cond_len)
    top_k = int(top_k)
    top_p = float(top_p)
    silence_threshold = int(silence_threshold)
    min_silence_len = int(min_silence_len)
    keep_silence = int(keep_silence)
    max_chars_per_segment = int(max_chars_per_segment)
    if not (0.1 <= temperature <= 1.5):
        raise gr.Error(f"Temperature must be between 0.1 and 1.5, got {temperature}")
    if not (0.5 <= speed <= 2.0):
        raise gr.Error(f"Speed must be between 0.5 and 2.0, got {speed}")
    if not (1.0 <= repetition_penalty <= 5.0):
        raise gr.Error(f"Repetition penalty must be between 1.0 and 5.0, got {repetition_penalty}")
    if not (1.0 <= length_penalty <= 2.0):
        raise gr.Error(f"Length penalty must be between 1.0 and 2.0, got {length_penalty}")
    if not (10 <= gpt_cond_len <= 50):
        raise gr.Error(f"GPT conditioning length must be between 10 and 50, got {gpt_cond_len}")
    if not (0 <= top_k <= 50):
        raise gr.Error(f"Top-K must be between 0 and 50, got {top_k}")
    if not (0.0 <= top_p <= 1.0):
        raise gr.Error(f"Top-P must be between 0.0 and 1.0, got {top_p}")
    if not (-60 <= silence_threshold <= -20):
        raise gr.Error(f"Silence threshold must be between -60 and -20 dB, got {silence_threshold}")
    if not (300 <= min_silence_len <= 1000):
        raise gr.Error(f"Minimum silence length must be between 300 and 1000 ms, got {min_silence_len}")
    if not (100 <= keep_silence <= 500):
        raise gr.Error(f"Keep silence must be between 100 and 500 ms, got {keep_silence}")
    if not (50 <= max_chars_per_segment <= 400):
        raise gr.Error(f"Max characters per segment must be between 50 and 400, got {max_chars_per_segment}")
    valid_splitting_methods = ["Native XTTS splitting", "Custom splitting", "No splitting"]
    if text_splitting_method not in valid_splitting_methods:
        raise gr.Error(f"Text splitting method must be one of {valid_splitting_methods}, got '{text_splitting_method}'")
    valid_example_audios = ["audio_1.wav", "audio_2.wav", "audio_3.wav", "audio_4.wav", "audio_5.wav", "guzel_ses.wav", "guzel_ses_rapide.wav"]
    if example_audio_name and example_audio_name not in valid_example_audios:
        raise gr.Error(f"Example audio name must be one of {valid_example_audios}, got '{example_audio_name}'")
    reference_audio_path = None
    downloaded_path = None  # Track the downloaded file for cleanup
    # Ensure only one reference type is provided
    if reference_audio_url and example_audio_name:
        raise gr.Error("Please provide either 'reference_audio_url' or 'example_audio_name', but not both.")
    if not reference_audio_url and not example_audio_name:
        raise gr.Error("You must provide either 'reference_audio_url' or 'example_audio_name'.")
    # Use the example audio if provided
    if example_audio_name:
        if example_audio_name not in file_path_mapping:
            available_files = ", ".join(files_display)
            raise gr.Error(f"Invalid example audio name. Available files are: {available_files}")
        reference_audio_path = file_path_mapping[example_audio_name]
        print(f"Using example audio: {reference_audio_path}")
    # Otherwise, download from the URL
    if reference_audio_url:
        print(f"Downloading reference audio from: {reference_audio_url}")
        downloaded_path = download_audio_from_url(reference_audio_url)
        if not downloaded_path:
            raise gr.Error("Failed to download or process the reference audio from the provided URL.")
        reference_audio_path = downloaded_path
    # Validate the selected audio file
    is_valid, error_message = validate_audio_file(reference_audio_path)
    if not is_valid:
        if downloaded_path and os.path.exists(downloaded_path):
            os.remove(downloaded_path)
        raise gr.Error(error_message)
    language_code = SUPPORTED_LANGUAGES.get(language)
    if not language_code:
        if downloaded_path and os.path.exists(downloaded_path):
            os.remove(downloaded_path)
        raise gr.Error(f"Language '{language}' is not supported.")
    audio_path, error, _ = synthesize_speech(
        text=text, language=language_code, temperature=temperature, speed=speed,
        reference_audio=reference_audio_path, do_sample=do_sample,
        repetition_penalty=repetition_penalty, length_penalty=length_penalty,
        gpt_cond_len=gpt_cond_len, top_k=top_k, top_p=top_p,
        remove_silence_enabled=remove_silence_enabled,
        silence_threshold=silence_threshold, min_silence_len=min_silence_len,
        keep_silence=keep_silence, text_splitting_method=text_splitting_method,
        max_chars_per_segment=max_chars_per_segment,
        enable_preprocessing=enable_preprocessing
    )
    # Clean up the downloaded file if it exists
    if downloaded_path and os.path.exists(downloaded_path):
        os.remove(downloaded_path)
    if error:
        raise gr.Error(error)
    return audio_path
def analyze_text_for_speech(text: str, language: str):
    """
    📊 Analyzes text for potential pronunciation and synthesis issues.
    This tool examines text for elements that could be mispronounced by the TTS model,
    such as special characters, numbers, URLs, and language-specific patterns.
    It provides a structured report of potential issues.
    Args:
        text (str): The text to analyze. Required.
        language (str): The language of the text. Required.
            Supported languages: English, French, Spanish, German, Italian, Portuguese, Polish, Turkish,
            Russian, Dutch, Czech, Arabic, Chinese, Japanese, Korean, Hungarian, Hindi.
            Note: Use exact language names (case-sensitive).
    Returns:
        dict: A dictionary containing the analysis results with these keys:
            - standard_analysis_issues: List of detected issues with descriptions and suggestions
            - has_issues: Boolean indicating if any issues were found
            - xtts_cleaned_text: Preprocessed version of the text ready for synthesis
    Example:
        analyze_text_for_speech(
            text="Hello! This costs $15.99 & includes free shipping.",
            language="English"
        )
    Raises:
        gr.Error: If the language is not supported.
    """
    language_code = SUPPORTED_LANGUAGES.get(language)
    if not language_code:
        raise gr.Error(f"Language '{language}' is not supported.")
    standard_analysis = analyze_text(text, language_code)
    # tokenizer_analysis = analyze_with_tokenizer(text, language_code)
    combined_issues = {
        "standard_analysis_issues": standard_analysis.get('issues', []),
        # "tokenizer_analysis_issues": tokenizer_analysis.get('issues', []),
        "has_issues": standard_analysis.get('has_issues', False),  # or tokenizer_analysis.get('has_issues', False)
        "xtts_cleaned_text": preprocess_text(text, language_code)  # tokenizer_analysis.get('cleaned_text', text)
    }
    return combined_issues
def preprocess_text_for_speech(text: str, language: str):
    """
    🔧 Preprocesses and cleans text for optimal speech synthesis.
    This tool applies a series of cleaning and normalization rules to the input text
    to improve its compatibility with the TTS model. This includes handling numbers,
    special characters, URLs, and applying language-specific typographical rules.
    Args:
        text (str): The text to preprocess. Required.
        language (str): The language of the text. Required.
            Supported languages: English, French, Spanish, German, Italian, Portuguese, Polish, Turkish,
            Russian, Dutch, Czech, Arabic, Chinese, Japanese, Korean, Hungarian, Hindi.
            Note: Use exact language names (case-sensitive).
    Returns:
        str: The cleaned and preprocessed text ready for speech synthesis.
    Example:
        preprocess_text_for_speech(
            text="Visit https://example.com & pay $25.50!",
            language="English"
        )
        # The URL is replaced with "URL link" and "&" becomes "and"
    Raises:
        gr.Error: If the language is not supported.
    """
    language_code = SUPPORTED_LANGUAGES.get(language)
    if not language_code:
        raise gr.Error(f"Language '{language}' is not supported.")
    return preprocess_text(text, language_code, apply_replacements=True)
# Example texts for different languages
EXAMPLE_TEXTS = {
    "fr": "Bonjour, je suis une voix générée par intelligence artificielle. Comment puis-je vous aider aujourd'hui?",
    "en": "Hello, I am a voice generated by artificial intelligence. How may I assist you today?",
    "es": "Hola, soy una voz generada por inteligencia artificial. ¿Cómo puedo ayudarte hoy?",
    "de": "Hallo, ich bin eine von künstlicher Intelligenz generierte Stimme. Wie kann ich Ihnen heute helfen?",
    "it": "Ciao, sono una voce generata dall'intelligenza artificiale. Come posso aiutarti oggi?",
    "pt": "Olá, sou uma voz gerada por inteligência artificial. Como posso ajudá-lo hoje?",
    "ar": "مرحبا، أنا صوت تم إنشاؤه بواسطة الذكاء الاصطناعي. كيف يمكنني مساعدتك اليوم؟",
    "zh-cn": "你好,我是由人工智能生成的声音。今天我能为您提供什么帮助?",
    "ja": "こんにちは、私は人工知能によって生成された音声です。今日はどのようにお手伝いできますか?",
    "ko": "안녕하세요, 저는 인공지능으로 생성된 목소리입니다. 오늘 어떻게 도와드릴까요?",
    "ru": "Здравствуйте, я голос, сгенерированный искусственным интеллектом. Чем я могу вам помочь сегодня?",
    "nl": "Hallo, ik ben een stem gegenereerd door kunstmatige intelligentie. Hoe kan ik u vandaag helpen?",
    "cs": "Dobrý den, jsem hlas vytvořený umělou inteligencí. Jak vám mohu dnes pomoci?",
    "pl": "Dzień dobry, jestem głosem wygenerowanym przez sztuczną inteligencję. Jak mogę ci dziś pomóc?",
    "tr": "Merhaba, ben yapay zeka tarafından oluşturulan bir sesim. Bugün size nasıl yardımcı olabilirim?",
    "hu": "Üdvözlöm, én egy mesterséges intelligencia által generált hang vagyok. Hogyan segíthetek ma?",
    "hi": "नमस्ते, मैं कृत्रिम बुद्धिमत्ता द्वारा उत्पन्न एक आवाज हूं। मैं आज आपकी कैसे मदद कर सकता हूं?"
}
# Function to analyze text with the XTTS tokenizer | |
def analyze_with_tokenizer(text, language_code): | |
""" | |
Analyzes text using the XTTS model's tokenizer to detect | |
parts that may be problematic for pronunciation. | |
Args: | |
text (str): The text to analyze | |
language_code (str): Language code (fr, en, etc.) | |
Returns: | |
dict: A dictionary containing detected issues and suggestions | |
""" | |
import torch | |
from TTS.tts.layers.xtts.tokenizer import multilingual_cleaners | |
issues = [] | |
original_text = text | |
try: | |
# 1. Run the same preprocessing as the XTTS model uses internally | |
cleaned_text = text | |
print(f"Using XTTS cleaners for language: {language_code}") | |
        # multilingual_cleaners(text, lang) is the normalization function XTTS
        # applies internally (lowercasing, number/abbreviation expansion, etc.).
        try:
            cleaned_text = multilingual_cleaners(text, language_code)
        except Exception:
            # If this language isn't handled by the cleaner, keep the original
            # text; the TTS model will apply its own default cleaning internally.
            print(f"No cleaner available for language {language_code}, using original text for analysis.")
            cleaned_text = text
# 2. Tokenize the text as XTTS would | |
# Compare the original and cleaned text to detect changes | |
if original_text != cleaned_text: | |
# Find the parts that have been modified | |
import difflib | |
# Create an object to compare the two texts | |
differ = difflib.Differ() | |
diff = list(differ.compare(original_text.split(), cleaned_text.split())) | |
# Find the words that have been removed or changed | |
modified_words = [] | |
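        # difflib.Differ prefixes: "- " = token only in the original text,
        # "+ " = token only in the cleaned text, "  " = common to both.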
for d in diff: | |
if d.startswith('- '): | |
word = d[2:] | |
if len(word) > 1: # Ignore individual characters | |
modified_words.append(word) | |
if modified_words: | |
issues.append({ | |
'type': 'tokenizer_changes', | |
'description': 'Words that might be mispronounced', | |
'instances': modified_words, | |
'suggestion': 'Consider reformulating these parts or using automatic preprocessing' | |
}) | |
        # 3. Checking for out-of-vocabulary (OOV) words would require access to
        # the XTTS tokenizer's vocabulary, which is not directly exposed here,
        # so this step is skipped.
# 4. Check for rare words that might be mispronounced | |
words = text.split() | |
long_words = [w for w in words if len(w) > 12] # Extremely long words | |
if long_words: | |
issues.append({ | |
'type': 'long_words', | |
'description': 'Extremely long words that might be mispronounced', | |
'instances': long_words, | |
'suggestion': 'Check if these words are pronounced correctly, try splitting them or reformulating' | |
}) | |
# 5. Check for special characters that are preserved after cleaning | |
import re | |
special_chars = re.findall(r'[^a-zA-Z0-9\s.,;:!?\'"-]', cleaned_text) | |
if special_chars: | |
unique_special_chars = list(set(special_chars)) | |
issues.append({ | |
'type': 'special_chars_preserved', | |
'description': 'Special characters preserved by the tokenizer', | |
'instances': unique_special_chars, | |
'suggestion': 'These characters might cause pronunciation issues' | |
}) | |
return { | |
'issues': issues, | |
'has_issues': len(issues) > 0, | |
'cleaned_text': cleaned_text | |
} | |
except Exception as e: | |
print(f"Error in tokenizer analysis: {e}") | |
return { | |
'issues': [{ | |
'type': 'analysis_error', | |
'description': 'Error during analysis with the tokenizer', | |
'instances': [str(e)], | |
'suggestion': 'Technical error, please try again' | |
}], | |
'has_issues': True, | |
'cleaned_text': text | |
} | |
# Function to combine both analyses | |
def combined_analysis(text, language): | |
"""Perform comprehensive text analysis for optimal voice synthesis quality. | |
This function combines standard text analysis with XTTS tokenizer analysis | |
to detect and report all potential issues that might affect speech synthesis. | |
Args: | |
text: The text to analyze for speech synthesis compatibility | |
language: Language name (English, French, Spanish, German, Italian, Portuguese, Polish, Turkish, Russian, Dutch, Czech, Arabic, Chinese, Hungarian, Korean, Japanese, Hindi) | |
Returns: | |
A tuple containing detailed analysis report and cleaned text ready for synthesis | |
""" | |
language_code = SUPPORTED_LANGUAGES[language] | |
# Run standard analysis | |
standard_analysis = analyze_text(text, language_code) | |
# Run analysis with tokenizer | |
tokenizer_analysis = analyze_with_tokenizer(text, language_code) | |
# Combine results | |
display_text = format_issues_for_display(standard_analysis, language_code, tokenizer_analysis) | |
# Get the preprocessed text (prefer the result from the tokenizer if it exists) | |
cleaned_text = tokenizer_analysis.get('cleaned_text', "") | |
if not cleaned_text or cleaned_text == text: | |
cleaned_text = preprocess_text(text, language_code) if text else "" | |
return display_text, cleaned_text | |
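# Illustrative call (outputs depend on the analyzers; shown values are only indicative):
#   report, cleaned = combined_analysis("Visit https://example.com!", "English")
#   -> report: formatted list of detected issues; cleaned: text ready for synthesis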
def cleanup_old_files(max_age_minutes=60): | |
""" | |
Optimized: deletes temporary files older than max_age_minutes. | |
This function can be called regularly to prevent accumulation of files. | |
""" | |
try: | |
now = time.time() | |
count_removed = 0 | |
# Clean temporary files | |
for file in TEMP_DIR.glob("*"): | |
if file.is_file(): | |
file_age_minutes = (now - os.path.getmtime(file)) / 60 | |
if file_age_minutes > max_age_minutes: | |
os.remove(file) | |
count_removed += 1 | |
# Clean old output files | |
for file in OUTPUT_DIR.glob("*.mp3"): | |
if file.is_file(): | |
file_age_days = (now - os.path.getmtime(file)) / (24 * 60 * 60) | |
if file_age_days > 7: # Keep one week | |
os.remove(file) | |
count_removed += 1 | |
return count_removed | |
    except Exception as e:
        print(f"Error during file cleanup: {e}")
        return 0
# Create interface with Gradio Blocks | |
with gr.Blocks(theme=gr.themes.Ocean(), css=""" | |
.gradio-container { | |
max-width: 1280px !important; | |
margin: auto !important; | |
} | |
#header { | |
display: flex; | |
justify-content: center; | |
align-items: center; | |
padding: 10px 0; | |
} | |
""") as interface: | |
with gr.Row(elem_id="header"): | |
gr.Markdown( | |
""" | |
<div style="text-align: center;"> | |
<h1 style="margin: 0; font-size: 1.8rem;">🎙️ Voice Cloning Studio</h1> | |
<p style="margin: 0; font-size: 1rem;">Bring any voice to life from a 3-second audio sample.</p> | |
</div> | |
""" | |
) | |
# Get all reference audio files and simplify their display | |
try: | |
files_paths = [str(f) for f in REF_AUDIO_DIR.iterdir() if f.is_file() and f.suffix.lower() in ['.wav', '.mp3']] | |
files_display = [os.path.basename(f) for f in files_paths] | |
file_path_mapping = dict(zip(files_display, files_paths)) | |
    except Exception as e:
        print(f"Could not list reference audio files: {e}")
        files_paths = []
        files_display = []
        file_path_mapping = {}
with gr.Row(equal_height=False): | |
# LEFT COLUMN: Inputs & Settings | |
with gr.Column(scale=2): | |
with gr.Tabs(): | |
with gr.TabItem("1. Voice"): | |
gr.Markdown("### Select a Reference Voice") | |
gr.Markdown("Choose a pre-defined example or upload your own 3-10 second audio clip. For best results, use a clear, high-quality recording with no background noise.") | |
example_audio_dropdown = gr.Dropdown( | |
choices=files_display, | |
label="Reference Audio (from examples)", | |
value=files_display[0] if files_display else None, | |
interactive=True | |
) | |
reference_audio_input = gr.Audio( | |
label="Reference Audio (upload your own)", | |
type="filepath" | |
) | |
with gr.TabItem("2. Text & Language"): | |
gr.Markdown("### Enter Text and Select Language") | |
lang_dropdown = gr.Dropdown( | |
choices=list(SUPPORTED_LANGUAGES.keys()), | |
value="English", | |
label="Language" | |
) | |
text_input = gr.Textbox( | |
label="Text to Synthesize", | |
placeholder="Enter text here...", | |
lines=5, | |
value="Hello, I am a voice generated by artificial intelligence. How may I assist you today?" | |
) | |
                    with gr.Row():
                        # Store (language_code, button) pairs so the click handlers
                        # wired below stay correctly paired even if a language is
                        # missing from EXAMPLE_TEXTS.
                        example_buttons = []
                        example_langs_to_show = ["en", "fr", "es", "de", "zh-cn"]
                        for lang in example_langs_to_show:
                            if lang in EXAMPLE_TEXTS:
                                example_buttons.append((lang, gr.Button(f"Example ({lang.upper()})")))
with gr.Accordion("Text Analysis & Preprocessing", open=True): | |
with gr.Row(): | |
analyze_button = gr.Button("Analyze Text") | |
enable_preprocessing = gr.Checkbox( | |
value=False, | |
label="Preprocess text automatically" | |
) | |
text_analysis_output = gr.Textbox( | |
label="Text Analysis", | |
value="Click 'Analyze Text' to see results here.", | |
lines=6 | |
) | |
preprocessed_text_output = gr.Textbox( | |
label="Preprocessed Text", | |
value="The processed text will appear here after analysis or generation.", | |
lines=3, | |
visible=True | |
) | |
with gr.TabItem("3. Settings"): | |
gr.Markdown("### Fine-Tune Your Audio") | |
gr.Markdown("Adjust these settings to control the style and quality of the generated speech.") | |
with gr.Accordion("Generation Settings", open=True): | |
with gr.Row(): | |
with gr.Column(): | |
temperature_slider = gr.Slider(minimum=0.1, maximum=1.5, step=0.05, value=0.75, label="Temperature") | |
speed_slider = gr.Slider(minimum=0.5, maximum=2.0, step=0.05, value=1.0, label="Speed") | |
do_sample = gr.Checkbox(value=True, label="Enable Sampling (do_sample)") | |
with gr.Column(): | |
repetition_penalty = gr.Slider(minimum=1.0, maximum=5.0, step=0.1, value=5.0, label="Repetition Penalty") | |
length_penalty = gr.Slider(minimum=1.0, maximum=2.0, step=0.1, value=1.0, label="Length Penalty") | |
gpt_cond_len = gr.Slider(minimum=10, maximum=50, step=1, value=30, label="GPT Conditioning Length") | |
top_k = gr.Slider(minimum=0, maximum=50, step=1, value=50, label="Top-K") | |
top_p = gr.Slider(minimum=0.0, maximum=1.0, step=0.05, value=0.85, label="Top-P") | |
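                        # Top-K and Top-P constrain the sampling distribution: lower
                        # values make delivery more deterministic, higher values allow
                        # more expressive variation.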
with gr.Accordion("Text Splitting", open=False): | |
text_splitting_method = gr.Radio( | |
choices=["Native XTTS splitting", "Custom splitting", "No splitting"], | |
value="Native XTTS splitting", | |
label="Text Splitting Method" | |
) | |
enable_text_splitting = gr.Checkbox( | |
value=True, | |
label="enable_text_splitting (XTTS parameter)", | |
visible=False | |
) | |
max_chars_per_segment = gr.Slider( | |
minimum=50, | |
maximum=400, | |
step=10, | |
value=250, | |
label="Max characters per segment", | |
visible=False | |
) | |
with gr.Accordion("Silence Removal", open=False): | |
remove_silence_enabled = gr.Checkbox(value=True, label="Remove silences from audio") | |
silence_threshold = gr.Slider(minimum=-60, maximum=-20, step=5, value=-45, label="Silence threshold (dB)") | |
min_silence_len = gr.Slider(minimum=300, maximum=1000, step=50, value=300, label="Minimum silence length (ms)") | |
keep_silence = gr.Slider(minimum=100, maximum=500, step=10, value=100, label="Silence to keep (ms)") | |
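                        # These sliders feed the silence-removal step; the units match
                        # pydub's split_on_silence parameters (silence_thresh in dBFS,
                        # min_silence_len and keep_silence in milliseconds).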
# RIGHT COLUMN: Output | |
with gr.Column(scale=1): | |
gr.Markdown("### 4. Generate & Listen") | |
gr.Markdown("Click the button to generate your audio. Results will appear below.") | |
generate_button = gr.Button("Generate Audio", variant="primary", scale=1) | |
output_audio = gr.Audio(label="Generated Audio") | |
output_message = gr.Textbox(label="Status & Tips", visible=True, lines=8) | |
with gr.Accordion("User Guide, Disclaimer & API Info", open=False): | |
with gr.Tabs(): | |
with gr.TabItem("🎯 Quick Start Guide"): | |
gr.Markdown(""" | |
## 🎯 Quick User Guide | |
1. **Choose a reference voice**: In the **Voice** tab, select an example from the dropdown or upload your own clear audio file (3-10 seconds). | |
2. **Enter your text**: In the **Text & Language** tab, type or paste the text you want to synthesize and select the correct language. | |
3. **Generate**: Click the "Generate Audio" button. | |
4. **Iterate**: If you're not happy with the result, try regenerating. Small changes to the settings in the **Settings** tab can produce different results. | |
### 🔍 Essential Tips | |
- **Reference Audio Quality**: The quality of the generated audio heavily depends on the reference. Use clean recordings with no background noise. | |
- **Text Preprocessing**: Keep "Preprocess text automatically" enabled. It improves pronunciation of numbers, symbols, and URLs. Use the "Analyze Text" button to see potential issues. | |
- **Optimizing Results**: For long texts, "Native XTTS splitting" is recommended. To change the speech style, try regenerating, adjusting the `Temperature`, or changing the `Speed`. | |
- **Languages**: Ensure the selected language matches the text. | |
""") | |
with gr.TabItem("⚠️ Disclaimer"): | |
gr.Markdown(""" | |
## ⚠️ Disclaimer and Legal Notice | |
**By using this voice cloning application, you acknowledge and agree to the following:** | |
1. This application is provided "as is" without any warranties of any kind, either express or implied. | |
2. The creator(s) of this application accept no responsibility or liability for any misuse of the technology. | |
3. You are solely responsible for obtaining proper consent when cloning someone else's voice. | |
4. You agree not to use this technology for deceptive, harmful, or illegal purposes. | |
5. Voice cloning results may vary in quality and accuracy; no specific results are guaranteed. | |
6. You understand that voice cloning technology has ethical implications and agree to use it responsibly. | |
The technology is intended for legitimate creative, educational, and accessibility purposes only. | |
--- | |
### License & Model Information | |
By accessing or using any feature within this space, you acknowledge and accept the terms of the following license: [https://coqui.ai/cpml](https://coqui.ai/cpml). | |
**Model source:** [coqui/XTTS-v2](https://huggingface.co/coqui/XTTS-v2) | |
""") | |
with gr.TabItem("🔧 API Tools"): | |
gr.Markdown(f""" | |
## 🛠️ Model Context Protocol (MCP) Tools | |
This application exposes MCP tools that you can use with LLMs. | |
**MCP Endpoint:** `https://hasanbasbunar-voice-cloning-xtts-v2.hf.space/gradio_api/mcp/sse` | |
--- | |
### 🎤 `voice_clone_synthesis` | |
Generates an audio file by cloning a voice from a reference audio file (provided via URL or a local example). | |
**Parameters:** | |
- `text` (string, required): The text to synthesize. | |
- `reference_audio_url` (string, optional): A public URL for a reference audio file (WAV, MP3). **Provide this OR `example_audio_name`.** | |
- `example_audio_name` (string, optional): The name of a predefined example audio file. **Provide this OR `reference_audio_url`.** Available files are: {', '.join(files_display)}. | |
- `language` (string, optional): The language of the text. Default: "English". | |
- ... (and other advanced parameters, see the function's docstring for a full list). | |
**Returns:** | |
- `string`: A URL to the generated MP3 audio file. | |
--- | |
### 📊 `analyze_text_for_speech` | |
Analyzes text for potential pronunciation issues. | |
**Parameters:** | |
- `text` (string, required): The text to analyze. | |
- `language` (string, required): The language of the text. | |
**Returns:** | |
- `object`: A JSON object with the detected issues. | |
--- | |
### 🔧 `preprocess_text_for_speech` | |
Cleans and preprocesses text for optimal speech synthesis. | |
**Parameters:** | |
- `text` (string, required): The text to preprocess. | |
- `language` (string, required): The language of the text. | |
**Returns:** | |
- `string`: The cleaned text. | |
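                    ---
                    ### 💡 Example: calling a tool with `gradio_client`
                    A minimal sketch (the Space id below is inferred from the MCP endpoint above; adjust it if the Space moves):
                    ```python
                    from gradio_client import Client

                    client = Client("hasanbasbunar/Voice-Cloning-XTTS-v2")
                    cleaned = client.predict(
                        "Visit https://example.com and pay 25 dollars!",  # text
                        "English",                                        # language
                        api_name="/preprocess_text_for_speech"
                    )
                    print(cleaned)
                    ```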
""") | |
    # Wire the example buttons: each fills the text box with the sample text
    # for its language and selects that language in the dropdown.
    for lang_code, button in example_buttons:
        lang_name = next((k for k, v in SUPPORTED_LANGUAGES.items() if v == lang_code), None)
        if lang_name:
            # Default arguments bind the per-iteration values at definition time,
            # avoiding the late-binding closure pitfall and the hidden-Textbox hack.
            button.click(
                lambda t=EXAMPLE_TEXTS[lang_code], l=lang_name: (t, l),
                inputs=[],
                outputs=[text_input, lang_dropdown],
                api_name=False
            )
# Function to analyze text and display results | |
def analyze_input_text(text, language): | |
language_code = SUPPORTED_LANGUAGES[language] | |
analysis = analyze_text(text, language_code) | |
display_text = format_issues_for_display(analysis, language_code) | |
# Preprocess text and display it | |
preprocessed = preprocess_text(text, language_code) if text else "" | |
return display_text, preprocessed | |
# Connect event handlers for text analysis | |
text_input.change( | |
analyze_input_text, | |
inputs=[text_input, lang_dropdown], | |
outputs=[text_analysis_output, preprocessed_text_output], | |
api_name=False | |
) | |
lang_dropdown.change( | |
analyze_input_text, | |
inputs=[text_input, lang_dropdown], | |
outputs=[text_analysis_output, preprocessed_text_output], | |
api_name=False | |
) | |
analyze_button.click( | |
combined_analysis, | |
inputs=[text_input, lang_dropdown], | |
outputs=[text_analysis_output, preprocessed_text_output], | |
api_name=False | |
) | |
# Function to validate audio files | |
def validate_audio_file(file_path, max_size_mb=20, min_duration_sec=1, max_duration_sec=60): | |
""" | |
Validates audio files to ensure they are valid, have appropriate size and duration. | |
Args: | |
file_path (str): Path to the audio file | |
max_size_mb (int): Maximum file size in MB | |
min_duration_sec (float): Minimum duration in seconds | |
max_duration_sec (float): Maximum duration in seconds | |
Returns: | |
tuple: (is_valid, error_message) | |
""" | |
# Check if file exists | |
if not os.path.exists(file_path): | |
return False, "Error: File does not exist" | |
# Check file extension | |
file_ext = os.path.splitext(file_path)[1].lower() | |
if file_ext not in ['.mp3', '.wav']: | |
return False, f"Error: Invalid file format {file_ext}. Only MP3 and WAV files are supported." | |
# Check file size | |
file_size_mb = os.path.getsize(file_path) / (1024 * 1024) | |
if file_size_mb > max_size_mb: | |
return False, f"Error: File size ({file_size_mb:.1f} MB) exceeds the maximum allowed size ({max_size_mb} MB)" | |
try: | |
# Check audio duration and integrity | |
if file_ext == '.mp3': | |
audio = AudioSegment.from_mp3(file_path) | |
else: | |
audio = AudioSegment.from_wav(file_path) | |
duration_sec = len(audio) / 1000 | |
if duration_sec < min_duration_sec: | |
return False, f"Error: Audio duration ({duration_sec:.1f} sec) is too short (min: {min_duration_sec} sec)" | |
if duration_sec > max_duration_sec: | |
return False, f"Error: Audio duration ({duration_sec:.1f} sec) is too long (max: {max_duration_sec} sec)" | |
# Additional check for very quiet audio | |
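        # pydub's dBFS is relative to the maximum possible amplitude; values
        # below about -50 dBFS indicate a near-silent recording.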
if audio.dBFS < -50: | |
return True, "Warning: Audio is very quiet, which may result in poor voice cloning quality" | |
return True, None | |
except Exception as e: | |
return False, f"Error: Failed to process audio file - {str(e)}" | |
def handle_synthesis_request( | |
text, language, temperature, speed, reference_audio, example_audio_name, | |
do_sample, enable_text_splitting, repetition_penalty, length_penalty, | |
gpt_cond_len, top_k, top_p, remove_silence_enabled, silence_threshold, | |
min_silence_len, keep_silence, text_splitting_method, max_chars_per_segment, | |
enable_preprocessing | |
): | |
""" | |
Gradio callback to handle the "Generate Audio" button click. | |
This function orchestrates the synthesis process by: | |
1. Selecting and validating the reference audio. | |
2. Calling the main `synthesize_speech` function. | |
3. Formatting the output (audio and messages) for the Gradio interface. | |
""" | |
language_code = SUPPORTED_LANGUAGES[language] | |
# Ensure penalties are float | |
repetition_penalty = float(repetition_penalty) | |
length_penalty = float(length_penalty) | |
# Select reference audio | |
final_reference_audio = reference_audio | |
if not final_reference_audio and example_audio_name: | |
final_reference_audio = file_path_mapping.get(example_audio_name) | |
        # Validate the reference audio; validation may return a non-blocking
        # warning (is_valid=True with a message), which we surface in the
        # status box below.
        validation_warning = None
        if final_reference_audio:
            is_valid, validation_message = validate_audio_file(final_reference_audio)
            if not is_valid:
                return None, validation_message, ""
            validation_warning = validation_message
# Call the main synthesis function | |
audio_path, error_message, preprocessed_text = synthesize_speech( | |
text=text, | |
language=language_code, | |
temperature=temperature, | |
speed=speed, | |
reference_audio=final_reference_audio, | |
do_sample=do_sample, | |
repetition_penalty=repetition_penalty, | |
length_penalty=length_penalty, | |
gpt_cond_len=gpt_cond_len, | |
top_k=top_k, | |
top_p=top_p, | |
remove_silence_enabled=remove_silence_enabled, | |
silence_threshold=silence_threshold, | |
min_silence_len=min_silence_len, | |
keep_silence=keep_silence, | |
text_splitting_method=text_splitting_method, | |
max_chars_per_segment=max_chars_per_segment, | |
enable_preprocessing=enable_preprocessing | |
) | |
if error_message: | |
return None, error_message, preprocessed_text | |
success_message = f""" | |
✅ Audio generation successful! | |
💾 Use the download button to save the audio. | |
🔄 If you're not satisfied with the result (e.g., pronunciation, intonation, or pace), feel free to click "Generate Audio" again. | |
ℹ️ The generation process includes randomness controlled by the temperature parameter ({temperature:.2f}), so each output is unique. | |
🎤 For different results, try another voice from the "Reference Audio (from examples)" dropdown or upload your own.
⚙️ If the result is still not satisfactory after several attempts, adjust the parameters in the "Settings" tab.
""" | |
return audio_path, success_message, preprocessed_text | |
generate_button.click( | |
handle_synthesis_request, | |
inputs=[ | |
text_input, lang_dropdown, temperature_slider, speed_slider, | |
reference_audio_input, example_audio_dropdown, do_sample, | |
enable_text_splitting, repetition_penalty, length_penalty, | |
gpt_cond_len, top_k, top_p, remove_silence_enabled, | |
silence_threshold, min_silence_len, keep_silence, | |
text_splitting_method, max_chars_per_segment, enable_preprocessing | |
], | |
outputs=[output_audio, output_message, preprocessed_text_output], | |
api_name=False | |
) | |
# Function to update visibility and value of fields based on the splitting method | |
def update_text_splitting_options(method): | |
# Update the state of enable_text_splitting based on the selected method | |
is_native = method == "Native XTTS splitting" | |
is_custom = method == "Custom splitting" | |
# Value of the enable_text_splitting checkbox | |
enable_splitting = is_native | |
# Visibility of the max_chars_per_segment slider | |
show_max_chars = is_custom | |
return gr.update(value=enable_splitting), gr.update(visible=show_max_chars) | |
# Connect the function to the radio button change event | |
text_splitting_method.change( | |
update_text_splitting_options, | |
inputs=[text_splitting_method], | |
outputs=[enable_text_splitting, max_chars_per_segment], | |
api_name=False | |
) | |
# Section for API endpoints (hidden from UI) | |
with gr.Tab("API Endpoints", visible=False): | |
# API: voice_clone_synthesis | |
with gr.Row(): | |
api_synth_text = gr.Textbox(label="Text") | |
api_synth_ref_url = gr.Textbox(label="Reference Audio URL") | |
api_synth_example_name = gr.Dropdown(files_display, label="Example Audio Name") | |
api_synth_lang = gr.Dropdown(list(SUPPORTED_LANGUAGES.keys()), label="Language", value="English") | |
api_synth_temp = gr.Slider(minimum=0.1, maximum=1.5, value=0.75, label="Temperature") | |
api_synth_speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, label="Speed") | |
api_synth_do_sample = gr.Checkbox(value=True, label="Do Sample") | |
api_synth_rep_penalty = gr.Slider(minimum=1.0, maximum=5.0, value=5.0, label="Repetition Penalty") | |
api_synth_len_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.0, label="Length Penalty") | |
api_synth_gpt_cond_len = gr.Slider(minimum=10, maximum=50, value=30, label="GPT Cond Length") | |
api_synth_top_k = gr.Slider(minimum=0, maximum=50, value=50, label="Top K") | |
api_synth_top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.85, label="Top P") | |
api_synth_remove_silence = gr.Checkbox(value=True, label="Remove Silence") | |
api_synth_silence_thresh = gr.Slider(minimum=-60, maximum=-20, value=-45, label="Silence Threshold") | |
api_synth_min_silence_len = gr.Slider(minimum=300, maximum=1000, value=300, label="Min Silence Length") | |
api_synth_keep_silence = gr.Slider(minimum=100, maximum=500, value=100, label="Keep Silence") | |
api_synth_split_method = gr.Radio(choices=["Native XTTS splitting", "Custom splitting", "No splitting"], value="Native XTTS splitting", label="Splitting Method") | |
api_synth_max_chars = gr.Slider(minimum=50, maximum=400, value=250, label="Max Chars") | |
api_synth_preprocess = gr.Checkbox(value=False, label="Enable Preprocessing") | |
api_synth_output_audio = gr.Audio(label="Generated Audio") | |
api_synth_trigger = gr.Button("Synthesize_API") | |
# API: analyze_text_for_speech | |
with gr.Row(): | |
api_analyze_text = gr.Textbox(label="Text") | |
api_analyze_lang = gr.Dropdown(list(SUPPORTED_LANGUAGES.keys()), label="Language", value="English") | |
api_analyze_output = gr.JSON(label="Analysis Result") | |
api_analyze_trigger = gr.Button("Analyze_API") | |
# API: preprocess_text_for_speech | |
with gr.Row(): | |
api_preprocess_text = gr.Textbox(label="Text") | |
api_preprocess_lang = gr.Dropdown(list(SUPPORTED_LANGUAGES.keys()), label="Language", value="English") | |
api_preprocess_output = gr.Textbox(label="Preprocessed Text") | |
api_preprocess_trigger = gr.Button("Preprocess_API") | |
# Hook API names to the triggers | |
api_synth_trigger.click( | |
fn=voice_clone_synthesis, | |
inputs=[ | |
api_synth_text, api_synth_ref_url, api_synth_example_name, api_synth_lang, api_synth_temp, | |
api_synth_speed, api_synth_do_sample, api_synth_rep_penalty, | |
api_synth_len_penalty, api_synth_gpt_cond_len, api_synth_top_k, | |
api_synth_top_p, api_synth_remove_silence, api_synth_silence_thresh, | |
api_synth_min_silence_len, api_synth_keep_silence, api_synth_split_method, | |
api_synth_max_chars, api_synth_preprocess | |
], | |
outputs=[api_synth_output_audio], | |
api_name="voice_clone_synthesis" | |
) | |
api_analyze_trigger.click( | |
fn=analyze_text_for_speech, | |
inputs=[api_analyze_text, api_analyze_lang], | |
outputs=[api_analyze_output], | |
api_name="analyze_text_for_speech" | |
) | |
api_preprocess_trigger.click( | |
fn=preprocess_text_for_speech, | |
inputs=[api_preprocess_text, api_preprocess_lang], | |
outputs=[api_preprocess_output], | |
api_name="preprocess_text_for_speech" | |
) | |
if __name__ == "__main__": | |
# Setup periodic cleanup task to run every hour | |
def periodic_cleanup(): | |
"""Run cleanup task periodically in background""" | |
while True: | |
try: | |
# Sleep for 60 minutes | |
time.sleep(60 * 60) | |
                # Run cleanup and report how many files were removed
                files_removed = cleanup_old_files(max_age_minutes=60)
                if files_removed:
                    print(f"Background cleanup removed {files_removed} file(s)")
except Exception as e: | |
print(f"Error in background cleanup task: {e}") | |
# Start cleanup thread | |
cleanup_thread = threading.Thread(target=periodic_cleanup, daemon=True) | |
cleanup_thread.start() | |
    # Launch the main interface (the MCP server is enabled via environment variable)
interface.queue() | |
interface.launch(share=False, allowed_paths=[str(REF_AUDIO_DIR)]) |