import os

# Enable the MCP server
os.environ['GRADIO_MCP_SERVER'] = 'True'

import gradio as gr
import torchaudio
import torch
from pydub import AudioSegment, effects
import uuid
import subprocess
import time
import nltk
from nltk.tokenize import sent_tokenize
from pathlib import Path
import sys
from pydub.silence import split_on_silence
import re
from unicodedata import normalize
import numpy as np
import spaces
from huggingface_hub import snapshot_download
import threading
import requests
import tempfile

# Download the NLTK resources
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

# Definition of problematic characters by language
PROBLEMATIC_CHARS = {
    'global': ['&', '%', '@', '#', '$', '*', '+', '=', '()', '[]', '{}', '<>', '|', '/', '\\',
               '"', '…', '«', '»', '“', '”', '‘', '’'],
    'fr': ['&', '%', '@', '#', '$', '*', '+', '=', 'etc.'],
    'en': ['&', '%', '@', '#', '$', '*', '+', '=', 'etc.'],
    # Add specific characters for each language as needed
}

# Replacement rules by language
REPLACEMENT_RULES = {
    'global': {
        '&': {'fr': ' et ', 'en': ' and ', 'es': ' y ', 'de': ' und ', 'it': ' e ', 'pt': ' e ', 'default': ' and '},
        '%': {'fr': ' pourcent ', 'en': ' percent ', 'de': ' prozent ', 'default': ' percent '},
        '@': {'fr': ' arobase ', 'en': ' at ', 'default': ' at '},
        '#': {'fr': ' hashtag ', 'en': ' hashtag ', 'default': ' hashtag '},
        '...': {'default': ', '},
        '…': {'default': ', '},
        '"': {'default': ''},
        "'": {'default': ''},
        '«': {'default': ''},
        '»': {'default': ''},
        '“': {'default': ''},
        '”': {'default': ''},
        '‘': {'default': ''},
        '’': {'default': ''},
    },
    # You can add language-specific rules
}
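# Minimal illustrative helper (added for documentation only; it is not used by the
# app). It mirrors the lookup that preprocess_text() performs on REPLACEMENT_RULES:
# try the language-specific entry first, then fall back to the 'default' entry.
def _example_resolve_replacement(char: str, language_code: str) -> str:
    rules = REPLACEMENT_RULES['global'].get(char)
    if rules is None:
        return char  # no rule: keep the character unchanged
    return rules.get(language_code, rules.get('default', char))

# e.g. _example_resolve_replacement('&', 'fr') -> ' et '
#      _example_resolve_replacement('&', 'pl') -> ' and '   (falls back to 'default')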
def analyze_text(text, language_code):
    """Analyze text to detect potential pronunciation issues for voice synthesis.

    This function examines text for problematic characters, special symbols, URLs,
    numbers, and other elements that might affect speech quality in voice cloning.

    Args:
        text: The text to analyze for speech synthesis compatibility
        language_code: Language code (en, fr, es, de, it, pt, pl, tr, ru, nl, cs, ar, zh, hu, ko, ja, hi)

    Returns:
        Dictionary containing detected issues and suggestions for improvement
    """
    issues = []

    # Basic unicode normalization
    normalized_text = normalize('NFC', text)

    # Emoji detection
    import re
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE
    )
    emojis = emoji_pattern.findall(text)
    if emojis:
        issues.append({
            'type': 'emojis',
            'description': 'Emojis that will be removed during preprocessing',
            'instances': emojis,
            'suggestion': 'Emojis are replaced with spaces for better pronunciation'
        })

    # URL detection
    urls = re.findall(r'https?://\S+|www\.\S+', text)
    if urls:
        issues.append({
            'type': 'url',
            'description': 'Detected URLs that may be mispronounced',
            'instances': urls,
            'suggestion': 'Replace URLs with textual descriptions'
        })

    # Email detection
    emails = re.findall(r'\S+@\S+\.\S+', text)
    if emails:
        issues.append({
            'type': 'email',
            'description': 'Detected email addresses that may be mispronounced',
            'instances': emails,
            'suggestion': 'Replace emails with descriptive text'
        })

    # Detection of quotes and citation characters (completely exclude apostrophe)
    quote_chars = ['"', '«', '»', '“', '”', '‘', '’']
    found_quotes = []
    # For English, completely exclude apostrophes from problematic characters
    if language_code == 'en':
        # Don't report apostrophes as problematic in English
        pass
    else:
        # Look only for quotes, not apostrophes
        for char in quote_chars:
            if char in text:
                found_quotes.append(char)
    if found_quotes:
        issues.append({
            'type': 'quotes',
            'description': 'Quotes and citation characters that may affect pronunciation',
            'instances': found_quotes,
            'suggestion': 'Remove quotes and citation characters for better pronunciation'
        })

    # Detection of problematic characters (exclude apostrophes)
    global_chars = [c for c in PROBLEMATIC_CHARS.get('global', []) if c != "'"]
    lang_specific_chars = PROBLEMATIC_CHARS.get(language_code, [])
    all_problematic_chars = set(global_chars + lang_specific_chars) - set(quote_chars)  # Exclude quotes already treated
    found_chars = []
    for char in all_problematic_chars:
        if char in text:
            found_chars.append(char)
    if found_chars:
        issues.append({
            'type': 'special_chars',
            'description': 'Special characters that may cause pronunciation problems',
            'instances': found_chars,
            'suggestion': 'Replace special characters with their textual equivalent'
        })

    # Detection of long numbers (beyond 3 digits)
    numbers = re.findall(r'\b\d{4,}\b', text)
    if numbers:
        suggestion = "Write numbers in full"
        if language_code == 'fr':
            suggestion += " or add spaces between thousands (e.g., 10 000)"
        elif language_code == 'en':
            suggestion += " or use commas for thousands (e.g., 10,000)"
        issues.append({
            'type': 'numbers',
            'description': 'Long numbers that may be mispronounced',
            'instances': numbers,
            'suggestion': suggestion
        })

    # Detection of Roman numerals, with exception for the pronoun "I" in English
    if language_code == 'en':
        # In English, exclude "I" as a Roman
numeral because it's a personal pronoun roman_pattern = r'\b(?!I\b)[IVXLCDM]+\b' roman_numerals = re.findall(roman_pattern, text) if roman_numerals: issues.append({ 'type': 'roman_numerals', 'description': 'Roman numerals that may be mispronounced', 'instances': roman_numerals, 'suggestion': 'Replace Roman numerals with Arabic numbers' }) else: # For other languages, keep normal detection roman_pattern = r'\b[IVXLCDM]+\b' roman_numerals = re.findall(roman_pattern, text) if roman_numerals: issues.append({ 'type': 'roman_numerals', 'description': 'Roman numerals that may be mispronounced', 'instances': roman_numerals, 'suggestion': 'Replace Roman numerals with Arabic numbers' }) # Detection of abbreviations by language abbreviation_patterns = { 'fr': [r'\bM\.\s', r'\bMme\.\s', r'\bMlle\.\s', r'\bDr\.\s', r'\bProf\.\s', r'\betc\.\s', r'\bex\.\s'], 'en': [r'\bMr\.\s', r'\bMrs\.\s', r'\bDr\.\s', r'\bProf\.\s', r'\betc\.\s', r'\be\.g\.\s', r'\bi\.e\.\s'], 'es': [r'\bSr\.\s', r'\bSra\.\s', r'\bDr\.\s', r'\betc\.\s'], 'default': [r'\b[A-Z]\.\s', r'\b[A-Z][a-z]+\.\s'] } patterns = abbreviation_patterns.get(language_code, abbreviation_patterns['default']) found_abbrevs = [] for pattern in patterns: matches = re.findall(pattern, text) found_abbrevs.extend(matches) if found_abbrevs: issues.append({ 'type': 'abbreviations', 'description': 'Detected abbreviations that may be mispronounced', 'instances': found_abbrevs, 'suggestion': 'Write abbreviations in full' }) # Detection of repeated punctuation repeated_punct = re.findall(r'([!?.,;:]{2,})', text) if repeated_punct: issues.append({ 'type': 'repeated_punct', 'description': 'Repeated punctuation that may cause incorrect pauses', 'instances': repeated_punct, 'suggestion': 'Simplify punctuation (use only one character)' }) # Detection of missing spaces around punctuation, excluding decimal numbers missing_spaces = [] # Specific patterns to look for patterns = [ r'[a-zA-ZÀ-ÿ][,.;:!?][a-zA-ZÀ-ÿ]' # letter+punctuation+letter ] # In English, exclude contractions with apostrophes (I'm, don't, isn't, etc.) if language_code != 'en': for pattern in patterns: matches = re.findall(pattern, text) if matches: missing_spaces.extend(matches) if missing_spaces: issues.append({ 'type': 'missing_spaces', 'description': 'Punctuation without spaces that may affect pronunciation', 'instances': missing_spaces, 'suggestion': 'Add appropriate spaces around punctuation (except for decimal numbers)' }) # Detection of language-specific issues if language_code == 'fr': # Poorly formatted ordinal numbers in French ordinals = re.findall(r'\b\d+(eme|ème|er|ere|ère)\b', text) if ordinals: issues.append({ 'type': 'fr_ordinals', 'description': 'Ordinal numbers that may be mispronounced', 'instances': ordinals, 'suggestion': 'Write ordinals in letters (premier, deuxième, etc.)' }) elif language_code == 'en': # English-specific issues dates = re.findall(r'\b\d{1,2}/\d{1,2}/\d{2,4}\b', text) if dates: issues.append({ 'type': 'en_dates', 'description': 'Dates in numeric format that may be misinterpreted', 'instances': dates, 'suggestion': 'Write dates in full (e.g., January 1st, 2022)' }) return { 'issues': issues, 'has_issues': len(issues) > 0, 'normalized_text': normalized_text } # Add a function to convert numbers to text def number_to_text_fr(number_str): """ Converts a number (integer or decimal) to French text. 
Args: number_str (str): The number to convert to text format Returns: str: The number written out in words """ parts = number_str.replace(',', '.').split('.') # Function to convert an integer to text def int_to_text(n): if n == '0': return 'zéro' units = ['', 'un', 'deux', 'trois', 'quatre', 'cinq', 'six', 'sept', 'huit', 'neuf'] teens = ['dix', 'onze', 'douze', 'treize', 'quatorze', 'quinze', 'seize', 'dix-sept', 'dix-huit', 'dix-neuf'] tens = ['', 'dix', 'vingt', 'trente', 'quarante', 'cinquante', 'soixante', 'soixante', 'quatre-vingt', 'quatre-vingt'] n = int(n) if n < 10: return units[n] elif n < 20: return teens[n-10] elif n < 70: div, mod = divmod(n, 10) return tens[div] + ('-et-un' if mod == 1 else ('-' + units[mod] if mod else '')) elif n < 80: div, mod = divmod(n, 10) return tens[div] + ('-' + teens[mod-10] if mod else '') elif n < 90: div, mod = divmod(n, 10) return tens[div] + (('-' + units[mod]) if mod else 's') elif n < 100: div, mod = divmod(n, 10) return tens[div] + ('-' + teens[mod-10] if mod else 's') else: if n < 200: return 'cent' + (' ' + int_to_text(n % 100) if n % 100 else '') else: div, mod = divmod(n, 100) return int_to_text(div) + ' cent' + ('s' if div > 1 and mod == 0 else '') + (' ' + int_to_text(mod) if mod else '') # Process the integer part integer_part = int_to_text(parts[0]) # If there's a decimal part if len(parts) > 1 and parts[1]: # If the decimal part is 1 or 2 digits decimal_part = parts[1] if len(decimal_part) <= 2: decimal_text = int_to_text(decimal_part) # For 01, 02, etc. we say "un", "deux", etc. rather than "un", "deux" if len(decimal_part) == 2 and decimal_part[0] == '0': decimal_text = int_to_text(decimal_part[1]) return f"{integer_part} virgule {decimal_text}" else: # For more than 2 digits, we pronounce each digit decimal_text = ' '.join(int_to_text(d) for d in decimal_part) return f"{integer_part} virgule {decimal_text}" return integer_part def preprocess_text(text, language_code, apply_replacements=True): """Preprocess and clean text for optimal voice synthesis results. This function automatically fixes common text issues like special characters, numbers, URLs, and language-specific elements to improve speech quality. 
    Args:
        text: The text to preprocess for voice synthesis
        language_code: Language code (en, fr, es, de, it, pt, pl, tr, ru, nl, cs, ar, zh, hu, ko, ja, hi)
        apply_replacements: If True, applies automatic character replacements for better pronunciation

    Returns:
        The preprocessed text ready for high-quality voice synthesis
    """
    # Unicode normalization
    text = normalize('NFC', text)

    if apply_replacements:
        # Detection and removal of emojis and special Unicode characters
        import re
        # Regex to detect emojis and Unicode symbols
        emoji_pattern = re.compile(
            "["
            "\U0001F600-\U0001F64F"  # emoticons
            "\U0001F300-\U0001F5FF"  # symbols & pictographs
            "\U0001F680-\U0001F6FF"  # transport & map symbols
            "\U0001F700-\U0001F77F"  # alchemical symbols
            "\U0001F780-\U0001F7FF"  # Geometric Shapes
            "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
            "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
            "\U0001FA00-\U0001FA6F"  # Chess Symbols
            "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
            "\U00002702-\U000027B0"  # Dingbats
            "\U000024C2-\U0001F251"
            "]+",
            flags=re.UNICODE
        )
        # Replace emojis with a space
        text = emoji_pattern.sub(' ', text)

        # Apply global replacement rules
        for char, replacements in REPLACEMENT_RULES.get('global', {}).items():
            if char in text:
                # Use language-specific rule if available, otherwise default rule
                replacement = replacements.get(language_code, replacements.get('default', char))
                text = text.replace(char, replacement)

        # Transform URLs and emails
        text = re.sub(r'https?://\S+|www\.\S+', ' URL link ', text)
        text = re.sub(r'\S+@\S+\.\S+', ' email address ', text)

        # Process quotes (removal or replacement)
        # Straight quotes " and '
        text = text.replace('"', '')
        text = text.replace("'", '')
        # French quotes « and »
        text = text.replace('«', '')
        text = text.replace('»', '')
        # Smart typographic quotes (curly quotes)
        text = text.replace('“', '')  # opening quote
        text = text.replace('”', '')  # closing quote
        text = text.replace('‘', '')  # opening apostrophe
        text = text.replace('’', '')  # closing apostrophe

        # Replace Roman numerals with their equivalent (if needed)
        if language_code in ['fr', 'en', 'es', 'it', 'pt']:
            roman_numerals = {
                'I': '1', 'II': '2', 'III': '3', 'IV': '4', 'V': '5',
                'VI': '6', 'VII': '7', 'VIII': '8', 'IX': '9', 'X': '10',
                'XI': '11', 'XII': '12', 'XIII': '13', 'XIV': '14', 'XV': '15',
                'XVI': '16', 'XVII': '17', 'XVIII': '18', 'XIX': '19', 'XX': '20'
            }
            # Exception for the personal pronoun "I" in English
            if language_code == 'en':
                # Use a regex that only detects true Roman numerals
                # and not the personal pronoun "I" in English
                for roman, arabic in roman_numerals.items():
                    if roman == 'I':
                        # For "I" in English, check that it's not alone or between spaces
                        # A true Roman numeral I will typically be followed by a period or
                        # in a numeric context
                        text = re.sub(r'\b(I)\b(?!\'m|\'ve|\'ll|\'d|\.)', roman, text)  # Preserve "I" pronoun
                        text = re.sub(r'\b(I)\.', arabic + '.', text)  # I. => 1.
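                        # Added note: with the two substitutions above, an input like
                        # "Chapter I. I think so." becomes "Chapter 1. I think so.":
                        # "I" directly followed by a period is treated as a Roman numeral,
                        # while the bare pronoun "I" (and contractions such as "I'm") is
                        # left untouched. A pronoun that ends a sentence ("..., I.") would
                        # still be converted; that is a limitation of this heuristic.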
else: # For other Roman numerals, standard behavior text = re.sub(fr'\b{roman}\b', arabic, text) else: # For other languages, replace all Roman numerals for roman, arabic in roman_numerals.items(): text = re.sub(fr'\b{roman}\b', arabic, text) # Language-specific processing for French if language_code == 'fr': # Replace common numbers text = re.sub(r'\b1er\b', 'premier', text) text = re.sub(r'\b1ère\b', 'première', text) text = re.sub(r'\b(\d+)(ème)\b', r'\1 ième', text) # Improved processing of decimal numbers and percentages in French # Search for patterns like "2,95 %" or "2,95%" def replace_decimal_percent(match): num = match.group(1) return number_to_text_fr(num) + " pour cent" # Search for decimal numbers followed by % (with or without space) text = re.sub(r'(\d+,\d+)\s*%', replace_decimal_percent, text) # Process decimal numbers without percentage def replace_decimal(match): return number_to_text_fr(match.group(0)) # Search for decimal numbers (with comma) text = re.sub(r'\b\d+,\d+\b', replace_decimal, text) # Process simple percentages text = re.sub(r'(\d+)\s*%', lambda m: number_to_text_fr(m.group(1)) + " pour cent", text) # Apply French typographical rules for punctuation: # - No space before: . , ... ) ] } # - Space after: . , ... ) ] } # - Space before and after: : ; ! ? « » # First, normalize by removing all spaces around punctuation text = re.sub(r'\s*([.,;:!?\[\]\(\)\{\}])\s*', r'\1', text) # Then, add spaces according to French rules # Simple punctuation with space after only text = re.sub(r'([.,)])', r'\1 ', text) # Punctuation with space before and after text = re.sub(r'([;:!?])', r' \1 ', text) # Special case for French quotes text = re.sub(r'«', r'« ', text) text = re.sub(r'»', r' »', text) # Language-specific processing for English elif language_code == 'en': # Replace ordinals text = re.sub(r'\b1st\b', 'first', text) text = re.sub(r'\b2nd\b', 'second', text) text = re.sub(r'\b3rd\b', 'third', text) text = re.sub(r'\b(\d+)th\b', r'\1th', text) # Process percentages in English (decimals with point) text = re.sub(r'(\d+\.\d+)%', r'\1 percent', text) text = re.sub(r'(\d+)%', r'\1 percent', text) # English typographical rules: no space before punctuation, space after text = re.sub(r'\s*([.,;:!?])\s*', r'\1 ', text) # For other languages, general rule: no space before, space after punctuation else: text = re.sub(r'\s*([.,;:!?])\s*', r'\1 ', text) # Clean up multiple spaces text = re.sub(r'\s+', ' ', text).strip() return text def format_issues_for_display(analysis_result, language_code, tokenizer_analysis=None): """ Formats detected issues for display in the interface. Args: analysis_result (dict): Result of the text analysis language_code (str): Language code tokenizer_analysis (dict): Result of tokenizer analysis (optional) Returns: str: Formatted text for display """ if not analysis_result['has_issues'] and (tokenizer_analysis is None or not tokenizer_analysis['has_issues']): return "✅ No issues detected in the text." 
formatted_text = "⚠️ Potential issues detected:\n\n" # Format standard text analysis issues if analysis_result['has_issues']: formatted_text += "📊 Text analysis results:\n" for issue in analysis_result['issues']: formatted_text += f"- {issue['description']}:\n" formatted_text += f" • Detected: {', '.join(repr(i) for i in issue['instances'])}\n" formatted_text += f" • Suggestion: {issue['suggestion']}\n\n" # Format tokenizer analysis issues (if available) if tokenizer_analysis and tokenizer_analysis['has_issues']: formatted_text += "\n🔍 Tokenizer analysis results:\n" for issue in tokenizer_analysis['issues']: formatted_text += f"- {issue['description']}:\n" formatted_text += f" • Detected: {', '.join(repr(i) for i in issue['instances'])}\n" formatted_text += f" • Suggestion: {issue['suggestion']}\n\n" if 'cleaned_text' in tokenizer_analysis: formatted_text += "\n📝 Cleaned text by XTTS tokenizer:\n" formatted_text += f"{tokenizer_analysis['cleaned_text']}\n\n" formatted_text += "\nEnable text preprocessing to automatically fix some of these issues." return formatted_text repo_id = "XTTS-v2" # Télécharger le modèle seulement s'il n'existe pas déjà if not os.path.exists(repo_id) or not os.path.exists(os.path.join(repo_id, "config.json")): try: print("Téléchargement du modèle XTTS-v2...") snapshot_download( repo_id="coqui/XTTS-v2", local_dir=repo_id, allow_patterns=["*.safetensors", "*.wav", "*.json", "*.pth"] ) print("Modèle téléchargé avec succès!") except Exception as e: print(f"Erreur lors du téléchargement: {e}") print("Essai avec git clone...") try: import subprocess result = subprocess.run( ["git", "clone", "https://huggingface.co/coqui/XTTS-v2", repo_id], capture_output=True, text=True ) if result.returncode == 0: print("Modèle téléchargé avec git clone!") else: print(f"Erreur git clone: {result.stderr}") raise Exception("Impossible de télécharger le modèle") except Exception as git_error: print(f"Erreur git clone: {git_error}") raise Exception("Veuillez télécharger le modèle manuellement avec: git clone https://huggingface.co/coqui/XTTS-v2") else: print("Modèle XTTS-v2 déjà présent.") # Relative path management BASE_DIR = Path(os.path.dirname(os.path.abspath(__file__))) MODELS_DIR = repo_id # BASE_DIR / "XTTS-v2" REF_AUDIO_DIR = BASE_DIR / "ref_audio_files" OUTPUT_DIR = BASE_DIR / "outputs" TEMP_DIR = OUTPUT_DIR / "temp" # Create necessary folders REF_AUDIO_DIR.mkdir(exist_ok=True) OUTPUT_DIR.mkdir(exist_ok=True) TEMP_DIR.mkdir(exist_ok=True) # Languages supported by XTTS SUPPORTED_LANGUAGES = { "English": "en", "French": "fr", "Spanish": "es", "German": "de", "Italian": "it", "Portuguese": "pt", "Polish": "pl", "Turkish": "tr", "Russian": "ru", "Dutch": "nl", "Czech": "cs", "Arabic": "ar", "Chinese": "zh-cn", "Japanese": "ja", "Korean": "ko", "Hungarian": "hu", "Hindi": "hi" } print(f"Initializing model from: {MODELS_DIR}") # Clean temporary files def cleanup_temp_files(): """Cleans temporary files in the TEMP_DIR folder""" try: for file in TEMP_DIR.glob("*"): if file.is_file(): os.remove(file) except Exception as e: print(f"Error while cleaning temporary files: {e}") # Clean old generated MP3 files (optional) def cleanup_old_outputs(max_age_days=7): """Deletes MP3 files older than max_age_days in the OUTPUT_DIR folder""" try: now = time.time() for file in OUTPUT_DIR.glob("*.mp3"): if file.is_file(): # If the file is older than max_age_days if os.path.getmtime(file) < now - (max_age_days * 86400): os.remove(file) except Exception as e: print("error cleanup old outputs") # Import XTTS 
modules try: from TTS.tts.configs.xtts_config import XttsConfig from TTS.tts.models.xtts import Xtts except ImportError as e: print(f"TTS import error: {e}") print("Please install dependencies with: pip install coqui-tts") sys.exit(1) # Install language-specific dependencies def install_language_dependencies(): """Check and install required dependencies for Asian languages""" try: # For Chinese (zh-cn) try: import pypinyin except ImportError: subprocess.check_call([sys.executable, "-m", "pip", "install", "pypinyin"]) # For Japanese (ja) try: import cutlet # Test if fugashi and mecab are also installed try: import fugashi except ImportError: subprocess.check_call([sys.executable, "-m", "pip", "install", "fugashi", "mecab-python3", "unidic-lite"]) except ImportError: subprocess.check_call([sys.executable, "-m", "pip", "install", "cutlet", "fugashi", "mecab-python3", "unidic-lite"]) # For Korean (ko) try: import hangul_romanize except ImportError: subprocess.check_call([sys.executable, "-m", "pip", "install", "hangul-romanize"]) return True except Exception as e: return False # Model initialization and configuration try: # Try to install language dependencies install_language_dependencies() config = XttsConfig() config.load_json(str("XTTS-v2/config.json")) model = Xtts.init_from_config(config) # model.load_safetensors_checkpoint( # config, checkpoint_dir=MODELS_DIR, use_deepspeed=False #) model.load_checkpoint(config, checkpoint_dir=str(MODELS_DIR), eval=True) if torch.cuda.is_available(): model.cuda() print("Model loaded on GPU") else: print("GPU not available, using CPU") except Exception as e: print(f"Error loading model: {e}") print(f"Make sure the XTTS-v2 model is present in: {MODELS_DIR}") sys.exit(1) def remove_silence( audio_segment, silence_thresh=-45, min_silence_len=300, keep_silence=100 ): """ Optimisé: Coupe audio_segment autour des silences puis reconstruit l'audio en supprimant les silences. Ajuste silence_thresh et min_silence_len en fonction du niveau sonore de votre audio. 
""" # Vérifie que l'audio n'est pas trop court pour éviter les problèmes if len(audio_segment) < 1000: # moins d'une seconde return audio_segment # Première tentative avec les paramètres fournis chunks = split_on_silence( audio_segment, min_silence_len=min_silence_len, silence_thresh=silence_thresh, keep_silence=keep_silence ) # Si aucun segment n'est détecté ou peu de segments, ajuster les paramètres if not chunks or len(chunks) < 2: # Essayer avec des paramètres plus souples chunks = split_on_silence( audio_segment, min_silence_len=200, # Réduire pour détecter des silences plus courts silence_thresh=silence_thresh + 5, # Augmenter le seuil (moins négatif) pour détecter plus de silences keep_silence=keep_silence ) # Recombiner toutes les pièces non silencieuses if chunks: processed_audio = AudioSegment.empty() for chunk in chunks: processed_audio += chunk # Vérifier que l'audio n'a pas été trop raccourci length_ratio = len(processed_audio) / len(audio_segment) if length_ratio < 0.7: # Si plus de 30% a été supprimé # Garder une version moins agressive chunks = split_on_silence( audio_segment, min_silence_len=min_silence_len * 2, # Plus long, détecte moins de silences silence_thresh=silence_thresh - 5, # Plus strict (plus négatif) keep_silence=keep_silence * 2 # Garder plus de silence ) if chunks: processed_audio = AudioSegment.empty() for chunk in chunks: processed_audio += chunk else: return audio_segment return processed_audio else: # Si tout l'audio est considéré comme du silence, retourner l'original return audio_segment def chunk_sentence_by_words(sentence, max_length=200): """ Divise une phrase en sous-chunks (max. max_length caractères) sans couper au milieu d'un mot. Optimisé pour la performance. """ # Si la phrase est déjà suffisamment courte, la retourner directement if len(sentence) <= max_length: return [sentence] words = sentence.split() # division par mots sub_chunks = [] current_chunk = [] current_length = 0 for word in words: # Si ajouter ce mot dépasserait la longueur max, commencer un nouveau chunk word_len = len(word) + (1 if current_length > 0 else 0) # +1 pour l'espace if current_length + word_len > max_length: if current_chunk: # S'assurer qu'on a quelque chose à ajouter sub_chunks.append(" ".join(current_chunk)) current_chunk = [] current_length = 0 # Traiter les mots individuels qui sont plus longs que max_length if len(word) > max_length: sub_chunks.append(word) continue # Ajouter le mot au chunk courant current_chunk.append(word) current_length += word_len # Ajouter le dernier chunk s'il en reste if current_chunk: sub_chunks.append(" ".join(current_chunk)) return sub_chunks def split_text(text, max_length=150): """ - Divise 'text' en phrases (via sent_tokenize). - Si une phrase dépasse max_length, la divise mot par mot en utilisant chunk_sentence_by_words. - Retourne une liste de chunks, chacun ≤ max_length caractères. Optimisé pour la performance. """ # Vérifier que le texte n'est pas vide if not text.strip(): return [] # Division en phrases avec gestion d'erreur améliorée try: raw_sentences = sent_tokenize(text) if not raw_sentences: raw_sentences = [text] except Exception as e: # En cas d'erreur, utiliser une simple division par points raw_sentences = [s.strip() + '.' 
for s in text.split('.') if s.strip()] if not raw_sentences: raw_sentences = [text] # Initialiser la liste finale de chunks final_chunks = [] # Traiter chaque phrase for sentence in raw_sentences: sentence = sentence.strip() if not sentence: continue # Si la phrase entière est courte, l'ajouter directement if len(sentence) <= max_length: final_chunks.append(sentence) else: # Sinon, la diviser en sous-chunks sub_chunks = chunk_sentence_by_words(sentence, max_length) final_chunks.extend(sub_chunks) # S'assurer qu'on a des chunks if not final_chunks: for i in range(0, len(text), max_length): chunk = text[i:i+max_length] if chunk.strip(): # Ne pas ajouter de segments vides final_chunks.append(chunk) return final_chunks def check_language_dependencies(language): """ Vérifie les dépendances nécessaires pour une langue donnée. Cette fonction s'exécute sur CPU. Args: language (str): Code de langue à vérifier Returns: tuple: (None, None) si tout est ok, ou (None, message_erreur) si problème """ # Dépendances spécifiques par langue language_dependencies = { "zh-cn": "pypinyin", "ja": "cutlet,fugashi,unidic-lite", "ko": "hangul-romanize", } if language in language_dependencies: try: # Essayer d'importer dynamiquement la dépendance if language == "zh-cn": import importlib importlib.import_module("pypinyin") elif language == "ja": import importlib importlib.import_module("cutlet") # Vérifier les dépendances supplémentaires pour le japonais try: importlib.import_module("fugashi") # Vérifier si unidic-lite est installé try: import unidic_lite except ImportError: raise ImportError("Japanese requires: unidic-lite") except ImportError: raise ImportError("Japanese requires: fugashi and unidic-lite") elif language == "ko": import importlib importlib.import_module("hangul_romanize") except ImportError as e: dependency = language_dependencies[language] language_name = { "zh-cn": "Chinese", "ja": "Japanese", "ko": "Korean" }[language] # Message personnalisé pour les dépendances japonaises if language == "ja" and "fugashi" in str(e): install_command = "pip install fugashi mecab-python3 unidic-lite" error_message = f""" Error: Missing dependencies for {language_name} language. Please run the following command to install the required packages: {install_command} Then restart the application. """ else: install_command = f"pip install {dependency}" error_message = f""" Error: Missing dependency for {language_name} language. Please run the following command to install the required package: {install_command} Then restart the application. """ return None, error_message return None, None @spaces.GPU() def synthesize_speech( text, language, temperature, speed, reference_audio, do_sample=True, repetition_penalty=1.0, length_penalty=1.0, gpt_cond_len=30, top_k=50, top_p=0.85, remove_silence_enabled=True, silence_threshold=-45, min_silence_len=300, keep_silence=100, text_splitting_method="Native XTTS splitting", max_chars_per_segment=250, enable_preprocessing=True ): """Generate speech from text by orchestrating preprocessing, synthesis, and post-processing. This function acts as the main pipeline for TTS generation. It takes raw text and parameters, handles dependencies, preprocesses text, generates a raw audio waveform using the XTTS model, and then post-processes the audio (normalization, silence removal) to produce a final MP3 file. Args: text (str): The text to convert to speech. language (str): Language code for synthesis (e.g., 'en', 'fr'). 
temperature (float): Controls randomness in generation (0.1-1.5, recommended: 0.75). speed (float): Speech speed multiplier (0.5-2.0, 1.0 = normal speed). reference_audio (str): File path or URL to reference audio for voice cloning. do_sample (bool): Enable sampling for more natural speech variation. repetition_penalty (float): Penalty for repetitive speech (1.0-5.0, recommended: 5.0). length_penalty (float): Penalty affecting speech length (1.0-2.0, recommended: 1.0). gpt_cond_len (int): Conditioning length for GPT model (10-50, recommended: 30). top_k (int): Top-K sampling parameter (0-50, 0 = disabled). top_p (float): Top-P sampling parameter (0.0-1.0, 0 = disabled). remove_silence_enabled (bool): Remove silent parts from generated audio. silence_threshold (int): dB threshold for silence detection (-60 to -20). min_silence_len (int): Minimum silence length in ms to detect (300-1000). keep_silence (int): Amount of silence to keep in ms (100-500). text_splitting_method (str): Method for splitting long text. max_chars_per_segment (int): Maximum characters per segment for custom splitting. enable_preprocessing (bool): Automatically preprocess text for better pronunciation. Returns: tuple: (audio_file_path, error_message, preprocessed_text) - audio_file_path (str): Path to the generated MP3 audio file, or None on error. - error_message (str): A description of the error if one occurred, otherwise None. - preprocessed_text (str): The text after preprocessing has been applied. """ # Part 1: Validation and Parameter Setup if not text.strip(): return None, "Error: Text cannot be empty", text _, error_message = check_language_dependencies(language) if error_message: return None, error_message, text if top_k == 0: top_k = None if top_p == 0: top_p = None if temperature <= 0: temperature = 0.75 if repetition_penalty <= 0: repetition_penalty = 5.0 if length_penalty <= 0: length_penalty = 1.0 reference_audio_path = reference_audio # Part 2: Text Preprocessing preprocessed_text = text if enable_preprocessing: preprocessed_text = preprocess_text(text, language) print(f"Preprocessed text: {preprocessed_text}") # Part 3: Waveform Generation (Core Synthesis) try: if text_splitting_method == "Custom splitting": text_chunks = split_text(preprocessed_text, max_length=max_chars_per_segment) print(f"Text split into {len(text_chunks)} segments (max {max_chars_per_segment} characters per segment)") if not text_chunks: return None, "Error: The text could not be split into segments", preprocessed_text outputs_wav_list = [] for i, chunk in enumerate(text_chunks): print(f"Processing segment {i+1}/{len(text_chunks)}: {chunk}") chunk_output = model.synthesize( chunk, config, speaker_wav=reference_audio_path, language=language, temperature=temperature, do_sample=do_sample, speed=speed, enable_text_splitting=True, repetition_penalty=repetition_penalty, length_penalty=length_penalty, gpt_cond_len=gpt_cond_len, top_k=top_k, top_p=top_p ) outputs_wav_list.append(chunk_output["wav"]) if outputs_wav_list: outputs_wav = np.concatenate(outputs_wav_list) else: return None, "Error: No audio segment could be generated", preprocessed_text else: # Always enable native XTTS splitting by default for better AI agent compatibility use_native_splitting = True if text_splitting_method == "No splitting": use_native_splitting = False print("Native XTTS splitting disabled by user request") elif len(preprocessed_text) > 150: print("Long text detected: native XTTS splitting is enabled") use_native_splitting = True print(f"Generating with 
parameters: temperature={temperature}, do_sample={do_sample}, repetition_penalty={repetition_penalty}, length_penalty={length_penalty}, top_k={top_k}, top_p={top_p}, enable_text_splitting={use_native_splitting}") outputs = model.synthesize( preprocessed_text, config, speaker_wav=reference_audio_path, language=language, temperature=temperature, do_sample=do_sample, speed=speed, enable_text_splitting=use_native_splitting, repetition_penalty=repetition_penalty, length_penalty=length_penalty, gpt_cond_len=gpt_cond_len, top_k=top_k, top_p=top_p ) outputs_wav = outputs["wav"] except Exception as e: error_message = f"Error during audio generation: {str(e)}" print(error_message) error_str = str(e) if "Chinese requires: pypinyin" in error_str: error_message = "Error: Missing pypinyin package for Chinese language support.\n\nPlease run: pip install pypinyin" elif "No module named 'cutlet'" in error_str: error_message = "Error: Missing cutlet package for Japanese language support.\n\nPlease run: pip install cutlet" elif "Japanese requires: fugashi" in error_str: error_message = "Error: Missing fugashi package for Japanese language support.\n\nPlease run: pip install fugashi mecab-python3 unidic-lite" elif "Japanese requires: unidic-lite" in error_str: error_message = "Error: Missing unidic-lite package for Japanese language support.\n\nPlease run: pip install unidic-lite" elif "Failed initializing MeCab" in error_str or "no such file or directory: /usr/local/etc/mecabrc" in error_str: error_message = """Error: MeCab initialization failed for Japanese language support. Please run: pip install fugashi mecab-python3 unidic-lite If the error persists, you may need to install MeCab dictionaries: - For Ubuntu/Debian: sudo apt-get install mecab mecab-ipadic - For macOS with Homebrew: brew install mecab mecab-ipadic """ elif "Korean requires: hangul_romanize" in error_str: error_message = "Error: Missing hangul-romanize package for Korean language support.\n\nPlease run: pip install hangul-romanize" return None, error_message, preprocessed_text # Part 4: Audio Post-Processing try: temp_audio_path = str(TEMP_DIR / f"temp_chunk_audio_{uuid.uuid4()}.wav") torchaudio.save(temp_audio_path, torch.tensor(outputs_wav).unsqueeze(0), 24000) audio_segment = AudioSegment.from_wav(temp_audio_path) # Normalisation du volume de manière moins agressive target_dbfs = -18.0 current_dbfs = audio_segment.dBFS if current_dbfs < -50: delta_db = -18.0 - current_dbfs delta_db = min(delta_db, 20.0) audio_segment = audio_segment.apply_gain(delta_db) else: delta_db = target_dbfs - current_dbfs audio_segment = audio_segment.apply_gain(delta_db) combined_audio = audio_segment # Suppression des silences si activée if remove_silence_enabled: padding = AudioSegment.silent(duration=500, frame_rate=combined_audio.frame_rate) padded_audio = padding + combined_audio + padding processed_audio = remove_silence( padded_audio, silence_thresh=silence_threshold, min_silence_len=min_silence_len, keep_silence=keep_silence ) if len(processed_audio) > len(combined_audio) + 900: trim_length = min(500, len(processed_audio) // 10) combined_audio = processed_audio[trim_length:-trim_length] else: combined_audio = processed_audio timestamp = time.strftime("%Y%m%d-%H%M%S") final_output_path = str(TEMP_DIR / f"temp_output_{timestamp}_{uuid.uuid4()}.mp3") combined_audio.export(final_output_path, format="mp3", bitrate="192k") try: os.remove(temp_audio_path) except: pass return final_output_path, None, preprocessed_text except Exception as e: error_message = 
f"Error during audio processing: {str(e)}" print(error_message) return None, error_message, preprocessed_text def download_audio_from_url(url): """Downloads an audio file from a URL and saves it to a temporary file.""" try: if not url.startswith(('http://', 'https://')): raise ValueError("URL must start with http:// or https://") response = requests.get(url, stream=True, timeout=20) # 20 seconds timeout response.raise_for_status() # Use a temporary file to store the audio with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio: for chunk in response.iter_content(chunk_size=8192): temp_audio.write(chunk) print(f"Audio downloaded from {url} to {temp_audio.name}") return temp_audio.name except (requests.exceptions.RequestException, ValueError) as e: print(f"Failed to download audio from {url}: {e}") return None def voice_clone_synthesis( text: str, reference_audio_url: str = None, example_audio_name: str = None, language: str = "English", temperature: float = 0.75, speed: float = 1.0, do_sample: bool = True, repetition_penalty: float = 5.0, length_penalty: float = 1.0, gpt_cond_len: int = 30, top_k: int = 50, top_p: float = 0.85, remove_silence_enabled: bool = True, silence_threshold: int = -45, min_silence_len: int = 300, keep_silence: int = 100, text_splitting_method: str = "Native XTTS splitting", max_chars_per_segment: int = 250, enable_preprocessing: bool = False ): """ 🎤 Generates speech by cloning a voice from a reference audio URL. This tool takes text and a URL to a reference audio file, and synthesizes the text in the voice from the reference audio. It supports 17 languages and offers advanced control over the generation process. Args: text (str): The text to be synthesized. Required. reference_audio_url (str, optional): A public URL pointing to a reference audio file (WAV or MP3). Provide this OR example_audio_name, but not both. example_audio_name (str, optional): The name of a pre-defined example audio file. Valid choices: 'audio_1.wav', 'audio_2.wav', 'audio_3.wav', 'audio_4.wav', 'audio_5.wav', 'guzel_ses.wav', 'guzel_ses_rapide.wav'. Provide this OR reference_audio_url, but not both. language (str): The language of the text. Defaults to "English". Supported languages: English, French, Spanish, German, Italian, Portuguese, Polish, Turkish, Russian, Dutch, Czech, Arabic, Chinese, Japanese, Korean, Hungarian, Hindi. temperature (float): Controls the randomness of the output. Higher values make it more random. Range: 0.1-1.5. Default: 0.75. Recommended: 0.75 for balanced output. speed (float): The speed of the generated speech. Range: 0.5-2.0. Default: 1.0. Example: 0.8 = slower, 1.2 = faster. do_sample (bool): Whether to use sampling for generation. Recommended: True. Default: True. repetition_penalty (float): Penalty for repeating words or phrases. IMPORTANT: Must be > 1.0. Range: 1.0-5.0. Default: 5.0. Higher values reduce repetition. AI agents should use values like 1.1, 1.5, 2.0, 3.0, 4.0, 5.0. length_penalty (float): Penalty for sentence length. IMPORTANT: Must be > 1.0. Range: 1.0-2.0. Default: 1.0. Higher values encourage shorter sentences. gpt_cond_len (int): Conditioning length for the GPT model. Range: 10-50. Default: 30. Higher values use more context. top_k (int): Top-K sampling parameter. 0 to disable top-k. Range: 0-50. Default: 50. Lower values make output more focused. top_p (float): Top-P (nucleus) sampling parameter. 0.0 to disable top-p. Range: 0.0-1.0. Default: 0.85. Lower values make output more focused. 
remove_silence_enabled (bool): Enable/disable automatic silence removal. Default: True. silence_threshold (int): Silence threshold in dB for silence detection. Range: -60 to -20. Default: -45. More negative = more sensitive to silence. min_silence_len (int): Minimum length of silence to be removed in milliseconds. Range: 300-1000. Default: 300. keep_silence (int): Amount of silence to keep at the beginning/end in milliseconds. Range: 100-500. Default: 100. text_splitting_method (str): Method for splitting text. Valid choices: 'Native XTTS splitting', 'Custom splitting', 'No splitting'. Default: 'Native XTTS splitting'. Recommended for most use cases. max_chars_per_segment (int): Max characters per segment when using 'Custom splitting'. Range: 50-400. Default: 250. Only relevant when text_splitting_method = 'Custom splitting'. enable_preprocessing (bool): Enable automatic text preprocessing to clean problematic characters. Default: False. Recommended: True for better pronunciation. Returns: str: A URL to the generated MP3 audio file. Examples: Basic usage with example audio: voice_clone_synthesis( text="Hello world!", example_audio_name="audio_1.wav", language="English" ) Advanced usage with custom parameters: voice_clone_synthesis( text="Bonjour le monde!", example_audio_name="audio_2.wav", language="French", temperature=0.8, speed=1.1, repetition_penalty=2.0, # Note: > 1.0 required length_penalty=1.2, # Note: > 1.0 required enable_preprocessing=True ) Raises: gr.Error: If parameters are out of range or invalid combinations are used. """ # Validate and convert parameter types early for better AI agent feedback temperature = float(temperature) speed = float(speed) repetition_penalty = float(repetition_penalty) length_penalty = float(length_penalty) gpt_cond_len = int(gpt_cond_len) top_k = int(top_k) top_p = float(top_p) silence_threshold = int(silence_threshold) min_silence_len = int(min_silence_len) keep_silence = int(keep_silence) max_chars_per_segment = int(max_chars_per_segment) if not (0.1 <= temperature <= 1.5): raise gr.Error(f"Temperature must be between 0.1 and 1.5, got {temperature}") if not (0.5 <= speed <= 2.0): raise gr.Error(f"Speed must be between 0.5 and 2.0, got {speed}") if not (1.0 <= repetition_penalty <= 5.0): raise gr.Error(f"Repetition penalty must be between 1.0 and 5.0, got {repetition_penalty}") if not (1.0 <= length_penalty <= 2.0): raise gr.Error(f"Length penalty must be between 1.0 and 2.0, got {length_penalty}") if not (10 <= gpt_cond_len <= 50): raise gr.Error(f"GPT conditioning length must be between 10 and 50, got {gpt_cond_len}") if not (0 <= top_k <= 50): raise gr.Error(f"Top-K must be between 0 and 50, got {top_k}") if not (0.0 <= top_p <= 1.0): raise gr.Error(f"Top-P must be between 0.0 and 1.0, got {top_p}") if not (-60 <= silence_threshold <= -20): raise gr.Error(f"Silence threshold must be between -60 and -20 dB, got {silence_threshold}") if not (300 <= min_silence_len <= 1000): raise gr.Error(f"Minimum silence length must be between 300 and 1000 ms, got {min_silence_len}") if not (100 <= keep_silence <= 500): raise gr.Error(f"Keep silence must be between 100 and 500 ms, got {keep_silence}") if not (50 <= max_chars_per_segment <= 400): raise gr.Error(f"Max characters per segment must be between 50 and 400, got {max_chars_per_segment}") valid_splitting_methods = ["Native XTTS splitting", "Custom splitting", "No splitting"] if text_splitting_method not in valid_splitting_methods: raise gr.Error(f"Text splitting method must be one of 
{valid_splitting_methods}, got '{text_splitting_method}'") valid_example_audios = ["audio_1.wav", "audio_2.wav", "audio_3.wav", "audio_4.wav", "audio_5.wav", "guzel_ses.wav", "guzel_ses_rapide.wav"] if example_audio_name and example_audio_name not in valid_example_audios: raise gr.Error(f"Example audio name must be one of {valid_example_audios}, got '{example_audio_name}'") reference_audio_path = None downloaded_path = None # To keep track of downloaded file for cleanup # Ensure only one reference type is provided if reference_audio_url and example_audio_name: raise gr.Error("Please provide either 'reference_audio_url' or 'example_audio_name', but not both.") if not reference_audio_url and not example_audio_name: raise gr.Error("You must provide either 'reference_audio_url' or 'example_audio_name'.") # Use the example audio if provided if example_audio_name: if example_audio_name not in file_path_mapping: available_files = ", ".join(files_display) raise gr.Error(f"Invalid example audio name. Available files are: {available_files}") reference_audio_path = file_path_mapping[example_audio_name] print(f"Using example audio: {reference_audio_path}") # Otherwise, download from URL if reference_audio_url: print(f"Downloading reference audio from: {reference_audio_url}") downloaded_path = download_audio_from_url(reference_audio_url) if not downloaded_path: raise gr.Error("Failed to download or process the reference audio from the provided URL.") reference_audio_path = downloaded_path # Validate the selected audio file is_valid, error_message = validate_audio_file(reference_audio_path) if not is_valid: if downloaded_path and os.path.exists(downloaded_path): os.remove(downloaded_path) raise gr.Error(error_message) language_code = SUPPORTED_LANGUAGES.get(language) if not language_code: if downloaded_path and os.path.exists(downloaded_path): os.remove(downloaded_path) raise gr.Error(f"Language '{language}' is not supported.") audio_path, error, _ = synthesize_speech( text=text, language=language_code, temperature=temperature, speed=speed, reference_audio=reference_audio_path, do_sample=do_sample, repetition_penalty=repetition_penalty, length_penalty=length_penalty, gpt_cond_len=gpt_cond_len, top_k=top_k, top_p=top_p, remove_silence_enabled=remove_silence_enabled, silence_threshold=silence_threshold, min_silence_len=min_silence_len, keep_silence=keep_silence, text_splitting_method=text_splitting_method, max_chars_per_segment=max_chars_per_segment, enable_preprocessing=enable_preprocessing ) # Clean up downloaded file if it exists if downloaded_path and os.path.exists(downloaded_path): os.remove(downloaded_path) if error: raise gr.Error(error) return audio_path def analyze_text_for_speech(text: str, language: str): """ 📊 Analyzes text for potential pronunciation and synthesis issues. This tool examines text for elements that could be mispronounced by the TTS model, such as special characters, numbers, URLs, and language-specific patterns. It provides a structured report of potential issues. Args: text (str): The text to analyze. Required. language (str): The language of the text. Required. Supported languages: English, French, Spanish, German, Italian, Portuguese, Polish, Turkish, Russian, Dutch, Czech, Arabic, Chinese, Japanese, Korean, Hungarian, Hindi. Note: Use exact language names (case-sensitive). 
Returns: dict: A dictionary containing the analysis results with these keys: - standard_analysis_issues: List of detected issues with descriptions and suggestions - has_issues: Boolean indicating if any issues were found - xtts_cleaned_text: Preprocessed version of the text ready for synthesis Example: analyze_text_for_speech( text="Hello! This costs $15.99 & includes free shipping.", language="English" ) Raises: gr.Error: If the language is not supported. """ language_code = SUPPORTED_LANGUAGES.get(language) if not language_code: raise gr.Error(f"Language '{language}' is not supported.") standard_analysis = analyze_text(text, language_code) # tokenizer_analysis = analyze_with_tokenizer(text, language_code) combined_issues = { "standard_analysis_issues": standard_analysis.get('issues', []), # "tokenizer_analysis_issues": tokenizer_analysis.get('issues', []), "has_issues": standard_analysis.get('has_issues', False), # or tokenizer_analysis.get('has_issues', False), "xtts_cleaned_text": preprocess_text(text, language_code) # tokenizer_analysis.get('cleaned_text', text) } return combined_issues def preprocess_text_for_speech(text: str, language: str): """ 🔧 Preprocesses and cleans text for optimal speech synthesis. This tool applies a series of cleaning and normalization rules to the input text to improve its compatibility with the TTS model. This includes handling numbers, special characters, URLs, and applying language-specific typographical rules. Args: text (str): The text to preprocess. Required. language (str): The language of the text. Required. Supported languages: English, French, Spanish, German, Italian, Portuguese, Polish, Turkish, Russian, Dutch, Czech, Arabic, Chinese, Japanese, Korean, Hungarian, Hindi. Note: Use exact language names (case-sensitive). Returns: str: The cleaned and preprocessed text ready for speech synthesis. Example: preprocess_text_for_speech( text="Visit https://example.com & pay $25.50!", language="English" ) # Returns: "Visit example.com and pay twenty-five dollars and fifty cents!" Raises: gr.Error: If the language is not supported. """ language_code = SUPPORTED_LANGUAGES.get(language) if not language_code: raise gr.Error(f"Language '{language}' is not supported.") return preprocess_text(text, language_code, apply_replacements=True) # Example texts for different languages EXAMPLE_TEXTS = { "fr": "Bonjour, je suis une voix générée par intelligence artificielle. Comment puis-je vous aider aujourd'hui?", "en": "Hello, I am a voice generated by artificial intelligence. How may I assist you today?", "es": "Hola, soy una voz generada por inteligencia artificial. ¿Cómo puedo ayudarte hoy?", "de": "Hallo, ich bin eine von künstlicher Intelligenz generierte Stimme. Wie kann ich Ihnen heute helfen?", "it": "Ciao, sono una voce generata dall'intelligenza artificiale. Come posso aiutarti oggi?", "pt": "Olá, sou uma voz gerada por inteligência artificial. Como posso ajudá-lo hoje?", "ar": "مرحبا، أنا صوت تم إنشاؤه بواسطة الذكاء الاصطناعي. كيف يمكنني مساعدتك اليوم؟", "zh-cn": "你好,我是由人工智能生成的声音。今天我能为您提供什么帮助?", "ja": "こんにちは、私は人工知能によって生成された音声です。今日はどのようにお手伝いできますか?", "ko": "안녕하세요, 저는 인공지능으로 생성된 목소리입니다. 오늘 어떻게 도와드릴까요?", "ru": "Здравствуйте, я голос, сгенерированный искусственным интеллектом. Чем я могу вам помочь сегодня?", "nl": "Hallo, ik ben een stem gegenereerd door kunstmatige intelligentie. Hoe kan ik u vandaag helpen?", "cs": "Dobrý den, jsem hlas vytvořený umělou inteligencí. 
Jak vám mohu dnes pomoci?", "pl": "Dzień dobry, jestem głosem wygenerowanym przez sztuczną inteligencję. Jak mogę ci dziś pomóc?", "tr": "Merhaba, ben yapay zeka tarafından oluşturulan bir sesim. Bugün size nasıl yardımcı olabilirim?", "hu": "Üdvözlöm, én egy mesterséges intelligencia által generált hang vagyok. Hogyan segíthetek ma?", "hi": "नमस्ते, मैं कृत्रिम बुद्धिमत्ता द्वारा उत्पन्न एक आवाज हूं। मैं आज आपकी कैसे मदद कर सकता हूं?" } # Function to analyze text with the XTTS tokenizer def analyze_with_tokenizer(text, language_code): """ Analyzes text using the XTTS model's tokenizer to detect parts that may be problematic for pronunciation. Args: text (str): The text to analyze language_code (str): Language code (fr, en, etc.) Returns: dict: A dictionary containing detected issues and suggestions """ import torch from TTS.tts.layers.xtts.tokenizer import multilingual_cleaners issues = [] original_text = text try: # 1. Run the same preprocessing as the XTTS model uses internally cleaned_text = text print(f"Using XTTS cleaners for language: {language_code}") # The multilingual_cleaners object is a dictionary mapping language codes to cleaner functions. if language_code in multilingual_cleaners: cleaner_fn = multilingual_cleaners[language_code] cleaned_text = cleaner_fn(text) else: # If no specific cleaner is available, we just use the original text. # The TTS model will use its default basic cleaners internally. print(f"No specific cleaner for language {language_code}, using original text for analysis.") cleaned_text = text # 2. Tokenize the text as XTTS would # Compare the original and cleaned text to detect changes if original_text != cleaned_text: # Find the parts that have been modified import difflib # Create an object to compare the two texts differ = difflib.Differ() diff = list(differ.compare(original_text.split(), cleaned_text.split())) # Find the words that have been removed or changed modified_words = [] for d in diff: if d.startswith('- '): word = d[2:] if len(word) > 1: # Ignore individual characters modified_words.append(word) if modified_words: issues.append({ 'type': 'tokenizer_changes', 'description': 'Words that might be mispronounced', 'instances': modified_words, 'suggestion': 'Consider reformulating these parts or using automatic preprocessing' }) # 3. Check for words out of vocabulary (OOV) using the XTTS tokenizer # This part would require accessing the tokenizer's vocabulary, # which might not be directly accessible. # 4. Check for rare words that might be mispronounced words = text.split() long_words = [w for w in words if len(w) > 12] # Extremely long words if long_words: issues.append({ 'type': 'long_words', 'description': 'Extremely long words that might be mispronounced', 'instances': long_words, 'suggestion': 'Check if these words are pronounced correctly, try splitting them or reformulating' }) # 5. 
Check for special characters that are preserved after cleaning import re special_chars = re.findall(r'[^a-zA-Z0-9\s.,;:!?\'"-]', cleaned_text) if special_chars: unique_special_chars = list(set(special_chars)) issues.append({ 'type': 'special_chars_preserved', 'description': 'Special characters preserved by the tokenizer', 'instances': unique_special_chars, 'suggestion': 'These characters might cause pronunciation issues' }) return { 'issues': issues, 'has_issues': len(issues) > 0, 'cleaned_text': cleaned_text } except Exception as e: print(f"Error in tokenizer analysis: {e}") return { 'issues': [{ 'type': 'analysis_error', 'description': 'Error during analysis with the tokenizer', 'instances': [str(e)], 'suggestion': 'Technical error, please try again' }], 'has_issues': True, 'cleaned_text': text } # Function to combine both analyses def combined_analysis(text, language): """Perform comprehensive text analysis for optimal voice synthesis quality. This function combines standard text analysis with XTTS tokenizer analysis to detect and report all potential issues that might affect speech synthesis. Args: text: The text to analyze for speech synthesis compatibility language: Language name (English, French, Spanish, German, Italian, Portuguese, Polish, Turkish, Russian, Dutch, Czech, Arabic, Chinese, Hungarian, Korean, Japanese, Hindi) Returns: A tuple containing detailed analysis report and cleaned text ready for synthesis """ language_code = SUPPORTED_LANGUAGES[language] # Run standard analysis standard_analysis = analyze_text(text, language_code) # Run analysis with tokenizer tokenizer_analysis = analyze_with_tokenizer(text, language_code) # Combine results display_text = format_issues_for_display(standard_analysis, language_code, tokenizer_analysis) # Get the preprocessed text (prefer the result from the tokenizer if it exists) cleaned_text = tokenizer_analysis.get('cleaned_text', "") if not cleaned_text or cleaned_text == text: cleaned_text = preprocess_text(text, language_code) if text else "" return display_text, cleaned_text def cleanup_old_files(max_age_minutes=60): """ Optimized: deletes temporary files older than max_age_minutes. This function can be called regularly to prevent accumulation of files. """ try: now = time.time() count_removed = 0 # Clean temporary files for file in TEMP_DIR.glob("*"): if file.is_file(): file_age_minutes = (now - os.path.getmtime(file)) / 60 if file_age_minutes > max_age_minutes: os.remove(file) count_removed += 1 # Clean old output files for file in OUTPUT_DIR.glob("*.mp3"): if file.is_file(): file_age_days = (now - os.path.getmtime(file)) / (24 * 60 * 60) if file_age_days > 7: # Keep one week os.remove(file) count_removed += 1 return count_removed except Exception as e: return 0 # Create interface with Gradio Blocks with gr.Blocks(theme=gr.themes.Ocean(), css=""" .gradio-container { max-width: 1280px !important; margin: auto !important; } #header { display: flex; justify-content: center; align-items: center; padding: 10px 0; } """) as interface: with gr.Row(elem_id="header"): gr.Markdown( """
Bring any voice to life from a 3-second audio sample.