Nishur committed on
Commit
45248e1
·
verified ·
1 Parent(s): af655e7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -25
app.py CHANGED
@@ -5,7 +5,7 @@ import torch
5
  from TTS.api import TTS
6
  from deep_translator import GoogleTranslator
7
  import pysrt
8
- import whisper # Free speech-to-text
9
  import webvtt
10
  import shutil
11
  import time
@@ -37,20 +37,38 @@ SUBTITLE_STYLES = {
37
  "Black Background": "background-color: black; padding: 5px;"
38
  }
39
 
40
- # Create output directory
41
  OUTPUT_DIR = "outputs"
42
  os.makedirs(OUTPUT_DIR, exist_ok=True)
43
 
44
- # Initialize TTS
45
  device = "cuda" if torch.cuda.is_available() else "cpu"
46
- tts_models = {
47
- "en": TTS("tts_models/en/ljspeech/tacotron2-DDC").to(device),
48
- "es": TTS("tts_models/es/css10/vits").to(device),
49
- "fr": TTS("tts_models/fr/css10/vits").to(device),
50
- "de": TTS("tts_models/de/thorsten/tacotron2-DDC").to(device),
51
- "ja": TTS("tts_models/ja/kokoro/tacotron2-DDC").to(device),
52
- "hi": TTS("tts_models/hi/kb/tacotron2-DDC").to(device)
53
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
  # Initialize Whisper (load when needed)
56
  whisper_model = None
@@ -69,7 +87,7 @@ def extract_audio(video_path: str) -> str:
69
  '-acodec', 'pcm_s16le', '-ar', '16000',
70
  '-ac', '1', '-y', audio_path
71
  ]
72
- subprocess.run(cmd, check=True)
73
  return audio_path
74
 
75
  def transcribe_with_whisper(audio_path: str, language: str = None) -> str:
@@ -158,7 +176,10 @@ def generate_translated_audio(
158
 
159
  audio_files = []
160
  timings = []
161
- tts = tts_models.get(target_lang)
 
 
 
162
 
163
  for i, sub in enumerate(tqdm(subs, desc=f"Generating {target_lang} audio")):
164
  text = sub.text.strip()
@@ -185,7 +206,7 @@ def generate_translated_audio(
185
  subprocess.run([
186
  'ffmpeg', '-f', 'lavfi', '-i', 'anullsrc=r=44100:cl=stereo',
187
  '-t', str(video_duration), '-y', silence_file
188
- ], check=True)
189
 
190
  # Mix audio
191
  filter_complex = "[0:a]" + "".join(
@@ -200,7 +221,7 @@ def generate_translated_audio(
200
  '-map', '[aout]',
201
  os.path.join(OUTPUT_DIR, f"translated_audio_{target_lang}.wav")]
202
 
203
- subprocess.run(' '.join(cmd), shell=True, check=True)
204
  shutil.rmtree(temp_dir)
205
  return os.path.join(OUTPUT_DIR, f"translated_audio_{target_lang}.wav")
206
 
@@ -329,7 +350,7 @@ def process_video(
329
  'ffmpeg', '-i', base_video, '-i', translated_audio,
330
  '-map', '0:v', '-map', '1:a', '-c:v', 'copy', '-c:a', 'aac',
331
  '-y', output_video
332
- ], check=True)
333
 
334
  # 7. Create HTML player
335
  progress(0.9, "Creating HTML player...")
@@ -450,12 +471,10 @@ def create_interface():
450
  return demo
451
 
452
  if __name__ == "__main__":
453
- # Check requirements
454
- try:
455
- subprocess.run(["ffmpeg", "-version"], check=True)
456
- import torch, whisper
457
- demo = create_interface()
458
- demo.launch()
459
- except Exception as e:
460
- print(f"Error: {str(e)}")
461
- print("Please install all requirements: pip install -r requirements.txt")
 
5
  from TTS.api import TTS
6
  from deep_translator import GoogleTranslator
7
  import pysrt
8
+ import whisper
9
  import webvtt
10
  import shutil
11
  import time
 
37
  "Black Background": "background-color: black; padding: 5px;"
38
  }
39
 
40
+ # Create output directory (relative path for Spaces)
41
  OUTPUT_DIR = "outputs"
42
  os.makedirs(OUTPUT_DIR, exist_ok=True)
43
 
44
# TTS initialization — models are loaded lazily with error handling.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Cache of language code -> loaded TTS model (None marks a failed load).
tts_models = {}

def load_tts_model(model_name: str, lang_code: str):
    """Load a Coqui TTS model onto the active device.

    Returns the model instance, or None when loading fails — the error is
    logged instead of raised so one broken voice doesn't take down the app.
    ``lang_code`` is accepted for interface stability but not read here.
    """
    try:
        model = TTS(model_name).to(device)
        # Switch to the gruut phonemizer when the config exposes one
        # (espeak may be absent in the hosting environment).
        if hasattr(model.synthesizer, 'tts_config'):
            model.synthesizer.tts_config.phonemizer = "gruut"
        return model
    except Exception as e:
        # NOTE(review): `logger` is assumed to be configured earlier in the file.
        logger.error(f"Failed to load {model_name}: {str(e)}")
        return None
58
+
59
# Initialize models only when needed
def get_tts_model(lang_code: str):
    """Return the cached TTS model for *lang_code*, loading it on first use.

    Returns None when the language is unsupported or the model failed to
    load; callers already treat None as "model unavailable".

    Fix: the previous version indexed the map directly, so an unsupported
    language code raised an unhandled KeyError instead of the None path.
    """
    if lang_code not in tts_models:
        model_map = {
            "en": "tts_models/en/ljspeech/tacotron2-DDC",
            "es": "tts_models/es/css10/vits",
            "fr": "tts_models/fr/css10/vits",
            "de": "tts_models/de/thorsten/vits",  # Using VITS instead of tacotron2
            "ja": "tts_models/ja/kokoro/tacotron2-DDC",
            "hi": "tts_models/hi/kb/tacotron2-DDC"
        }
        model_name = model_map.get(lang_code)
        if model_name is None:
            # Unsupported language: cache the miss so we don't re-check,
            # and let the caller's existing None handling report it.
            tts_models[lang_code] = None
        else:
            tts_models[lang_code] = load_tts_model(model_name, lang_code)
    return tts_models[lang_code]
72
 
73
  # Initialize Whisper (load when needed)
74
  whisper_model = None
 
87
  '-acodec', 'pcm_s16le', '-ar', '16000',
88
  '-ac', '1', '-y', audio_path
89
  ]
90
+ subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
91
  return audio_path
92
 
93
  def transcribe_with_whisper(audio_path: str, language: str = None) -> str:
 
176
 
177
  audio_files = []
178
  timings = []
179
+ tts = get_tts_model(target_lang)
180
+
181
+ if tts is None:
182
+ raise Exception(f"TTS model for {target_lang} not available")
183
 
184
  for i, sub in enumerate(tqdm(subs, desc=f"Generating {target_lang} audio")):
185
  text = sub.text.strip()
 
206
  subprocess.run([
207
  'ffmpeg', '-f', 'lavfi', '-i', 'anullsrc=r=44100:cl=stereo',
208
  '-t', str(video_duration), '-y', silence_file
209
+ ], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
210
 
211
  # Mix audio
212
  filter_complex = "[0:a]" + "".join(
 
221
  '-map', '[aout]',
222
  os.path.join(OUTPUT_DIR, f"translated_audio_{target_lang}.wav")]
223
 
224
+ subprocess.run(' '.join(cmd), shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
225
  shutil.rmtree(temp_dir)
226
  return os.path.join(OUTPUT_DIR, f"translated_audio_{target_lang}.wav")
227
 
 
350
  'ffmpeg', '-i', base_video, '-i', translated_audio,
351
  '-map', '0:v', '-map', '1:a', '-c:v', 'copy', '-c:a', 'aac',
352
  '-y', output_video
353
+ ], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
354
 
355
  # 7. Create HTML player
356
  progress(0.9, "Creating HTML player...")
 
471
  return demo
472
 
473
if __name__ == "__main__":
    # Start every run with a fresh outputs directory so artifacts from a
    # previous session are never served again.
    if os.path.exists(OUTPUT_DIR):
        shutil.rmtree(OUTPUT_DIR)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    demo = create_interface()
    # NOTE(review): original comment says share=True is required for Hugging
    # Face Spaces — Spaces typically ignores share links; confirm if needed.
    demo.launch(share=True)