Hindi Support #2
Files changed:
- requirements.txt (+1 -0)
- speech_utils.py (+44 -101)
requirements.txt
CHANGED

```diff
@@ -21,4 +21,5 @@ httpcore==1.0.9
 roboflow==1.1.63
 inference-gpu[yolo-world]==0.48.1 # Commented out due to numpy version conflicts
 git+https://github.com/ultralytics/CLIP.git
+faster-whisper>=1.0.3
 
```
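The new dependency is the CTranslate2-based reimplementation of Whisper. As a quick smoke test before wiring it into speech_utils.py, a minimal sketch (the "tiny" model size and "sample.wav" path are illustrative, not values from this repo):

```python
# Smoke test for the faster-whisper dependency added above.
# "tiny" and "sample.wav" are placeholders, not values from this repo.
from faster_whisper import WhisperModel

model = WhisperModel("tiny", device="cpu", compute_type="int8")
segments, info = model.transcribe("sample.wav")
print(f"Detected language: {info.language} (p={info.language_probability:.2f})")
for segment in segments:
    print(segment.text)
```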
speech_utils.py
CHANGED

```diff
@@ -6,6 +6,7 @@ from googletrans.client import Translator, LANGUAGES
 import logging
 import torch
 import asyncio # Import asyncio for await
+from faster_whisper import WhisperModel
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
```
```diff
@@ -22,14 +23,16 @@ def get_random_proxy():
 
 def build_translator_with_proxy():
     proxy = get_random_proxy()
+    translator = Translator(service_urls=['translate.googleapis.com'])
     if proxy:
         proxy_url = f"http://{proxy}"
-
-
-
-
-
-
+        # Set proxies on the underlying requests session
+        translator.session.proxies = {
+            "http": proxy_url,
+            "https": proxy_url
+        }
+    return translator
+
 # Define supported languages (using short codes consistent with Whisper/googletrans)
 # Note: googletrans uses short codes like 'en', 'hi'. Whisper also detects these.
 SUPPORTED_LANGUAGES = {
```
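For reference, a minimal usage sketch of the rebuilt translator. It assumes googletrans's awaitable translate(), exactly as process_audio() below calls it; the sample string and the direct asyncio.run() driver are illustrative:

```python
# Usage sketch for build_translator_with_proxy(); relies on the awaitable
# translate() call this module already uses. Sample text is illustrative.
import asyncio

async def demo_translate():
    translator = build_translator_with_proxy()
    result = await translator.translate("नमस्ते, आप कैसे हैं?", src="hi", dest="en")
    print(result.text)  # an English rendering of the Hindi greeting

asyncio.run(demo_translate())
```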
```diff
@@ -52,135 +55,77 @@ if 'zh-cn' in SUPPORTED_LANGUAGES:
 SUPPORTED_LANGUAGES['zh'] = SUPPORTED_LANGUAGES['zh-cn']
 
 
-# Load
+# Load faster-whisper model
 try:
-    # Check for CUDA availability
     device = "cuda" if torch.cuda.is_available() else "cpu"
     logger.info(f"Using device: {device}")
-
-    model =
-
+    model_size = "base"
+    model = WhisperModel(model_size, device=device,
+                         compute_type="float16" if device == "cuda" else "int8",
+                         num_workers=8)
+    logger.info("Faster-Whisper model loaded successfully.")
 except Exception as e:
-    logger.error(f"Error loading Whisper model: {e}")
+    logger.error(f"Error loading Faster-Whisper model: {e}")
     model = None
 
 # Initialize the translator
 translator = build_translator_with_proxy()
 
 async def process_audio(audio_file_content: bytes, lang1: str, lang2: str):
-    """
-    Transcribes audio using Whisper, detects the language between lang1 and lang2
-    (if supported), and translates the text to the other language.
-
-    Args:
-        audio_file_content: The byte content of the audio file.
-        lang1: The first possible language code (must be in SUPPORTED_LANGUAGES).
-        lang2: The second possible language code (must be in SUPPORTED_LANGUAGES).
-
-    Returns:
-        A dictionary containing the detected language, transcribed text,
-        and translated text, or an error dictionary if processing fails.
-    """
     if not model:
-
-
-
-
-
-
-        return {"error": f"Input language '{lang1}' is not supported."}
-    if lang2 not in SUPPORTED_LANGUAGES:
-        logger.error(f"Input language '{lang2}' is not supported.")
-        return {"error": f"Input language '{lang2}' is not supported."}
-    if lang1 == lang2:
-        logger.error(f"Input languages cannot be the same: '{lang1}'.")
-        return {"error": f"Input languages cannot be the same: '{lang1}'."}
-
-
-    temp_audio_path = None # Initialize path variable
+        return {"error": "Faster-Whisper model not available."}
+
+    if lang1 not in SUPPORTED_LANGUAGES or lang2 not in SUPPORTED_LANGUAGES or lang1 == lang2:
+        return {"error": "Invalid or duplicate input languages."}
+
+    temp_audio_path = None
     try:
-        # Save
+        # Save temp audio
         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
             temp_audio.write(audio_file_content)
             temp_audio_path = temp_audio.name
             logger.info(f"Temporary audio file saved at: {temp_audio_path}")
 
+        # Transcribe using faster-whisper (auto language detect)
+        segments, info = model.transcribe(temp_audio_path, beam_size=5, language=None)
+        detected_lang_code = info.language
+        logger.info(f"Detected language: {detected_lang_code}")
 
-        # --- Whisper Transcription and Language Detection ---
-        audio = whisper.load_audio(temp_audio_path)
-        audio = whisper.pad_or_trim(audio)
-        mel = whisper.log_mel_spectrogram(audio).to(model.device)
-
-        # Detect the spoken language
-        _, probs = model.detect_language(mel)
-        detected_lang_code = max(probs, key=probs.get)
-        logger.info(f"Whisper detected language code: {detected_lang_code} with probability {probs[detected_lang_code]}")
-
-        # --- Language Validation ---
-        # 1. Check if detected language is broadly supported by this application
         if detected_lang_code not in SUPPORTED_LANGUAGES:
-            logger.error(f"Detected language '{detected_lang_code}' is not supported by this application.")
-            # Clean up before returning
-            if temp_audio_path and os.path.exists(temp_audio_path):
-                os.unlink(temp_audio_path)
-                logger.info(f"Temporary audio file deleted early due to unsupported language: {temp_audio_path}")
             return {"error": f"Detected language '{detected_lang_code}' is not supported."}
-
-        # 2. Check if the detected language is one of the two expected for this specific request
         if detected_lang_code not in [lang1, lang2]:
-
-
-
-
-
-
-
-        # --- Transcription ---
-        # Force Hindi transcription if detected language is Hindi
+            return {"error": f"Detected language '{detected_lang_code}' was not one of the expected languages: {lang1} or {lang2}."}
+
+        # Join all transcribed segments
+        transcribed_text = " ".join([segment.text for segment in segments])
+        logger.info(f"Transcription: {transcribed_text}")
+
+        # Optional forced Hindi fallback
         if detected_lang_code == "hi":
-            options = whisper.DecodingOptions(language="hi", fp16=(device=="cuda"))
-            result = whisper.decode(model, mel, options)
-            transcribed_text = result.text
-            logger.info(f"Transcription (forced Hindi): {transcribed_text}")
-            # If output is mostly Latin, retry with forced Hindi
             latin_count = sum('a' <= c.lower() <= 'z' for c in transcribed_text)
             devanagari_count = sum('\u0900' <= c <= '\u097F' for c in transcribed_text)
             if latin_count > devanagari_count:
-                logger.info("Transcription appears to be in Latin script, retrying with forced Hindi
-
-
-                transcribed_text = result.text
+                logger.info("Transcription appears to be in Latin script, retrying with forced Hindi decoding.")
+                segments, _ = model.transcribe(temp_audio_path, language="hi", beam_size=5)
+                transcribed_text = " ".join([segment.text for segment in segments])
                 logger.info(f"Retried Hindi transcription: {transcribed_text}")
-        else:
-            options = whisper.DecodingOptions(language=detected_lang_code, fp16=(device=="cuda"))
-            result = whisper.decode(model, mel, options)
-            transcribed_text = result.text
-            logger.info(f"Transcription: {transcribed_text}")
 
-        #
+        # Translate
         target_lang = lang2 if detected_lang_code == lang1 else lang1
-
-
-        # --- Translation ---
-        translated_text = "Translation not applicable or failed."
+        translated_text = "Translation not applicable."
         if transcribed_text:
             try:
-                # *** Use await for the async translate function ***
                 translation = await translator.translate(transcribed_text, src=detected_lang_code, dest=target_lang)
-                # Check if translation object is valid before accessing .text
                 if translation and hasattr(translation, 'text'):
                     translated_text = translation.text
                     logger.info(f"Translation to {target_lang}: {translated_text}")
                 else:
-
-                    translated_text = "Translation failed (invalid result)."
+                    translated_text = "Translation failed (invalid result)."
             except Exception as e:
-                logger.error(f"
+                logger.error(f"Translation error: {e}", exc_info=True)
                 translated_text = f"Translation failed: {e}"
         else:
-
-            translated_text = "Transcription was empty." # Provide clearer status
-
+            translated_text = "Transcription was empty."
 
         return {
             "detected_language": detected_lang_code,
```
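The retry logic in the hunk above hinges on a simple script heuristic: count Latin versus Devanagari characters and re-decode with language="hi" when Latin dominates. A standalone sketch of the same idea (the function name looks_romanized is illustrative, not from this module):

```python
# Same heuristic as the diff above: a nominally-Hindi transcription with more
# Latin letters than Devanagari characters is treated as mis-decoded.
# `looks_romanized` is an illustrative name, not code from this repo.
def looks_romanized(text: str) -> bool:
    latin = sum('a' <= c.lower() <= 'z' for c in text)
    devanagari = sum('\u0900' <= c <= '\u097F' for c in text)
    return latin > devanagari

print(looks_romanized("namaste duniya"))  # True  -> retry with language="hi"
print(looks_romanized("नमस्ते दुनिया"))   # False -> keep the transcription
```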
```diff
@@ -189,15 +134,13 @@ async def process_audio(audio_file_content: bytes, lang1: str, lang2: str):
         }
 
     except Exception as e:
-        logger.error(f"
-        # Ensure error message is propagated
+        logger.error(f"Audio processing error: {e}", exc_info=True)
         return {"error": f"An unexpected error occurred during audio processing: {e}"}
     finally:
-        # Clean up the temporary file
         if temp_audio_path and os.path.exists(temp_audio_path):
             try:
                 os.unlink(temp_audio_path)
                 logger.info(f"Temporary audio file deleted: {temp_audio_path}")
             except Exception as e:
-                logger.error(f"
+                logger.error(f"Failed to delete temp file: {e}")
 
```
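Taken together, the rewritten process_audio() can be exercised end to end. A minimal driver sketch, assuming the module is importable and a local WAV file exists ("hindi_sample.wav" is a placeholder):

```python
# Illustrative driver for process_audio(); "hindi_sample.wav" is a placeholder.
import asyncio

from speech_utils import process_audio

async def main():
    with open("hindi_sample.wav", "rb") as f:
        audio_bytes = f.read()
    result = await process_audio(audio_bytes, lang1="hi", lang2="en")
    # On success the dict includes "detected_language" plus the
    # transcription/translation fields; on failure it carries "error".
    print(result)

asyncio.run(main())
```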