import os
import random
import tempfile
import logging

import torch
from faster_whisper import WhisperModel
# Use the async googletrans client: Translator.translate() is awaitable.
from googletrans import Translator

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def get_random_proxy():
    """Pick a random proxy from proxies.txt next to this module, or None if unavailable."""
    proxies_path = os.path.join(os.path.dirname(__file__), "proxies.txt")
    if not os.path.exists(proxies_path):
        return None
    with open(proxies_path, "r") as f:
        proxies = [line.strip() for line in f if line.strip()]
    if not proxies:
        return None
    return random.choice(proxies)
def build_translator_with_proxy():
    # NOTE: despite the name, this does not currently wire in get_random_proxy();
    # it builds a plain Translator pointed at translate.googleapis.com.
    return Translator(service_urls=['translate.googleapis.com'])
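

# A minimal sketch of how get_random_proxy() could actually be wired into the
# Translator. This is an assumption, not a confirmed API: whether Translator
# accepts a `proxies` keyword, and the exact key/value shape it expects, varies
# across googletrans releases (the 3.x httpx-based client took a proxies
# mapping). Verify against the installed version before relying on this.
def build_translator_with_random_proxy():
    proxy = get_random_proxy()
    if proxy is None:
        # No proxy configured; fall back to the direct client above.
        return build_translator_with_proxy()
    # ASSUMPTION: `proxies` kwarg and URL-string values are accepted here.
    return Translator(service_urls=['translate.googleapis.com'],
                      proxies={'http': proxy, 'https': proxy})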
# Define supported languages (short codes consistent with Whisper/googletrans).
# Note: googletrans uses short codes like 'en', 'hi'; Whisper detects the same codes.
SUPPORTED_LANGUAGES = {
    'hi': {'name': 'Hindi'},
    'en': {'name': 'English'},
    'es': {'name': 'Spanish'},
    'fr': {'name': 'French'},
    'de': {'name': 'German'},
    'ja': {'name': 'Japanese'},
    'ko': {'name': 'Korean'},
    'zh-cn': {'name': 'Chinese (Simplified)'},  # googletrans uses zh-cn
    'ar': {'name': 'Arabic'},
    'ru': {'name': 'Russian'},
    'gu': {'name': 'Gujarati'},
    'mr': {'name': 'Marathi'},
    'kn': {'name': 'Kannada'},
}

# Whisper reports Chinese as 'zh'; alias it to the googletrans 'zh-cn' entry.
if 'zh-cn' in SUPPORTED_LANGUAGES:
    SUPPORTED_LANGUAGES['zh'] = SUPPORTED_LANGUAGES['zh-cn']
# Load the faster-whisper model once at import time.
try:
    device = "cuda" if torch.cuda.is_available() else "cpu"
    logger.info(f"Using device: {device}")
    model_size = "medium"
    model = WhisperModel(
        model_size,
        device=device,
        compute_type="float16" if device == "cuda" else "int8",
        num_workers=8,
    )
    logger.info("Faster-Whisper model loaded successfully.")
except Exception as e:
    logger.error(f"Error loading Faster-Whisper model: {e}")
    model = None
# Initialize the translator
translator = build_translator_with_proxy()
async def process_audio(audio_file_content: bytes, lang1: str, lang2: str):
    """Transcribe audio, detect its language, and translate into the other expected language.

    Returns a dict with detected_language, transcribed_text, and translated_text,
    or a dict with an "error" key on failure.
    """
    if not model:
        return {"error": "Faster-Whisper model not available."}
    if lang1 not in SUPPORTED_LANGUAGES or lang2 not in SUPPORTED_LANGUAGES or lang1 == lang2:
        return {"error": "Invalid or duplicate input languages."}

    temp_audio_path = None
    try:
        # Write the raw bytes to a temporary .wav file for faster-whisper.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
            temp_audio.write(audio_file_content)
            temp_audio_path = temp_audio.name
        logger.info(f"Temporary audio file saved at: {temp_audio_path}")
        # Transcribe with faster-whisper, letting it auto-detect the language.
        segments, info = model.transcribe(temp_audio_path, beam_size=5, language=None)
        detected_lang_code = info.language
        logger.info(f"Detected language: {detected_lang_code}")

        # segments is a generator; consume it once into a single transcript string.
        transcribed_text = " ".join(segment.text for segment in segments)
        logger.info(f"Transcription: {transcribed_text}")
        # Remap commonly confused detections toward the languages the user expects.
        # Bengali/Sanskrit (bn/sa) -> Gujarati if the user expects Gujarati.
        if detected_lang_code in ("bn", "sa") and "gu" in (lang1, lang2):
            logger.info(f"Mapping detected language '{detected_lang_code}' to 'gu' for Gujarati support.")
            detected_lang_code = "gu"
        # Portuguese (pt) -> Marathi if the user expects Marathi.
        if detected_lang_code == "pt" and "mr" in (lang1, lang2):
            logger.info("Mapping detected language 'pt' to 'mr' for Marathi support.")
            detected_lang_code = "mr"
        # Urdu (ur) -> Hindi if the user expects Hindi.
        if detected_lang_code == "ur" and "hi" in (lang1, lang2):
            logger.info("Mapping detected language 'ur' to 'hi' for Hindi support.")
            detected_lang_code = "hi"
        # Hindi (hi) -> Marathi if the user expects Marathi and Devanagari script is present.
        if detected_lang_code == "hi" and "mr" in (lang1, lang2):
            devanagari_count = sum('\u0900' <= c <= '\u097F' for c in transcribed_text)
            if devanagari_count > 0:
                logger.info("Mapping detected language 'hi' to 'mr' for Marathi support (Devanagari script detected).")
                detected_lang_code = "mr"
        if detected_lang_code not in SUPPORTED_LANGUAGES:
            return {"error": f"Detected language '{detected_lang_code}' is not supported."}
        if detected_lang_code not in (lang1, lang2):
            return {"error": f"Detected language '{detected_lang_code}' was not one of the expected languages: {lang1} or {lang2}."}
        # Optional forced-Hindi fallback: if the transcript for detected Hindi is
        # mostly Latin script, retry decoding with the language pinned to 'hi'.
        if detected_lang_code == "hi":
            latin_count = sum('a' <= c.lower() <= 'z' for c in transcribed_text)
            devanagari_count = sum('\u0900' <= c <= '\u097F' for c in transcribed_text)
            if latin_count > devanagari_count:
                logger.info("Transcription appears to be in Latin script, retrying with forced Hindi decoding.")
                segments, _ = model.transcribe(temp_audio_path, language="hi", beam_size=5)
                transcribed_text = " ".join(segment.text for segment in segments)
                logger.info(f"Retried Hindi transcription: {transcribed_text}")
        # Force the decode language for Marathi or Gujarati when the script matches
        # an expected language but detection was ambiguous.
        # Marathi: Devanagari script present, user selected 'mr', but Whisper said 'hi'.
        if 'mr' in (lang1, lang2) and detected_lang_code == 'hi':
            devanagari_count = sum('\u0900' <= c <= '\u097F' for c in transcribed_text)
            if devanagari_count > 0:
                logger.info("Detected Devanagari script and Marathi is expected, retrying with forced Marathi decoding.")
                segments, _ = model.transcribe(temp_audio_path, language="mr", beam_size=5)
                transcribed_text = " ".join(segment.text for segment in segments)
                detected_lang_code = 'mr'
                logger.info(f"Retried Marathi transcription: {transcribed_text}")
        # Gujarati: Gujarati script present, user selected 'gu', but detection disagrees.
        if 'gu' in (lang1, lang2) and detected_lang_code != 'gu':
            gujarati_count = sum('\u0A80' <= c <= '\u0AFF' for c in transcribed_text)
            if gujarati_count > 0:
                logger.info("Detected Gujarati script and Gujarati is expected, retrying with forced Gujarati decoding.")
                segments, _ = model.transcribe(temp_audio_path, language="gu", beam_size=5)
                transcribed_text = " ".join(segment.text for segment in segments)
                detected_lang_code = 'gu'
                logger.info(f"Retried Gujarati transcription: {transcribed_text}")
        # Translate into whichever of the two expected languages was not spoken.
        target_lang = lang2 if detected_lang_code == lang1 else lang1
        translated_text = "Translation not applicable."
        if transcribed_text:
            try:
                translation = await translator.translate(transcribed_text, src=detected_lang_code, dest=target_lang)
                if translation and hasattr(translation, 'text'):
                    translated_text = translation.text
                    logger.info(f"Translation to {target_lang}: {translated_text}")
                else:
                    translated_text = "Translation failed (invalid result)."
            except Exception as e:
                logger.error(f"Translation error: {e}", exc_info=True)
                translated_text = f"Translation failed: {e}"
        else:
            translated_text = "Transcription was empty."

        return {
            "detected_language": detected_lang_code,
            "transcribed_text": transcribed_text,
            "translated_text": translated_text,
        }
    except Exception as e:
        logger.error(f"Audio processing error: {e}", exc_info=True)
        return {"error": f"An unexpected error occurred during audio processing: {e}"}
    finally:
        # Always clean up the temporary audio file.
        if temp_audio_path and os.path.exists(temp_audio_path):
            try:
                os.unlink(temp_audio_path)
                logger.info(f"Temporary audio file deleted: {temp_audio_path}")
            except Exception as e:
                logger.error(f"Failed to delete temp file: {e}")