import os
import random
import tempfile
import logging

import torch
from faster_whisper import WhisperModel
# Use the async googletrans client: Translator.translate() is awaitable.
from googletrans import Translator

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def get_random_proxy():
    """Pick a random proxy from proxies.txt next to this module, or None if unavailable."""
    proxies_path = os.path.join(os.path.dirname(__file__), "proxies.txt")
    if not os.path.exists(proxies_path):
        return None
    with open(proxies_path, "r") as f:
        proxies = [line.strip() for line in f if line.strip()]
    if not proxies:
        return None
    return random.choice(proxies)
def build_translator_with_proxy():
    # NOTE: despite the name, this does not currently wire in get_random_proxy();
    # it builds a plain Translator pointed at translate.googleapis.com.
    return Translator(service_urls=['translate.googleapis.com'])
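

# A minimal sketch of how get_random_proxy() could actually be wired into the
# Translator. This is an assumption, not a confirmed API: whether Translator
# accepts a `proxies` keyword, and the exact key/value shape it expects, varies
# across googletrans releases (the 3.x httpx-based client took a proxies
# mapping). Verify against the installed version before relying on this.
def build_translator_with_random_proxy():
    proxy = get_random_proxy()
    if proxy is None:
        # No proxy configured; fall back to the direct client above.
        return build_translator_with_proxy()
    # ASSUMPTION: `proxies` kwarg and URL-string values are accepted here.
    return Translator(service_urls=['translate.googleapis.com'],
                      proxies={'http': proxy, 'https': proxy})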
# Define supported languages (short codes consistent with Whisper/googletrans).
# Note: googletrans uses short codes like 'en', 'hi'; Whisper detects the same codes.
SUPPORTED_LANGUAGES = {
    'hi': {'name': 'Hindi'},
    'en': {'name': 'English'},
    'es': {'name': 'Spanish'},
    'fr': {'name': 'French'},
    'de': {'name': 'German'},
    'ja': {'name': 'Japanese'},
    'ko': {'name': 'Korean'},
    'zh-cn': {'name': 'Chinese (Simplified)'},  # googletrans uses zh-cn
    'ar': {'name': 'Arabic'},
    'ru': {'name': 'Russian'},
    'gu': {'name': 'Gujarati'},
    'mr': {'name': 'Marathi'},
    'kn': {'name': 'Kannada'},
}

# Whisper reports Chinese as 'zh'; alias it to the googletrans 'zh-cn' entry.
if 'zh-cn' in SUPPORTED_LANGUAGES:
    SUPPORTED_LANGUAGES['zh'] = SUPPORTED_LANGUAGES['zh-cn']
# Load the faster-whisper model once at import time.
try:
    device = "cuda" if torch.cuda.is_available() else "cpu"
    logger.info(f"Using device: {device}")
    model_size = "medium"
    model = WhisperModel(
        model_size,
        device=device,
        compute_type="float16" if device == "cuda" else "int8",
        num_workers=8,
    )
    logger.info("Faster-Whisper model loaded successfully.")
except Exception as e:
    logger.error(f"Error loading Faster-Whisper model: {e}")
    model = None
# Initialize the translator
translator = build_translator_with_proxy()
async def process_audio(audio_file_content: bytes, lang1: str, lang2: str):
    """Transcribe audio, detect its language, and translate into the other expected language.

    Returns a dict with detected_language, transcribed_text, and translated_text,
    or a dict with an "error" key on failure.
    """
    if not model:
        return {"error": "Faster-Whisper model not available."}
    if lang1 not in SUPPORTED_LANGUAGES or lang2 not in SUPPORTED_LANGUAGES or lang1 == lang2:
        return {"error": "Invalid or duplicate input languages."}

    temp_audio_path = None
    try:
        # Write the raw bytes to a temporary .wav file for faster-whisper.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
            temp_audio.write(audio_file_content)
            temp_audio_path = temp_audio.name
        logger.info(f"Temporary audio file saved at: {temp_audio_path}")
        # Transcribe with faster-whisper, letting it auto-detect the language.
        segments, info = model.transcribe(temp_audio_path, beam_size=5, language=None)
        detected_lang_code = info.language
        logger.info(f"Detected language: {detected_lang_code}")

        # segments is a generator; consume it once into a single transcript string.
        transcribed_text = " ".join(segment.text for segment in segments)
        logger.info(f"Transcription: {transcribed_text}")
        # Remap commonly confused detections toward the languages the user expects.
        # Bengali/Sanskrit (bn/sa) -> Gujarati if the user expects Gujarati.
        if detected_lang_code in ("bn", "sa") and "gu" in (lang1, lang2):
            logger.info(f"Mapping detected language '{detected_lang_code}' to 'gu' for Gujarati support.")
            detected_lang_code = "gu"
        # Portuguese (pt) -> Marathi if the user expects Marathi.
        if detected_lang_code == "pt" and "mr" in (lang1, lang2):
            logger.info("Mapping detected language 'pt' to 'mr' for Marathi support.")
            detected_lang_code = "mr"
        # Urdu (ur) -> Hindi if the user expects Hindi.
        if detected_lang_code == "ur" and "hi" in (lang1, lang2):
            logger.info("Mapping detected language 'ur' to 'hi' for Hindi support.")
            detected_lang_code = "hi"
        # Hindi (hi) -> Marathi if the user expects Marathi and Devanagari script is present.
        if detected_lang_code == "hi" and "mr" in (lang1, lang2):
            devanagari_count = sum('\u0900' <= c <= '\u097F' for c in transcribed_text)
            if devanagari_count > 0:
                logger.info("Mapping detected language 'hi' to 'mr' for Marathi support (Devanagari script detected).")
                detected_lang_code = "mr"
        if detected_lang_code not in SUPPORTED_LANGUAGES:
            return {"error": f"Detected language '{detected_lang_code}' is not supported."}
        if detected_lang_code not in (lang1, lang2):
            return {"error": f"Detected language '{detected_lang_code}' was not one of the expected languages: {lang1} or {lang2}."}
        # Optional forced-Hindi fallback: if the transcript for detected Hindi is
        # mostly Latin script, retry decoding with the language pinned to 'hi'.
        if detected_lang_code == "hi":
            latin_count = sum('a' <= c.lower() <= 'z' for c in transcribed_text)
            devanagari_count = sum('\u0900' <= c <= '\u097F' for c in transcribed_text)
            if latin_count > devanagari_count:
                logger.info("Transcription appears to be in Latin script, retrying with forced Hindi decoding.")
                segments, _ = model.transcribe(temp_audio_path, language="hi", beam_size=5)
                transcribed_text = " ".join(segment.text for segment in segments)
                logger.info(f"Retried Hindi transcription: {transcribed_text}")
        # Force the decode language for Marathi or Gujarati when the script matches
        # an expected language but detection was ambiguous.
        # Marathi: Devanagari script present, user selected 'mr', but Whisper said 'hi'.
        if 'mr' in (lang1, lang2) and detected_lang_code == 'hi':
            devanagari_count = sum('\u0900' <= c <= '\u097F' for c in transcribed_text)
            if devanagari_count > 0:
                logger.info("Detected Devanagari script and Marathi is expected, retrying with forced Marathi decoding.")
                segments, _ = model.transcribe(temp_audio_path, language="mr", beam_size=5)
                transcribed_text = " ".join(segment.text for segment in segments)
                detected_lang_code = 'mr'
                logger.info(f"Retried Marathi transcription: {transcribed_text}")
        # Gujarati: Gujarati script present, user selected 'gu', but detection disagrees.
        if 'gu' in (lang1, lang2) and detected_lang_code != 'gu':
            gujarati_count = sum('\u0A80' <= c <= '\u0AFF' for c in transcribed_text)
            if gujarati_count > 0:
                logger.info("Detected Gujarati script and Gujarati is expected, retrying with forced Gujarati decoding.")
                segments, _ = model.transcribe(temp_audio_path, language="gu", beam_size=5)
                transcribed_text = " ".join(segment.text for segment in segments)
                detected_lang_code = 'gu'
                logger.info(f"Retried Gujarati transcription: {transcribed_text}")
        # Translate into whichever of the two expected languages was not spoken.
        target_lang = lang2 if detected_lang_code == lang1 else lang1
        translated_text = "Translation not applicable."
        if transcribed_text:
            try:
                translation = await translator.translate(transcribed_text, src=detected_lang_code, dest=target_lang)
                if translation and hasattr(translation, 'text'):
                    translated_text = translation.text
                    logger.info(f"Translation to {target_lang}: {translated_text}")
                else:
                    translated_text = "Translation failed (invalid result)."
            except Exception as e:
                logger.error(f"Translation error: {e}", exc_info=True)
                translated_text = f"Translation failed: {e}"
        else:
            translated_text = "Transcription was empty."

        return {
            "detected_language": detected_lang_code,
            "transcribed_text": transcribed_text,
            "translated_text": translated_text,
        }
    except Exception as e:
        logger.error(f"Audio processing error: {e}", exc_info=True)
        return {"error": f"An unexpected error occurred during audio processing: {e}"}
    finally:
        # Always clean up the temporary audio file.
        if temp_audio_path and os.path.exists(temp_audio_path):
            try:
                os.unlink(temp_audio_path)
                logger.info(f"Temporary audio file deleted: {temp_audio_path}")
            except Exception as e:
                logger.error(f"Failed to delete temp file: {e}")