MonilM committed
Commit 56f6115 · 1 Parent(s): 5546785

Hindi Support #2

Files changed (2):
  1. requirements.txt +1 -0
  2. speech_utils.py +44 -101
requirements.txt CHANGED
@@ -21,4 +21,5 @@ httpcore==1.0.9
 roboflow==1.1.63
 inference-gpu[yolo-world]==0.48.1 # Commented out due to numpy version conflicts
 git+https://github.com/ultralytics/CLIP.git
+faster-whisper>=1.0.3
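As a quick sanity check of the new dependency, the snippet below loads the same "base" model the code uses and transcribes a short clip. This is a sketch, not part of the commit: sample.wav is a hypothetical test file, and note that faster-whisper's transcribe() reports the detected language up front but returns the segments as a lazy generator, so no decoding happens until the generator is consumed.

# smoke test for faster-whisper (hypothetical file name; not part of this commit)
from faster_whisper import WhisperModel

# mirror speech_utils.py: "base" model, int8 compute type on CPU
model = WhisperModel("base", device="cpu", compute_type="int8")

# transcribe() returns (lazy generator of segments, info); language
# detection results are available immediately on the info object
segments, info = model.transcribe("sample.wav", beam_size=5)
print(info.language, info.language_probability)

# decoding actually runs here, as the generator is consumed
print(" ".join(segment.text.strip() for segment in segments))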
speech_utils.py CHANGED
@@ -6,6 +6,7 @@ from googletrans.client import Translator, LANGUAGES
 import logging
 import torch
 import asyncio # Import asyncio for await
+from faster_whisper import WhisperModel
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -22,14 +23,16 @@ def get_random_proxy():
 
 def build_translator_with_proxy():
     proxy = get_random_proxy()
+    translator = Translator(service_urls=['translate.googleapis.com'])
     if proxy:
         proxy_url = f"http://{proxy}"
-        return Translator(
-            service_urls=['translate.googleapis.com'],
-            proxies={"http": proxy_url, "https": proxy_url}
-        )
-    else:
-        return Translator(service_urls=['translate.googleapis.com'])
+        # Set proxies on the underlying requests session
+        translator.session.proxies = {
+            "http": proxy_url,
+            "https": proxy_url
+        }
+    return translator
+
 # Define supported languages (using short codes consistent with Whisper/googletrans)
 # Note: googletrans uses short codes like 'en', 'hi'. Whisper also detects these.
 SUPPORTED_LANGUAGES = {
@@ -52,135 +55,77 @@ if 'zh-cn' in SUPPORTED_LANGUAGES:
     SUPPORTED_LANGUAGES['zh'] = SUPPORTED_LANGUAGES['zh-cn']
 
 
-# Load the Whisper model
+# Load faster-whisper model
 try:
-    # Check for CUDA availability
     device = "cuda" if torch.cuda.is_available() else "cpu"
     logger.info(f"Using device: {device}")
-    # Load the model onto the appropriate device
-    model = whisper.load_model("base", device=device) # Using "base" model, can be changed
-    logger.info("Whisper model loaded successfully.")
+    model_size = "base"
+    model = WhisperModel(model_size, device=device,
+                         compute_type="float16" if device == "cuda" else "int8",
+                         num_workers=8)
+    logger.info("Faster-Whisper model loaded successfully.")
 except Exception as e:
-    logger.error(f"Error loading Whisper model: {e}")
+    logger.error(f"Error loading Faster-Whisper model: {e}")
     model = None
 
 # Initialize the translator
 translator = build_translator_with_proxy()
 
 async def process_audio(audio_file_content: bytes, lang1: str, lang2: str):
-    """
-    Transcribes audio using Whisper, detects the language between lang1 and lang2
-    (if supported), and translates the text to the other language.
-
-    Args:
-        audio_file_content: The byte content of the audio file.
-        lang1: The first possible language code (must be in SUPPORTED_LANGUAGES).
-        lang2: The second possible language code (must be in SUPPORTED_LANGUAGES).
-
-    Returns:
-        A dictionary containing the detected language, transcribed text,
-        and translated text, or an error dictionary if processing fails.
-    """
     if not model:
-        logger.error("Whisper model is not loaded. Cannot process audio.")
-        return {"error": "Whisper model not available."}
-
-    # Validate input languages
-    if lang1 not in SUPPORTED_LANGUAGES:
-        logger.error(f"Input language '{lang1}' is not supported.")
-        return {"error": f"Input language '{lang1}' is not supported."}
-    if lang2 not in SUPPORTED_LANGUAGES:
-        logger.error(f"Input language '{lang2}' is not supported.")
-        return {"error": f"Input language '{lang2}' is not supported."}
-    if lang1 == lang2:
-        logger.error(f"Input languages cannot be the same: '{lang1}'.")
-        return {"error": f"Input languages cannot be the same: '{lang1}'."}
-
-
-    temp_audio_path = None # Initialize path variable
+        return {"error": "Faster-Whisper model not available."}
+
+    if lang1 not in SUPPORTED_LANGUAGES or lang2 not in SUPPORTED_LANGUAGES or lang1 == lang2:
+        return {"error": "Invalid or duplicate input languages."}
+
+    temp_audio_path = None
     try:
-        # Save the uploaded file content temporarily
+        # Save temp audio
         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
             temp_audio.write(audio_file_content)
             temp_audio_path = temp_audio.name
         logger.info(f"Temporary audio file saved at: {temp_audio_path}")
 
+        # Transcribe using faster-whisper (auto language detect)
+        segments, info = model.transcribe(temp_audio_path, beam_size=5, language=None)
+        detected_lang_code = info.language
+        logger.info(f"Detected language: {detected_lang_code}")
 
-        # --- Whisper Transcription and Language Detection ---
-        audio = whisper.load_audio(temp_audio_path)
-        audio = whisper.pad_or_trim(audio)
-        mel = whisper.log_mel_spectrogram(audio).to(model.device)
-
-        # Detect the spoken language
-        _, probs = model.detect_language(mel)
-        detected_lang_code = max(probs, key=probs.get)
-        logger.info(f"Whisper detected language code: {detected_lang_code} with probability {probs[detected_lang_code]}")
-
-        # --- Language Validation ---
-        # 1. Check if detected language is broadly supported by this application
         if detected_lang_code not in SUPPORTED_LANGUAGES:
-            logger.error(f"Detected language '{detected_lang_code}' is not supported by this application.")
-            # Clean up before returning
-            if temp_audio_path and os.path.exists(temp_audio_path):
-                os.unlink(temp_audio_path)
-                logger.info(f"Temporary audio file deleted early due to unsupported language: {temp_audio_path}")
             return {"error": f"Detected language '{detected_lang_code}' is not supported."}
-
-        # 2. Check if the detected language is one of the two expected for this specific request
         if detected_lang_code not in [lang1, lang2]:
-            logger.error(f"Detected language '{detected_lang_code}' is not one of the expected languages [{lang1}, {lang2}].")
-            # Clean up before returning
-            if temp_audio_path and os.path.exists(temp_audio_path):
-                os.unlink(temp_audio_path)
-                logger.info(f"Temporary audio file deleted early due to unexpected language: {temp_audio_path}")
-            return {"error": f"Detected language '{detected_lang_code}' was not one of the expected languages: {lang1} or {lang2}."}
-
-        # --- Transcription ---
-        # Force Hindi transcription if detected language is Hindi
+            return {"error": f"Detected language '{detected_lang_code}' was not one of the expected languages: {lang1} or {lang2}."}
+
+        # Join all transcribed segments
+        transcribed_text = " ".join([segment.text for segment in segments])
+        logger.info(f"Transcription: {transcribed_text}")
+
+        # Optional forced Hindi fallback
         if detected_lang_code == "hi":
-            options = whisper.DecodingOptions(language="hi", fp16=(device=="cuda"))
-            result = whisper.decode(model, mel, options)
-            transcribed_text = result.text
-            logger.info(f"Transcription (forced Hindi): {transcribed_text}")
-            # If output is mostly Latin, retry with forced Hindi
             latin_count = sum('a' <= c.lower() <= 'z' for c in transcribed_text)
            devanagari_count = sum('\u0900' <= c <= '\u097F' for c in transcribed_text)
             if latin_count > devanagari_count:
-                logger.info("Transcription appears to be in Latin script, retrying with forced Hindi language.")
-                options = whisper.DecodingOptions(language="hi", fp16=(device=="cuda"), task="transcribe")
-                result = whisper.decode(model, mel, options)
-                transcribed_text = result.text
+                logger.info("Transcription appears to be in Latin script, retrying with forced Hindi decoding.")
+                segments, _ = model.transcribe(temp_audio_path, language="hi", beam_size=5)
+                transcribed_text = " ".join([segment.text for segment in segments])
                 logger.info(f"Retried Hindi transcription: {transcribed_text}")
-        else:
-            options = whisper.DecodingOptions(language=detected_lang_code, fp16=(device=="cuda"))
-            result = whisper.decode(model, mel, options)
-            transcribed_text = result.text
-            logger.info(f"Transcription: {transcribed_text}")
 
-        # Determine the target language for translation
+        # Translate
         target_lang = lang2 if detected_lang_code == lang1 else lang1
-        logger.info(f"Target language for translation: {target_lang}")
-
-        # --- Translation ---
-        translated_text = "Translation not applicable or failed."
+        translated_text = "Translation not applicable."
         if transcribed_text:
             try:
-                # *** Use await for the async translate function ***
                 translation = await translator.translate(transcribed_text, src=detected_lang_code, dest=target_lang)
-                # Check if translation object is valid before accessing .text
                 if translation and hasattr(translation, 'text'):
                     translated_text = translation.text
                     logger.info(f"Translation to {target_lang}: {translated_text}")
                 else:
-                    logger.error(f"Translation result invalid: {translation}")
-                    translated_text = "Translation failed (invalid result)."
+                    translated_text = "Translation failed (invalid result)."
             except Exception as e:
-                logger.error(f"Error during translation: {e}", exc_info=True) # Log traceback
+                logger.error(f"Translation error: {e}", exc_info=True)
                 translated_text = f"Translation failed: {e}"
         else:
-            logger.info("Transcription was empty, skipping translation.")
-            translated_text = "Transcription was empty." # Provide clearer status
-
+            translated_text = "Transcription was empty."
 
         return {
             "detected_language": detected_lang_code,
@@ -189,15 +134,13 @@ async def process_audio(audio_file_content: bytes, lang1: str, lang2: str):
         }
 
     except Exception as e:
-        logger.error(f"Error processing audio file: {e}", exc_info=True)
-        # Ensure error message is propagated
+        logger.error(f"Audio processing error: {e}", exc_info=True)
        return {"error": f"An unexpected error occurred during audio processing: {e}"}
     finally:
-        # Clean up the temporary file
         if temp_audio_path and os.path.exists(temp_audio_path):
             try:
                 os.unlink(temp_audio_path)
                 logger.info(f"Temporary audio file deleted: {temp_audio_path}")
             except Exception as e:
-                logger.error(f"Error deleting temporary file {temp_audio_path}: {e}")
+                logger.error(f"Failed to delete temp file: {e}")
 
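For anyone who wants to exercise the new path end to end, a minimal driver is sketched below. It is not part of the commit: sample_hindi.wav is a hypothetical input file, and the language pair assumes 'en' and 'hi' are both present in SUPPORTED_LANGUAGES, as the hunks above indicate.

# hypothetical driver for the updated process_audio coroutine
import asyncio
from speech_utils import process_audio

async def main():
    # read a recording expected to be in English or Hindi
    with open("sample_hindi.wav", "rb") as f:
        audio_bytes = f.read()

    # process_audio detects which of the two languages was spoken,
    # transcribes it, and translates the text into the other one
    result = await process_audio(audio_bytes, lang1="en", lang2="hi")
    print(result)

if __name__ == "__main__":
    asyncio.run(main())

Note that awaiting translator.translate assumes an async googletrans client; with a version whose Translator is synchronous, that call (and the proxy wiring via translator.session) would need adjusting.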