Spaces:

bilalfaye
/

wolof_english_translation

Sleeping

App Files Files Community

bilalfaye committed on Jan 16

Commit

dafa82f

verified ·

1 Parent(s): f403ffa

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -12

app.py CHANGED Viewed

@@ -2,7 +2,7 @@ import gradio as gr
 import torch
 from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
 from datasets import load_dataset
-import soundfile as sf
 import numpy as np
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -29,9 +29,7 @@ speaker_embedding_english = torch.tensor(embeddings_dataset[0]["xvector"]).unsqu
 speaker_embedding_wolof = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
 # Modèle 4 : Transcription audio anglais -> texte anglais
-processor_whisper = WhisperProcessor.from_pretrained("openai/whisper-small")
-model_whisper = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
-model_whisper.to(device)
 # Modèle 5 : Texte anglais -> audio Wolof
 synthesiser_wolof = pipeline("text-to-speech", "bilalfaye/speecht5_tts-wolof")
@@ -39,11 +37,35 @@ synthesiser_wolof = pipeline("text-to-speech", "bilalfaye/speecht5_tts-wolof")
-# Fonction 1 : Transcription audio Wolof -> texte Wolof
 def transcribe_audio_wolof(audio):
-    result = pipe_wolof(audio)
     return result['text']
 # Fonction 2 : Traduction texte Wolof -> texte anglais
 def translate_wolof_to_english(wolof_text):
     translated = translator(wolof_text, src_lang="wol_Latn", tgt_lang="eng_Latn")
@@ -67,12 +89,8 @@ def text_to_speech(text, language, voice_type):
 # Fonction 4 : Transcription audio anglais -> texte anglais
 def transcribe_audio_english(audio):
-    audio_data, samplerate = sf.read(audio)
-    audio_data = audio_data.astype(np.float32)
-    input_features = processor_whisper(audio_data, sampling_rate=16000, return_tensors="pt").input_features.to(device)
-    predicted_ids = model_whisper.generate(input_features)
-    transcription = processor_whisper.batch_decode(predicted_ids, skip_special_tokens=True)
-    return transcription[0]
 # Fonction 5 : Traitement audio Wolof vers anglais
 def process_audio_wolof(audio, voice_type):

 import torch
 from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
 from datasets import load_dataset
+import torchaudio
 import numpy as np
 device = "cuda" if torch.cuda.is_available() else "cpu"
 speaker_embedding_wolof = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
 # Model 4: English audio transcription -> English text
+pipe_english = pipeline(task="automatic-speech-recognition", model="openai/whisper-small")
 # Model 5: English text -> Wolof audio
 synthesiser_wolof = pipeline("text-to-speech", "bilalfaye/speecht5_tts-wolof")
+# Function 1: Wolof audio -> Wolof text (load, downmix to mono, resample to 16 kHz, run pipe_wolof)
 def transcribe_audio_wolof(audio):
+    # Load the audio with torchaudio (returns the waveform tensor and its sample rate)
+    waveform, sample_rate = torchaudio.load(audio)
+    # Dimension 0 of the waveform is treated as the channel axis
+    if waveform.shape[0] > 1:  # More than one channel (e.g. stereo)
+        # Downmix to mono by averaging across all channels; keepdim keeps the channel axis
+        mono_audio = waveform.mean(dim=0, keepdim=True)
+    else:
+        # Audio is already single-channel
+        mono_audio = waveform
+    # Resample to 16000 Hz unless the audio is already at that rate
+    if sample_rate != 16000:
+        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
+        mono_audio = resampler(mono_audio)
+        sample_rate = 16000
+    # Drop the channel axis and convert the tensor to a NumPy array
+    mono_audio = mono_audio.squeeze(0).numpy()
+    # Pass the samples and their sampling rate to the Wolof pipeline as a dict
+    result = pipe_wolof({"array": mono_audio, "sampling_rate": sample_rate})
     return result['text']
 # Fonction 2 : Traduction texte Wolof -> texte anglais
 def translate_wolof_to_english(wolof_text):
     translated = translator(wolof_text, src_lang="wol_Latn", tgt_lang="eng_Latn")
 # Function 4: English audio -> English text (passes `audio` to pipe_english, returns its "text" field)
 def transcribe_audio_english(audio):
+    transcription = pipe_english(audio)
+    return transcription["text"]
 # Fonction 5 : Traitement audio Wolof vers anglais
 def process_audio_wolof(audio, voice_type):