# Page residue from the hosting site: bilalfaye's picture
# Commit message: Update app.py
# Commit: 513c3a7 (verified)
import gradio as gr
import torch
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset
import torchaudio
import numpy as np
# Run every model on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Model 1: text translation between Wolof and English.
# The same pipeline serves both directions; callers pick src_lang/tgt_lang.
model_name = "bilalfaye/nllb-200-distilled-600M-wolof-english"
translator = pipeline("translation", model=model_name, device=device)

# Model 2: Wolof speech -> Wolof text.
pipe_wolof = pipeline(
    task="automatic-speech-recognition",
    model="bilalfaye/wav2vec2-large-mms-1b-wolof",
    processor="bilalfaye/wav2vec2-large-mms-1b-wolof",
    device=device,
)

# Model 3: English text -> English speech.
# `device` is passed explicitly so this pipeline follows the same placement as
# the translation and Wolof ASR pipelines instead of always staying on the CPU.
synthesiser_english = pipeline(
    "text-to-speech",
    "microsoft/speecht5_tts",
    device=device,
)

# Speaker x-vectors used to condition both text-to-speech models.
# NOTE: despite their names, these are used as voice presets rather than
# per-language embeddings: text_to_speech selects speaker_embedding_english
# when the voice type is "Male" and speaker_embedding_wolof otherwise,
# regardless of the output language.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding_english = torch.tensor(embeddings_dataset[0]["xvector"]).unsqueeze(0)
speaker_embedding_wolof = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# Model 4: English speech -> English text.
pipe_english = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small",
    device=device,
)

# Model 5: Wolof text -> Wolof speech.
synthesiser_wolof = pipeline(
    "text-to-speech",
    "bilalfaye/speecht5_tts-wolof",
    device=device,
)
# Function 1: Wolof speech -> Wolof text
def transcribe_audio_wolof(audio):
    """Transcribe a Wolof audio recording to text.

    The audio is loaded with torchaudio, down-mixed to a single channel,
    resampled to 16 kHz when needed and handed to ``pipe_wolof``.

    Args:
        audio: Audio source accepted by ``torchaudio.load`` (the UI passes a
            file path).

    Returns:
        The value stored under the ``"text"`` key of the pipeline output.
    """
    target_rate = 16000
    samples, rate = torchaudio.load(audio)

    # More than one channel: average the channels into a single one.
    if samples.shape[0] > 1:
        samples = samples.mean(dim=0, keepdim=True)

    # Bring the recording to the target rate only when it differs.
    if rate != target_rate:
        to_target = torchaudio.transforms.Resample(orig_freq=rate, new_freq=target_rate)
        samples = to_target(samples)
        rate = target_rate

    # The pipeline receives a 1-D numpy array plus its sampling rate.
    payload = {"array": samples.squeeze(0).numpy(), "sampling_rate": rate}
    return pipe_wolof(payload)['text']
# Function 2: text translation, Wolof -> English
def translate_wolof_to_english(wolof_text):
    """Translate Wolof text to English with the shared ``translator`` pipeline.

    Args:
        wolof_text: Text to translate, passed to the pipeline with
            ``src_lang="wol_Latn"`` and ``tgt_lang="eng_Latn"``.

    Returns:
        The ``"translation_text"`` field of the first pipeline result.
    """
    first_result = translator(wolof_text, src_lang="wol_Latn", tgt_lang="eng_Latn")[0]
    return first_result['translation_text']
def translate_english_to_wolof(wolof_text):
    """Translate English text to Wolof with the shared ``translator`` pipeline.

    Args:
        wolof_text: Text to translate. NOTE: despite its name, this parameter
            carries the English source text; it is passed to the pipeline
            with ``src_lang="eng_Latn"`` and ``tgt_lang="wol_Latn"``.

    Returns:
        The ``"translation_text"`` field of the first pipeline result.
    """
    first_result = translator(wolof_text, src_lang="eng_Latn", tgt_lang="wol_Latn")[0]
    return first_result['translation_text']
# Function 3: text -> speech, in English or Wolof
def text_to_speech(text, language, voice_type):
    """Synthesise speech for ``text`` with the requested language and voice.

    Args:
        text: Text to synthesise.
        language: ``"english"`` selects ``synthesiser_english``; any other
            value selects ``synthesiser_wolof``.
        voice_type: ``"Male"`` selects ``speaker_embedding_english``; any
            other value selects ``speaker_embedding_wolof``.

    Returns:
        A ``(sampling_rate, audio)`` tuple taken from the pipeline output.
    """
    synthesiser = synthesiser_english if language == "english" else synthesiser_wolof

    if voice_type == "Male":
        embedding = speaker_embedding_english
    else:
        embedding = speaker_embedding_wolof

    result = synthesiser(text, forward_params={"speaker_embeddings": embedding})
    return result["sampling_rate"], result["audio"]
# Function 4: English speech -> English text
def transcribe_audio_english(audio):
    """Transcribe an English audio recording with ``pipe_english``.

    Args:
        audio: Audio input forwarded unchanged to the pipeline (the UI passes
            a file path).

    Returns:
        The value stored under the ``"text"`` key of the pipeline output.
    """
    return pipe_english(audio)["text"]
# Function 5: end-to-end processing, Wolof audio -> English audio
def process_audio_wolof(audio, voice_type):
    """Transcribe Wolof speech, translate it to English and synthesise it.

    Args:
        audio: Path of the Wolof recording, or ``None`` when the user
            submitted the form without providing audio.
        voice_type: Voice selection forwarded to ``text_to_speech``.

    Returns:
        A ``(wolof_text, english_text, audio_english)`` tuple, where
        ``audio_english`` is the value returned by ``text_to_speech``.

    Raises:
        gr.Error: If no audio was provided.
    """
    # Without this guard a missing recording fails deep inside the audio
    # loader with an error that means nothing to the user.
    if audio is None:
        raise gr.Error("Please record or upload an audio clip before submitting.")

    wolof_text = transcribe_audio_wolof(audio)
    english_text = translate_wolof_to_english(wolof_text)
    audio_english = text_to_speech(english_text, "english", voice_type)
    return wolof_text, english_text, audio_english
# Function 6: end-to-end processing, English audio -> Wolof audio
def process_audio_english(audio, voice_type):
    """Transcribe English speech, translate it to Wolof and synthesise it.

    Args:
        audio: Path of the English recording, or ``None`` when the user
            submitted the form without providing audio.
        voice_type: Voice selection forwarded to ``text_to_speech``.

    Returns:
        An ``(english_text, wolof_text, audio_wolof)`` tuple, where
        ``audio_wolof`` is the value returned by ``text_to_speech``.

    Raises:
        gr.Error: If no audio was provided.
    """
    # Without this guard a missing recording fails inside the speech
    # recognition pipeline with an error that means nothing to the user.
    if audio is None:
        raise gr.Error("Please record or upload an audio clip before submitting.")

    english_text = transcribe_audio_english(audio)
    wolof_text = translate_english_to_wolof(english_text)
    audio_wolof = text_to_speech(wolof_text, "wolof", voice_type)
    return english_text, wolof_text, audio_wolof
# Gradio UI: one tab per translation direction.

# Tab 1: Wolof audio in; Wolof text, English text and English audio out.
wolof_to_english_tab = gr.Interface(
    fn=process_audio_wolof,
    inputs=[
        gr.Audio(
            sources=["upload", "microphone"],
            type="filepath",
            label="Provide Audio in Wolof",
        ),
        gr.Radio(["Male", "Female"], label="Select Voice Type"),
    ],
    outputs=[
        gr.Textbox(label="Texte Wolof"),
        gr.Textbox(label="Texte traduit en Anglais"),
        gr.Audio(label="Audio en Anglais"),
    ],
    title="Wolof vers Anglais",
    description="You can upload an audio file or record using a microphone to process Wolof audio.",
)

# Tab 2: English audio in; English text, Wolof text and Wolof audio out.
english_to_wolof_tab = gr.Interface(
    fn=process_audio_english,
    inputs=[
        gr.Audio(
            sources=["upload", "microphone"],
            type="filepath",
            label="Provide Audio in English",
        ),
        gr.Radio(["Male", "Female"], label="Select Voice Type"),
    ],
    outputs=[
        gr.Textbox(label="Texte Anglais"),
        gr.Textbox(label="Texte traduit en Wolof"),
        gr.Audio(label="Audio en Wolof"),
    ],
    title="Anglais vers Wolof",
    description="You can upload an audio file or record using a microphone to process English audio.",
)

# Combine both tabs into a single app and start it.
iface = gr.TabbedInterface(
    [wolof_to_english_tab, english_to_wolof_tab],
    tab_names=["Wolof vers Anglais", "Anglais vers Wolof"],
)
iface.launch(debug=True, share=True)