|
import gradio as gr |
|
import torch |
|
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration |
|
from datasets import load_dataset |
|
import torchaudio |
|
import numpy as np |
|
|
|
# Device handed to the pipelines below that take an explicit `device=`.
# NOTE(review): only `translator` and `pipe_wolof` receive it; the Whisper and
# text-to-speech pipelines are created without a device argument.
device = "cuda" if torch.cuda.is_available() else "cpu"

# --- Text translation (Wolof <-> English) ------------------------------------
model_name = "bilalfaye/nllb-200-distilled-600M-wolof-english"
translator = pipeline(task="translation", model=model_name, device=device)

# --- Wolof speech recognition -------------------------------------------------
# The same checkpoint id is used for both the model and its processor.
_WOLOF_ASR_CHECKPOINT = "bilalfaye/wav2vec2-large-mms-1b-wolof"
pipe_wolof = pipeline(
    task="automatic-speech-recognition",
    model=_WOLOF_ASR_CHECKPOINT,
    processor=_WOLOF_ASR_CHECKPOINT,
    device=device,
)

# --- English text-to-speech and speaker embeddings ----------------------------
synthesiser_english = pipeline(task="text-to-speech", model="microsoft/speecht5_tts")
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")


def _speaker_embedding(row):
    """Return the x-vector stored at *row* with an extra leading dimension of size 1."""
    return torch.tensor(embeddings_dataset[row]["xvector"]).unsqueeze(0)


# Despite the names, `text_to_speech` picks between these two by voice type:
# the "english" embedding for "Male", the "wolof" embedding otherwise.
speaker_embedding_english = _speaker_embedding(0)
speaker_embedding_wolof = _speaker_embedding(7306)

# --- English speech recognition -----------------------------------------------
pipe_english = pipeline(task="automatic-speech-recognition", model="openai/whisper-small")

# --- Wolof text-to-speech -----------------------------------------------------
synthesiser_wolof = pipeline(task="text-to-speech", model="bilalfaye/speecht5_tts-wolof")
|
|
|
|
|
|
|
|
|
|
|
|
|
def transcribe_audio_wolof(audio):
    """Transcribe a Wolof recording to text.

    The file is loaded with torchaudio, averaged down to a single channel when
    it has several, resampled to 16 kHz when its native rate differs, and then
    passed to ``pipe_wolof`` as a raw array.

    Args:
        audio: Location of the recording, in any form ``torchaudio.load``
            accepts (the UI supplies a file path).

    Returns:
        The ``"text"`` entry of the recognition result.
    """
    target_rate = 16000
    samples, native_rate = torchaudio.load(audio)

    # More than one entry along the first axis means several channels
    # (torchaudio's default layout); average them into one.
    has_several_channels = samples.shape[0] > 1
    samples = samples.mean(dim=0, keepdim=True) if has_several_channels else samples

    if native_rate != target_rate:
        to_target_rate = torchaudio.transforms.Resample(
            orig_freq=native_rate,
            new_freq=target_rate,
        )
        samples = to_target_rate(samples)

    # Drop the channel axis: the pipeline is given a flat NumPy array.
    payload = {
        "array": samples.squeeze(0).numpy(),
        "sampling_rate": target_rate,
    }
    return pipe_wolof(payload)["text"]
|
|
|
|
|
|
|
|
|
def translate_wolof_to_english(wolof_text):
    """Translate Wolof text into English using the module-level ``translator``.

    Args:
        wolof_text: Wolof text to translate.

    Returns:
        The ``"translation_text"`` entry of the first result the pipeline returns.
    """
    language_pair = {"src_lang": "wol_Latn", "tgt_lang": "eng_Latn"}
    return translator(wolof_text, **language_pair)[0]["translation_text"]
|
|
|
def translate_english_to_wolof(wolof_text):
    """Translate English text into Wolof using the module-level ``translator``.

    Args:
        wolof_text: The *English* source text. The parameter name is misleading
            but is kept unchanged so existing callers are unaffected.

    Returns:
        The ``"translation_text"`` entry of the first result the pipeline returns.
    """
    language_pair = {"src_lang": "eng_Latn", "tgt_lang": "wol_Latn"}
    return translator(wolof_text, **language_pair)[0]["translation_text"]
|
|
|
|
|
|
|
def text_to_speech(text, language, voice_type):
    """Synthesise speech for *text*.

    Args:
        text: Text to speak.
        language: ``"english"`` selects ``synthesiser_english``; any other
            value selects ``synthesiser_wolof``.
        voice_type: ``"Male"`` selects ``speaker_embedding_english``; any other
            value (including ``None``) selects ``speaker_embedding_wolof``.
            Despite their names, the embeddings are chosen by voice type and
            not by language.

    Returns:
        A ``(sampling_rate, audio)`` tuple taken from the pipeline output.
    """
    if voice_type == "Male":
        speaker = speaker_embedding_english
    else:
        speaker = speaker_embedding_wolof

    tts = synthesiser_english if language == "english" else synthesiser_wolof
    output = tts(text, forward_params={"speaker_embeddings": speaker})
    return output["sampling_rate"], output["audio"]
|
|
|
|
|
def transcribe_audio_english(audio):
    """Transcribe an English recording with the module-level ``pipe_english``.

    Args:
        audio: Audio input forwarded unchanged to the pipeline (the UI supplies
            a file path).

    Returns:
        The ``"text"`` entry of the recognition result.
    """
    return pipe_english(audio)["text"]
|
|
|
|
|
def process_audio_wolof(audio, voice_type):
    """Run the Wolof -> English chain on one recording.

    Steps: transcribe the Wolof audio, translate the transcript to English,
    then synthesise the English translation as speech.

    Args:
        audio: Path of the Wolof recording supplied by the UI, or ``None`` when
            nothing was uploaded or recorded.
        voice_type: Voice choice forwarded to ``text_to_speech`` (``"Male"``
            or anything else, including ``None`` when no option was picked).

    Returns:
        A ``(wolof_text, english_text, audio_english)`` tuple, where
        ``audio_english`` is the ``(sampling_rate, audio)`` pair produced by
        ``text_to_speech``.

    Raises:
        gr.Error: If no audio was provided.
    """
    # A submit without any recording would otherwise surface as an opaque
    # loader exception; report it to the user in readable form instead.
    if audio is None:
        raise gr.Error("Please upload or record an audio clip in Wolof first.")

    wolof_text = transcribe_audio_wolof(audio)
    english_text = translate_wolof_to_english(wolof_text)
    audio_english = text_to_speech(english_text, "english", voice_type)
    return wolof_text, english_text, audio_english
|
|
|
|
|
def process_audio_english(audio, voice_type):
    """Run the English -> Wolof chain on one recording.

    Steps: transcribe the English audio, translate the transcript to Wolof,
    then synthesise the Wolof translation as speech.

    Args:
        audio: Path of the English recording supplied by the UI, or ``None``
            when nothing was uploaded or recorded.
        voice_type: Voice choice forwarded to ``text_to_speech`` (``"Male"``
            or anything else, including ``None`` when no option was picked).

    Returns:
        An ``(english_text, wolof_text, audio_wolof)`` tuple, where
        ``audio_wolof`` is the ``(sampling_rate, audio)`` pair produced by
        ``text_to_speech``.

    Raises:
        gr.Error: If no audio was provided.
    """
    # A submit without any recording would otherwise surface as an opaque
    # pipeline exception; report it to the user in readable form instead.
    if audio is None:
        raise gr.Error("Please upload or record an audio clip in English first.")

    english_text = transcribe_audio_english(audio)
    wolof_text = translate_english_to_wolof(english_text)
    audio_wolof = text_to_speech(wolof_text, "wolof", voice_type)
    return english_text, wolof_text, audio_wolof
|
|
|
|
|
# Tab 1: Wolof speech in -> Wolof transcript, English translation, English speech.
_wolof_to_english_tab = gr.Interface(
    fn=process_audio_wolof,
    inputs=[
        gr.Audio(
            sources=["upload", "microphone"],
            type="filepath",
            label="Provide Audio in Wolof",
        ),
        gr.Radio(["Male", "Female"], label="Select Voice Type"),
    ],
    outputs=[
        gr.Textbox(label="Texte Wolof"),
        gr.Textbox(label="Texte traduit en Anglais"),
        gr.Audio(label="Audio en Anglais"),
    ],
    title="Wolof vers Anglais",
    description="You can upload an audio file or record using a microphone to process Wolof audio.",
)

# Tab 2: English speech in -> English transcript, Wolof translation, Wolof speech.
_english_to_wolof_tab = gr.Interface(
    fn=process_audio_english,
    inputs=[
        gr.Audio(
            sources=["upload", "microphone"],
            type="filepath",
            label="Provide Audio in English",
        ),
        gr.Radio(["Male", "Female"], label="Select Voice Type"),
    ],
    outputs=[
        gr.Textbox(label="Texte Anglais"),
        gr.Textbox(label="Texte traduit en Wolof"),
        gr.Audio(label="Audio en Wolof"),
    ],
    title="Anglais vers Wolof",
    description="You can upload an audio file or record using a microphone to process English audio.",
)

# Both directions are exposed side by side as tabs of a single app.
iface = gr.TabbedInterface(
    [_wolof_to_english_tab, _english_to_wolof_tab],
    tab_names=["Wolof vers Anglais", "Anglais vers Wolof"],
)

# Start the server at import time with debug output and a public share link.
iface.launch(debug=True, share=True)