# Gradio demo: bidirectional Wolof <-> English speech translation
# (ASR -> machine translation -> TTS), built on Hugging Face pipelines.
import gradio as gr
import torch
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset
import torchaudio
import numpy as np
# Run on GPU when available; `device` is passed to every pipeline below.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Model 1: Wolof text -> English text (NLLB-200 distilled fine-tune).
model_name = "bilalfaye/nllb-200-distilled-600M-wolof-english"
translator = pipeline("translation", model=model_name, device=device)
# Model 2: Wolof audio -> Wolof text (wav2vec2 MMS fine-tune).
pipe_wolof = pipeline(
task="automatic-speech-recognition",
model="bilalfaye/wav2vec2-large-mms-1b-wolof",
processor="bilalfaye/wav2vec2-large-mms-1b-wolof",
device=device
)
# Model 3: English text -> English audio (SpeechT5 TTS).
synthesiser_english = pipeline("text-to-speech", "microsoft/speecht5_tts")
# Speaker x-vectors for SpeechT5. In text_to_speech(), index 0 is selected
# for the "Male" voice and index 7306 for the "Female" voice — the variable
# names suggest a per-language split, but both models receive whichever
# embedding matches the requested voice_type. NOTE(review): confirm the
# intended speaker/voice mapping.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding_english = torch.tensor(embeddings_dataset[0]["xvector"]).unsqueeze(0)
speaker_embedding_wolof = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
# Model 4: English audio -> English text (Whisper small).
pipe_english = pipeline(task="automatic-speech-recognition", model="openai/whisper-small")
# Model 5: Wolof text -> Wolof audio (SpeechT5 fine-tuned for Wolof).
synthesiser_wolof = pipeline("text-to-speech", "bilalfaye/speecht5_tts-wolof")
# Function 1: Transcription audio Wolof -> texte Wolof
def transcribe_audio_wolof(audio):
    """Transcribe a Wolof audio file to Wolof text.

    Loads the file with torchaudio, downmixes stereo to mono by averaging
    channels, resamples to 16 kHz (the rate the wav2vec2 model expects),
    and feeds the raw samples to the Wolof ASR pipeline.

    Args:
        audio: Path to an audio file (as provided by gr.Audio type="filepath").

    Returns:
        The recognized Wolof text.
    """
    signal, rate = torchaudio.load(audio)
    # Average channels when the recording is stereo; mono passes through.
    if signal.shape[0] > 1:
        signal = signal.mean(dim=0, keepdim=True)
    target_rate = 16000
    if rate != target_rate:
        resample = torchaudio.transforms.Resample(orig_freq=rate, new_freq=target_rate)
        signal = resample(signal)
        rate = target_rate
    # The pipeline accepts a raw numpy array plus its sampling rate.
    samples = signal.squeeze(0).numpy()
    return pipe_wolof({"array": samples, "sampling_rate": rate})["text"]
# Fonction 2 : Traduction texte Wolof -> texte anglais
def translate_wolof_to_english(wolof_text):
    """Translate Wolof text into English via the NLLB translation pipeline."""
    outputs = translator(wolof_text, src_lang="wol_Latn", tgt_lang="eng_Latn")
    first = outputs[0]
    return first["translation_text"]
def translate_english_to_wolof(english_text):
    """Translate English text into Wolof via the NLLB translation pipeline.

    Args:
        english_text: Source text in English. (The parameter was previously
            misnamed ``wolof_text`` — a copy-paste from the sibling function;
            the only caller passes it positionally.)

    Returns:
        The Wolof translation as a string.
    """
    translated = translator(english_text, src_lang="eng_Latn", tgt_lang="wol_Latn")
    return translated[0]['translation_text']
# Fonction 3 : Texte anglais -> audio anglais ou Wolof
def text_to_speech(text, language, voice_type):
    """Synthesize speech for ``text``.

    Args:
        text: Text to vocalize.
        language: "english" selects the English SpeechT5 model; any other
            value selects the Wolof fine-tune.
        voice_type: "Male" selects the embedding at dataset index 0,
            anything else the embedding at index 7306.

    Returns:
        A (sampling_rate, audio_array) pair, the format gr.Audio accepts.
    """
    tts = synthesiser_english if language == "english" else synthesiser_wolof
    if voice_type == "Male":
        embedding = speaker_embedding_english
    else:
        embedding = speaker_embedding_wolof
    output = tts(text, forward_params={"speaker_embeddings": embedding})
    return output["sampling_rate"], output["audio"]
# Fonction 4 : Transcription audio anglais -> texte anglais
def transcribe_audio_english(audio):
    """Transcribe an English audio file to English text with Whisper."""
    return pipe_english(audio)["text"]
# Fonction 5 : Traitement audio Wolof vers anglais
def process_audio_wolof(audio, voice_type):
    """Full Wolof -> English chain: ASR, then translation, then English TTS.

    Returns:
        (wolof_text, english_text, (sampling_rate, audio_array)).
    """
    wolof = transcribe_audio_wolof(audio)
    english = translate_wolof_to_english(wolof)
    spoken = text_to_speech(english, "english", voice_type)
    return wolof, english, spoken
# Fonction 6 : Traitement audio anglais vers Wolof
def process_audio_english(audio, voice_type):
    """Full English -> Wolof chain: ASR, then translation, then Wolof TTS.

    Returns:
        (english_text, wolof_text, (sampling_rate, audio_array)).
    """
    english = transcribe_audio_english(audio)
    wolof = translate_english_to_wolof(english)
    spoken = text_to_speech(wolof, "wolof", voice_type)
    return english, wolof, spoken
# Updated Gradio Interface
# Two-tab UI: one tab per translation direction. Each tab takes an uploaded
# or recorded audio file plus a voice choice, and shows the transcription,
# the translation, and the synthesized audio.
# Fix: removed a stray trailing " |" artifact after launch() that made the
# file a syntax error.
iface = gr.TabbedInterface(
    [
        gr.Interface(
            fn=process_audio_wolof,
            inputs=[
                gr.Audio(sources=["upload", "microphone"], type="filepath", label="Provide Audio in Wolof"),
                gr.Radio(["Male", "Female"], label="Select Voice Type")
            ],
            outputs=[
                gr.Textbox(label="Texte Wolof"),
                gr.Textbox(label="Texte traduit en Anglais"),
                gr.Audio(label="Audio en Anglais")
            ],
            title="Wolof vers Anglais",
            description="You can upload an audio file or record using a microphone to process Wolof audio."
        ),
        gr.Interface(
            fn=process_audio_english,
            inputs=[
                gr.Audio(sources=["upload", "microphone"], type="filepath", label="Provide Audio in English"),
                gr.Radio(["Male", "Female"], label="Select Voice Type")
            ],
            outputs=[
                gr.Textbox(label="Texte Anglais"),
                gr.Textbox(label="Texte traduit en Wolof"),
                gr.Audio(label="Audio en Wolof")
            ],
            title="Anglais vers Wolof",
            description="You can upload an audio file or record using a microphone to process English audio."
        )
    ],
    tab_names=["Wolof vers Anglais", "Anglais vers Wolof"]
)
# share=True exposes a public Gradio link; debug=True surfaces errors in the UI.
iface.launch(debug=True, share=True)