File size: 5,469 Bytes
fc00984
 
 
 
dafa82f
fc00984
 
 
 
 
 
 
 
513c3a7
fc00984
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dafa82f
fc00984
 
 
 
 
 
 
dafa82f
 
fc00984
dafa82f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc00984
 
dafa82f
 
fc00984
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dafa82f
 
fc00984
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import gradio as gr
import torch
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset
import torchaudio
import numpy as np

# Select the GPU when one is available; every pipeline below is placed on it.
device = "cuda" if torch.cuda.is_available() else "cpu"




# Model 1: Wolof text -> English text (distilled NLLB translation checkpoint).
model_name = "bilalfaye/nllb-200-distilled-600M-wolof-english"
translator = pipeline("translation", model=model_name, device=device)

# Model 2: Wolof audio -> Wolof text (wav2vec2 ASR).
# NOTE(review): `processor=` is not a documented pipeline() argument —
# confirm the installed transformers version actually honours it.
pipe_wolof = pipeline(
    task="automatic-speech-recognition",
    model="bilalfaye/wav2vec2-large-mms-1b-wolof",
    processor="bilalfaye/wav2vec2-large-mms-1b-wolof",
    device=device
)

# Model 3: English text -> English audio (SpeechT5 TTS).
synthesiser_english = pipeline("text-to-speech", "microsoft/speecht5_tts")
# Speaker x-vectors that condition the TTS voice. Indices 0 and 7306 select
# two different CMU ARCTIC speakers; `text_to_speech` maps them to the
# "Male" / "Female" radio choices respectively.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding_english = torch.tensor(embeddings_dataset[0]["xvector"]).unsqueeze(0)
speaker_embedding_wolof = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# Model 4: English audio -> English text (Whisper small).
pipe_english = pipeline(task="automatic-speech-recognition", model="openai/whisper-small")

# Model 5: text -> Wolof audio (SpeechT5 fine-tuned for Wolof).
synthesiser_wolof = pipeline("text-to-speech", "bilalfaye/speecht5_tts-wolof")


# Function 1: Wolof audio -> Wolof text
def transcribe_audio_wolof(audio):
    """Transcribe a Wolof recording into Wolof text.

    Parameters
    ----------
    audio : str
        Filesystem path to the audio file (as handed over by Gradio).

    Returns
    -------
    str
        Transcription produced by the Wolof ASR pipeline.
    """
    signal, rate = torchaudio.load(audio)

    # Downmix stereo recordings to mono by averaging the channels.
    mono = signal.mean(dim=0, keepdim=True) if signal.shape[0] > 1 else signal

    # The ASR model expects 16 kHz input; resample anything else.
    if rate != 16000:
        mono = torchaudio.transforms.Resample(orig_freq=rate, new_freq=16000)(mono)
        rate = 16000

    # The pipeline consumes a 1-D numpy array together with its sampling rate.
    prediction = pipe_wolof({"array": mono.squeeze(0).numpy(), "sampling_rate": rate})
    return prediction["text"]



# Function 2: Wolof text -> English text
def translate_wolof_to_english(wolof_text):
    """Translate Wolof text into English via the NLLB pipeline."""
    output = translator(wolof_text, src_lang="wol_Latn", tgt_lang="eng_Latn")
    return output[0]["translation_text"]

def translate_english_to_wolof(wolof_text):
    """Translate English text into Wolof via the NLLB pipeline.

    NOTE(review): despite its name, the parameter carries *English* text;
    the name is kept unchanged for caller compatibility.
    """
    output = translator(wolof_text, src_lang="eng_Latn", tgt_lang="wol_Latn")
    return output[0]["translation_text"]


# Function 3: text -> audio, in English or Wolof
def text_to_speech(text, language, voice_type):
    """Synthesize speech for *text*.

    Parameters
    ----------
    text : str
        Text to vocalize.
    language : str
        "english" selects the English SpeechT5 model; anything else the Wolof one.
    voice_type : str
        "Male" selects the first speaker embedding, otherwise the second.

    Returns
    -------
    tuple
        ``(sampling_rate, audio_array)`` as produced by the TTS pipeline.
    """
    synth = synthesiser_english if language == "english" else synthesiser_wolof
    # NOTE(review): the embedding variables are named english/wolof but are
    # used here as Male/Female voices — behavior kept as-is.
    if voice_type == "Male":
        embedding = speaker_embedding_english
    else:
        embedding = speaker_embedding_wolof
    result = synth(text, forward_params={"speaker_embeddings": embedding})
    return result["sampling_rate"], result["audio"]

# Function 4: English audio -> English text
def transcribe_audio_english(audio):
    """Transcribe an English recording with the Whisper pipeline."""
    return pipe_english(audio)["text"]

# Function 5: full Wolof -> English pipeline
def process_audio_wolof(audio, voice_type):
    """Run the Wolof->English chain: transcribe, translate, then synthesize.

    Returns the Wolof transcript, its English translation, and the
    synthesized English audio.
    """
    transcript = transcribe_audio_wolof(audio)
    translation = translate_wolof_to_english(transcript)
    speech = text_to_speech(translation, "english", voice_type)
    return transcript, translation, speech

# Function 6: full English -> Wolof pipeline
def process_audio_english(audio, voice_type):
    """Run the English->Wolof chain: transcribe, translate, then synthesize.

    Returns the English transcript, its Wolof translation, and the
    synthesized Wolof audio.
    """
    transcript = transcribe_audio_english(audio)
    translation = translate_english_to_wolof(transcript)
    speech = text_to_speech(translation, "wolof", voice_type)
    return transcript, translation, speech

# Gradio UI: two tabs, one per translation direction. Each tab accepts an
# uploaded file or a microphone recording and shows the transcript, the
# translation, and the synthesized audio.
iface = gr.TabbedInterface(
    [
        gr.Interface(
            fn=process_audio_wolof,
            inputs=[
                gr.Audio(sources=["upload", "microphone"], type="filepath", label="Provide Audio in Wolof"),
                gr.Radio(["Male", "Female"], label="Select Voice Type")
            ],
            outputs=[
                gr.Textbox(label="Texte Wolof"),
                gr.Textbox(label="Texte traduit en Anglais"),
                gr.Audio(label="Audio en Anglais")
            ],
            title="Wolof vers Anglais",
            description="You can upload an audio file or record using a microphone to process Wolof audio."
        ),
        gr.Interface(
            fn=process_audio_english,
            inputs=[
                gr.Audio(sources=["upload", "microphone"], type="filepath", label="Provide Audio in English"),
                gr.Radio(["Male", "Female"], label="Select Voice Type")
            ],
            outputs=[
                gr.Textbox(label="Texte Anglais"),
                gr.Textbox(label="Texte traduit en Wolof"),
                gr.Audio(label="Audio en Wolof")
            ],
            title="Anglais vers Wolof",
            description="You can upload an audio file or record using a microphone to process English audio."
        )
    ],
    tab_names=["Wolof vers Anglais", "Anglais vers Wolof"]
)

# share=True exposes a public tunnel URL; debug=True blocks and streams logs.
iface.launch(debug=True, share=True)