bilalfaye committed on
Commit
fc00984
·
verified ·
1 Parent(s): 065ebce

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +126 -0
app.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
4
+ from datasets import load_dataset
5
+ import soundfile as sf
6
+ import numpy as np
7
+
8
# Run the models on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
9
+
10
+
11
+
12
+
13
# Model 1: text translation pipeline. The same pipeline is called with
# src_lang/tgt_lang set for Wolof -> English and for English -> Wolof.
model_name = "bilalfaye/nllb-200-distilled-600M-en-wo"
translator = pipeline(task="translation", model=model_name, device=device)
16
+
17
# Model 2: Wolof speech -> Wolof text (automatic speech recognition).
# The model and its processor are loaded from the same checkpoint.
_WOLOF_ASR_CHECKPOINT = "bilalfaye/wav2vec2-large-mms-1b-wolof"
pipe_wolof = pipeline(
    task="automatic-speech-recognition",
    model=_WOLOF_ASR_CHECKPOINT,
    processor=_WOLOF_ASR_CHECKPOINT,
    device=device,
)
24
+
25
# Model 3: English text -> English speech.
# Placed on `device` like the translation, ASR and Whisper models, so that
# speech synthesis also runs on the GPU when one is available.
synthesiser_english = pipeline("text-to-speech", "microsoft/speecht5_tts", device=device)

# Speaker x-vectors used to condition the speech synthesisers. Despite their
# names, text_to_speech() uses `speaker_embedding_english` for the "Male"
# voice and `speaker_embedding_wolof` for any other voice type, in both
# languages.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding_english = torch.tensor(embeddings_dataset[0]["xvector"]).unsqueeze(0)
speaker_embedding_wolof = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
30
+
31
# Model 4: English speech -> English text (Whisper).
# The processor and the generation model come from the same checkpoint;
# the model is moved to `device` right after loading.
_WHISPER_CHECKPOINT = "openai/whisper-small"
processor_whisper = WhisperProcessor.from_pretrained(_WHISPER_CHECKPOINT)
model_whisper = WhisperForConditionalGeneration.from_pretrained(_WHISPER_CHECKPOINT).to(device)
35
+
36
# Model 5: Wolof text -> Wolof speech.
# Placed on `device` like the translation, ASR and Whisper models, so that
# speech synthesis also runs on the GPU when one is available.
synthesiser_wolof = pipeline("text-to-speech", "bilalfaye/speecht5_tts-wolof", device=device)
38
+
39
+
40
+
41
+
42
# Function 1: Wolof audio -> Wolof text
def transcribe_audio_wolof(audio):
    """Transcribe Wolof speech with the `pipe_wolof` ASR pipeline.

    Args:
        audio: Audio input forwarded unchanged to the pipeline (the Gradio
            components in this file supply a file path).

    Returns:
        The value stored under the ``"text"`` key of the pipeline output.
    """
    return pipe_wolof(audio)["text"]
46
+
47
# Function 2: Wolof text -> English text
def translate_wolof_to_english(wolof_text):
    """Translate Wolof text to English with the shared `translator` pipeline.

    Args:
        wolof_text: Text to translate, written in Wolof.

    Returns:
        The ``"translation_text"`` field of the first pipeline result.
    """
    outputs = translator(wolof_text, src_lang="wol_Latn", tgt_lang="eng_Latn")
    first_result = outputs[0]
    return first_result["translation_text"]
51
+
52
def translate_english_to_wolof(wolof_text):
    """Translate English text to Wolof with the shared `translator` pipeline.

    Args:
        wolof_text: Text to translate. Despite the parameter name, this is
            the English source text (the pipeline is called with
            ``src_lang="eng_Latn"``); the name is kept for compatibility.

    Returns:
        The ``"translation_text"`` field of the first pipeline result.
    """
    outputs = translator(wolof_text, src_lang="eng_Latn", tgt_lang="wol_Latn")
    first_result = outputs[0]
    return first_result["translation_text"]
55
+
56
+
57
# Function 3: text -> speech, in English or Wolof
def text_to_speech(text, language, voice_type):
    """Synthesise speech for `text`.

    Args:
        text: Text to speak.
        language: ``"english"`` selects `synthesiser_english`; any other
            value selects `synthesiser_wolof`.
        voice_type: ``"Male"`` selects `speaker_embedding_english`; any other
            value (including ``None``) selects `speaker_embedding_wolof`.

    Returns:
        A ``(sampling_rate, audio)`` tuple taken from the pipeline output.
    """
    synthesiser = synthesiser_english if language == "english" else synthesiser_wolof

    if voice_type == "Male":
        embedding = speaker_embedding_english
    else:
        embedding = speaker_embedding_wolof

    speech = synthesiser(text, forward_params={"speaker_embeddings": embedding})
    return speech["sampling_rate"], speech["audio"]
67
+
68
# Function 4: English audio -> English text
def transcribe_audio_english(audio):
    """Transcribe English speech with the Whisper model.

    The audio is read with soundfile, downmixed to mono and resampled to
    16 kHz before feature extraction. Previously the file's real sampling
    rate was ignored and 16 kHz was always declared to the processor, which
    mis-timed any recording made at another rate; multi-channel files were
    passed through as 2-D arrays.

    Args:
        audio: Path (or file object) readable by ``soundfile.read``.

    Returns:
        The first decoded transcription string.
    """
    target_rate = 16000  # rate declared to the Whisper processor below

    audio_data, samplerate = sf.read(audio)
    audio_data = audio_data.astype(np.float32)

    # soundfile returns (frames, channels) for multi-channel files;
    # average the channels so the processor receives one mono waveform.
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)

    # Bring the waveform to the declared rate. Linear interpolation keeps the
    # file free of new dependencies; a dedicated resampler would give better
    # quality if one becomes available.
    if samplerate != target_rate and audio_data.size > 0:
        n_source = audio_data.shape[0]
        n_target = max(1, int(round(n_source * target_rate / samplerate)))
        source_times = np.arange(n_source) / samplerate
        target_times = np.arange(n_target) / target_rate
        audio_data = np.interp(target_times, source_times, audio_data).astype(np.float32)

    input_features = processor_whisper(
        audio_data, sampling_rate=target_rate, return_tensors="pt"
    ).input_features.to(device)
    predicted_ids = model_whisper.generate(input_features)
    transcription = processor_whisper.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]
76
+
77
# Function 5: full Wolof -> English pipeline
def process_audio_wolof(audio, voice_type):
    """Transcribe Wolof audio, translate it to English and speak the result.

    Args:
        audio: Wolof audio input, forwarded to `transcribe_audio_wolof`.
        voice_type: Voice selection forwarded to `text_to_speech`.

    Returns:
        A tuple ``(wolof_text, english_text, english_audio)`` where the audio
        element is whatever `text_to_speech` returns.
    """
    transcript = transcribe_audio_wolof(audio)
    translation = translate_wolof_to_english(transcript)
    spoken_translation = text_to_speech(translation, "english", voice_type)
    return transcript, translation, spoken_translation
83
+
84
# Function 6: full English -> Wolof pipeline
def process_audio_english(audio, voice_type):
    """Transcribe English audio, translate it to Wolof and speak the result.

    Args:
        audio: English audio input, forwarded to `transcribe_audio_english`.
        voice_type: Voice selection forwarded to `text_to_speech`.

    Returns:
        A tuple ``(english_text, wolof_text, wolof_audio)`` where the audio
        element is whatever `text_to_speech` returns.
    """
    transcript = transcribe_audio_english(audio)
    translation = translate_english_to_wolof(transcript)
    spoken_translation = text_to_speech(translation, "wolof", voice_type)
    return transcript, translation, spoken_translation
90
+
91
# Gradio UI: one tab per translation direction. Each tab takes an audio clip
# (uploaded or recorded) plus a voice choice, and shows the transcription,
# the translation and the synthesised audio.
_wolof_to_english_tab = gr.Interface(
    fn=process_audio_wolof,
    inputs=[
        gr.Audio(sources=["upload", "microphone"], type="filepath", label="Provide Audio in Wolof"),
        gr.Radio(["Male", "Female"], label="Select Voice Type"),
    ],
    outputs=[
        gr.Textbox(label="Texte Wolof"),
        gr.Textbox(label="Texte traduit en Anglais"),
        gr.Audio(label="Audio en Anglais"),
    ],
    title="Wolof vers Anglais",
    description="You can upload an audio file or record using a microphone to process Wolof audio.",
)

_english_to_wolof_tab = gr.Interface(
    fn=process_audio_english,
    inputs=[
        gr.Audio(sources=["upload", "microphone"], type="filepath", label="Provide Audio in English"),
        gr.Radio(["Male", "Female"], label="Select Voice Type"),
    ],
    outputs=[
        gr.Textbox(label="Texte Anglais"),
        gr.Textbox(label="Texte traduit en Wolof"),
        gr.Audio(label="Audio en Wolof"),
    ],
    title="Anglais vers Wolof",
    description="You can upload an audio file or record using a microphone to process English audio.",
)

iface = gr.TabbedInterface(
    [_wolof_to_english_tab, _english_to_wolof_tab],
    tab_names=["Wolof vers Anglais", "Anglais vers Wolof"],
)

# Start the app in debug mode and request a public share link.
iface.launch(debug=True, share=True)