bilalfaye committed on
Commit
dafa82f
·
verified ·
1 Parent(s): f403ffa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -12
app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
2
  import torch
3
  from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
4
  from datasets import load_dataset
5
- import soundfile as sf
6
  import numpy as np
7
 
8
  device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -29,9 +29,7 @@ speaker_embedding_english = torch.tensor(embeddings_dataset[0]["xvector"]).unsqu
29
  speaker_embedding_wolof = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
30
 
31
  # Modèle 4 : Transcription audio anglais -> texte anglais
32
- processor_whisper = WhisperProcessor.from_pretrained("openai/whisper-small")
33
- model_whisper = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
34
- model_whisper.to(device)
35
 
36
  # Modèle 5 : Texte anglais -> audio Wolof
37
  synthesiser_wolof = pipeline("text-to-speech", "bilalfaye/speecht5_tts-wolof")
@@ -39,11 +37,35 @@ synthesiser_wolof = pipeline("text-to-speech", "bilalfaye/speecht5_tts-wolof")
39
 
40
 
41
 
42
- # Fonction 1 : Transcription audio Wolof -> texte Wolof
 
43
  def transcribe_audio_wolof(audio):
44
- result = pipe_wolof(audio)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  return result['text']
46
 
 
 
47
  # Fonction 2 : Traduction texte Wolof -> texte anglais
48
  def translate_wolof_to_english(wolof_text):
49
  translated = translator(wolof_text, src_lang="wol_Latn", tgt_lang="eng_Latn")
@@ -67,12 +89,8 @@ def text_to_speech(text, language, voice_type):
67
 
68
  # Fonction 4 : Transcription audio anglais -> texte anglais
69
  def transcribe_audio_english(audio):
70
- audio_data, samplerate = sf.read(audio)
71
- audio_data = audio_data.astype(np.float32)
72
- input_features = processor_whisper(audio_data, sampling_rate=16000, return_tensors="pt").input_features.to(device)
73
- predicted_ids = model_whisper.generate(input_features)
74
- transcription = processor_whisper.batch_decode(predicted_ids, skip_special_tokens=True)
75
- return transcription[0]
76
 
77
  # Fonction 5 : Traitement audio Wolof vers anglais
78
  def process_audio_wolof(audio, voice_type):
 
2
  import torch
3
  from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
4
  from datasets import load_dataset
5
+ import torchaudio
6
  import numpy as np
7
 
8
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
29
  speaker_embedding_wolof = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
30
 
31
  # Modèle 4 : Transcription audio anglais -> texte anglais
32
+ pipe_english = pipeline(task="automatic-speech-recognition", model="openai/whisper-small")
 
 
33
 
34
  # Modèle 5 : Texte anglais -> audio Wolof
35
  synthesiser_wolof = pipeline("text-to-speech", "bilalfaye/speecht5_tts-wolof")
 
37
 
38
 
39
 
40
+
41
+ # Function 1: Transcribe Wolof audio -> Wolof text
42
  def transcribe_audio_wolof(audio):
43
+ # Load the audio with torchaudio (returns tensor and sample rate)
44
+ waveform, sample_rate = torchaudio.load(audio)
45
+
46
+ # Check if audio is stereo
47
+ if waveform.shape[0] > 1: # Stereo (2 channels)
48
+ # Convert stereo to mono: sum the left and right channels and divide by 2
49
+ mono_audio = waveform.mean(dim=0, keepdim=True)
50
+ else:
51
+ # Audio is already mono
52
+ mono_audio = waveform
53
+
54
+ # Resample to 16000 Hz if not already
55
+ if sample_rate != 16000:
56
+ resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
57
+ mono_audio = resampler(mono_audio)
58
+ sample_rate = 16000
59
+
60
+ # Convert to numpy array for pipeline processing (if required)
61
+ mono_audio = mono_audio.squeeze(0).numpy()
62
+
63
+ # Pass the processed audio to the pipeline
64
+ result = pipe_wolof({"array": mono_audio, "sampling_rate": sample_rate})
65
  return result['text']
66
 
67
+
68
+
69
  # Fonction 2 : Traduction texte Wolof -> texte anglais
70
  def translate_wolof_to_english(wolof_text):
71
  translated = translator(wolof_text, src_lang="wol_Latn", tgt_lang="eng_Latn")
 
89
 
90
  # Fonction 4 : Transcription audio anglais -> texte anglais
91
  def transcribe_audio_english(audio):
92
+ transcription = pipe_english(audio)
93
+ return transcription["text"]
 
 
 
 
94
 
95
  # Fonction 5 : Traitement audio Wolof vers anglais
96
  def process_audio_wolof(audio, voice_type):