Spaces:

bilalfaye
/

wolof_english_translation

Running

App Files Files Community

wolof_english_translation / app.py

bilalfaye

Update app.py

513c3a7 verified about 2 months ago

raw

history blame contribute delete

5.47 kB

	import gradio as gr
	import torch
	from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
	from datasets import load_dataset
	import torchaudio
	import numpy as np

	# Run every model on the GPU when one is available, otherwise on the CPU.
	device = "cuda" if torch.cuda.is_available() else "cpu"

	# Model 1: Wolof <-> English text translation. The direction is selected per
	# call through the src_lang / tgt_lang arguments.
	model_name = "bilalfaye/nllb-200-distilled-600M-wolof-english"
	translator = pipeline("translation", model=model_name, device=device)

	# Model 2: Wolof audio -> Wolof text transcription.
	pipe_wolof = pipeline(
	    task="automatic-speech-recognition",
	    model="bilalfaye/wav2vec2-large-mms-1b-wolof",
	    processor="bilalfaye/wav2vec2-large-mms-1b-wolof",
	    device=device,
	)

	# Model 3: English text -> English audio.
	# The device is passed explicitly so this pipeline is placed like models 1 and 2.
	synthesiser_english = pipeline("text-to-speech", "microsoft/speecht5_tts", device=device)

	# Speaker x-vectors that condition the voice of both text-to-speech models.
	# Despite their names, text_to_speech() uses `speaker_embedding_english` for the
	# "Male" voice and `speaker_embedding_wolof` for any other voice choice,
	# independently of the output language.
	embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
	speaker_embedding_english = torch.tensor(embeddings_dataset[0]["xvector"]).unsqueeze(0).to(device)
	speaker_embedding_wolof = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)

	# Model 4: English audio -> English text transcription.
	pipe_english = pipeline(
	    task="automatic-speech-recognition",
	    model="openai/whisper-small",
	    device=device,
	)

	# Model 5: Wolof text -> Wolof audio.
	synthesiser_wolof = pipeline("text-to-speech", "bilalfaye/speecht5_tts-wolof", device=device)





	# Function 1: Wolof audio -> Wolof text transcription
	def transcribe_audio_wolof(audio):
	    """Transcribe a Wolof audio file to Wolof text.

	    The file is loaded with torchaudio, averaged down to a single channel
	    when it has more than one, resampled to 16000 Hz when needed, and then
	    handed to the `pipe_wolof` speech-recognition pipeline.

	    Args:
	        audio: Anything `torchaudio.load` accepts; the Gradio inputs in this
	            file supply a file path.

	    Returns:
	        The value stored under the 'text' key of the pipeline result.
	    """
	    target_rate = 16000
	    samples, rate = torchaudio.load(audio)

	    # More than one channel: average the channels into one.
	    if samples.shape[0] > 1:
	        samples = samples.mean(dim=0, keepdim=True)

	    # Bring the signal to the target rate when it was recorded at another one.
	    if rate != target_rate:
	        to_target = torchaudio.transforms.Resample(orig_freq=rate, new_freq=target_rate)
	        samples = to_target(samples)
	        rate = target_rate

	    # Drop the channel axis and hand the pipeline a NumPy array.
	    payload = {"array": samples.squeeze(0).numpy(), "sampling_rate": rate}
	    return pipe_wolof(payload)['text']



	# Function 2: Wolof text -> English text translation
	def translate_wolof_to_english(wolof_text):
	    """Translate Wolof text to English with the shared `translator` pipeline.

	    Args:
	        wolof_text: Text to translate, passed to the pipeline unchanged.

	    Returns:
	        The 'translation_text' field of the first pipeline result.
	    """
	    outputs = translator(wolof_text, src_lang="wol_Latn", tgt_lang="eng_Latn")
	    first_result = outputs[0]
	    return first_result['translation_text']

	def translate_english_to_wolof(wolof_text):
	    """Translate English text to Wolof with the shared `translator` pipeline.

	    Args:
	        wolof_text: The text to translate. NOTE: despite its name this is the
	            English source text (the call uses src_lang="eng_Latn"); the
	            parameter name is kept so existing callers keep working.

	    Returns:
	        The 'translation_text' field of the first pipeline result.
	    """
	    outputs = translator(wolof_text, src_lang="eng_Latn", tgt_lang="wol_Latn")
	    first_result = outputs[0]
	    return first_result['translation_text']


	# Function 3: Text -> English or Wolof audio
	def text_to_speech(text, language, voice_type):
	    """Synthesise speech for `text`.

	    Args:
	        text: The text to speak.
	        language: "english" selects `synthesiser_english`; any other value
	            selects `synthesiser_wolof`.
	        voice_type: "Male" selects `speaker_embedding_english`; any other
	            value selects `speaker_embedding_wolof`.

	    Returns:
	        A `(sampling_rate, audio)` tuple taken from the pipeline output.
	    """
	    synthesiser = synthesiser_english if language == "english" else synthesiser_wolof

	    # The speaker embedding depends only on the requested voice, not on the language.
	    if voice_type == "Male":
	        embedding = speaker_embedding_english
	    else:
	        embedding = speaker_embedding_wolof

	    speech = synthesiser(text, forward_params={"speaker_embeddings": embedding})
	    return speech["sampling_rate"], speech["audio"]

	# Function 4: English audio -> English text transcription
	def transcribe_audio_english(audio):
	    """Transcribe English audio with the `pipe_english` pipeline.

	    Args:
	        audio: Input forwarded unchanged to the pipeline; the Gradio inputs
	            in this file supply a file path.

	    Returns:
	        The value stored under the "text" key of the pipeline result.
	    """
	    return pipe_english(audio)["text"]

	# Function 5: Wolof audio -> English processing chain
	def process_audio_wolof(audio, voice_type):
	    """Run the full Wolof -> English chain: transcribe, translate, synthesise.

	    Args:
	        audio: Path of the Wolof audio file to process, or None when no audio
	            was supplied.
	        voice_type: Voice choice forwarded to `text_to_speech` ("Male" or
	            anything else).

	    Returns:
	        A `(wolof_text, english_text, audio_english)` tuple, where
	        `audio_english` is whatever `text_to_speech` returns.

	    Raises:
	        gr.Error: If `audio` is None, so the user sees a clear message
	            instead of a failure from inside the audio loader.
	    """
	    if audio is None:
	        raise gr.Error("Please upload or record a Wolof audio clip first.")

	    wolof_text = transcribe_audio_wolof(audio)
	    english_text = translate_wolof_to_english(wolof_text)
	    audio_english = text_to_speech(english_text, "english", voice_type)
	    return wolof_text, english_text, audio_english

	# Function 6: English audio -> Wolof processing chain
	def process_audio_english(audio, voice_type):
	    """Run the full English -> Wolof chain: transcribe, translate, synthesise.

	    Args:
	        audio: Path of the English audio file to process, or None when no
	            audio was supplied.
	        voice_type: Voice choice forwarded to `text_to_speech` ("Male" or
	            anything else).

	    Returns:
	        An `(english_text, wolof_text, audio_wolof)` tuple, where
	        `audio_wolof` is whatever `text_to_speech` returns.

	    Raises:
	        gr.Error: If `audio` is None, so the user sees a clear message
	            instead of a failure from inside the transcription pipeline.
	    """
	    if audio is None:
	        raise gr.Error("Please upload or record an English audio clip first.")

	    english_text = transcribe_audio_english(audio)
	    wolof_text = translate_english_to_wolof(english_text)
	    audio_wolof = text_to_speech(wolof_text, "wolof", voice_type)
	    return english_text, wolof_text, audio_wolof

	# Updated Gradio Interface
	# Tab 1: Wolof speech in; Wolof transcript, English translation and English speech out.
	_wolof_to_english_tab = gr.Interface(
	    fn=process_audio_wolof,
	    inputs=[
	        gr.Audio(sources=["upload", "microphone"], type="filepath", label="Provide Audio in Wolof"),
	        gr.Radio(["Male", "Female"], label="Select Voice Type"),
	    ],
	    outputs=[
	        gr.Textbox(label="Texte Wolof"),
	        gr.Textbox(label="Texte traduit en Anglais"),
	        gr.Audio(label="Audio en Anglais"),
	    ],
	    title="Wolof vers Anglais",
	    description="You can upload an audio file or record using a microphone to process Wolof audio.",
	)

	# Tab 2: English speech in; English transcript, Wolof translation and Wolof speech out.
	_english_to_wolof_tab = gr.Interface(
	    fn=process_audio_english,
	    inputs=[
	        gr.Audio(sources=["upload", "microphone"], type="filepath", label="Provide Audio in English"),
	        gr.Radio(["Male", "Female"], label="Select Voice Type"),
	    ],
	    outputs=[
	        gr.Textbox(label="Texte Anglais"),
	        gr.Textbox(label="Texte traduit en Wolof"),
	        gr.Audio(label="Audio en Wolof"),
	    ],
	    title="Anglais vers Wolof",
	    description="You can upload an audio file or record using a microphone to process English audio.",
	)

	# Both directions are exposed side by side as tabs of a single app.
	iface = gr.TabbedInterface(
	    [_wolof_to_english_tab, _english_to_wolof_tab],
	    tab_names=["Wolof vers Anglais", "Anglais vers Wolof"],
	)

	# Start the web server with debug mode on and a public share link requested.
	iface.launch(debug=True, share=True)