import base64 import os import gradio as gr from transformers import ( pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor, ) import numpy as np import librosa from datetime import datetime from datasets import ( load_dataset, concatenate_datasets, Dataset, DatasetDict, Features, Value, Audio, ) import torch import spaces # ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- # # Hugging Face evaluation dataset HF_DATASET_NAME = "BounharAbdelaziz/Moroccan-STT-Eval-Dataset" # ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- # # Models paths MODEL_PATHS = { "NANO": "BounharAbdelaziz/Morocco-Darija-STT-tiny-v1.4", "SMALL": "BounharAbdelaziz/Moroccan-Darija-STT-small-v1.6.14", #"BounharAbdelaziz/Morocco-Darija-STT-small-v1.4", "MEDIUM": "BounharAbdelaziz/Morocco-Darija-STT-large-turbo-v1.4", "LARGE": "BounharAbdelaziz/Morocco-Darija-STT-large-v1.4", # 1.6.13 25/01 "BounharAbdelaziz/Morocco-Darija-STT-large-v1.4", #"BounharAbdelaziz/Morocco-Darija-STT-large-v1.2", } # ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- # # Access token to models STT_MODEL_TOKEN = os.environ.get("STT_MODEL_TOKEN") # Access token to dataset STT_EVAL_DATASET_TOKEN = os.environ.get("STT_EVAL_DATASET_TOKEN") # ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- # def encode_image_to_base64(image_path): with open(image_path, "rb") as image_file: encoded_string = base64.b64encode(image_file.read()).decode() return encoded_string # ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- # def create_html_image(image_path): img_base64 = encode_image_to_base64(image_path) html_string = f"""
Displayed Image
""" return html_string # ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- # def save_to_hf_dataset(audio_signal, model_choice, transcription): print("[INFO] Loading dataset...") dataset = load_dataset(HF_DATASET_NAME, token=STT_EVAL_DATASET_TOKEN) print("[INFO] Dataset loaded successfully.") timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") new_entry = { "audio": [{"array": audio_signal, "sampling_rate": 16000}], "transcription": [transcription], "model_used": [model_choice], "timestamp": [timestamp], } new_dataset = Dataset.from_dict( new_entry, features=Features({ "audio": Audio(sampling_rate=16000), "transcription": Value("string"), "model_used": Value("string"), "timestamp": Value("string"), }) ) print("[INFO] Adding the new entry to the dataset...") train_dataset = dataset["train"] updated_train_dataset = concatenate_datasets([train_dataset, new_dataset]) dataset["train"] = updated_train_dataset print("[INFO] Pushing the updated dataset...") dataset.push_to_hub(HF_DATASET_NAME, token=STT_EVAL_DATASET_TOKEN) print("[INFO] Dataset updated and pushed successfully.") # ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- # def load_model(model_name): device = "cuda:0" if torch.cuda.is_available() else "cpu" torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 model_id = MODEL_PATHS[model_name.upper()] print("[INFO] Loading processor and model...") model = AutoModelForSpeechSeq2Seq.from_pretrained( model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, token=STT_MODEL_TOKEN ) model.to(device) processor = AutoProcessor.from_pretrained(model_id, token=STT_MODEL_TOKEN) pipe = pipeline( "automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, max_new_tokens=128, chunk_length_s=30, return_timestamps=False, #num_beams=4, torch_dtype=torch_dtype, device=device, generate_kwargs = {"task": "transcribe"}, # to make sure it always do transcription # "language":"<|ar|>", ) return pipe # return pipeline("automatic-speech-recognition", model=model_id, token=STT_MODEL_TOKEN, generate_kwargs = {"language":"<|ar|>","task": "transcribe"}) # ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- # @spaces.GPU def process_audio(audio, model_choice, save_data): # Force to false for now, issue with dataset # save_data = False pipe = load_model(model_choice) audio_signal = audio[1] sample_rate = audio[0] audio_signal = audio_signal.astype(np.float32) if np.abs(audio_signal).max() > 1.0: audio_signal = audio_signal / 32768.0 if sample_rate != 16000: print(f"[INFO] Resampling audio from {sample_rate}Hz to 16000Hz") audio_signal = librosa.resample( y=audio_signal, orig_sr=sample_rate, target_sr=16000 ) result = pipe(audio_signal) transcription = result["text"] if save_data: print(f"[INFO] Saving data to eval dataset...") save_to_hf_dataset(audio_signal, model_choice, transcription) return transcription # ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- # def create_interface(): with gr.Blocks(css="footer{display:none !important}") as app: base_path = os.path.dirname(__file__) local_image_path = os.path.join(base_path, 'logo_image.png') gr.HTML(create_html_image(local_image_path)) gr.Markdown("# πŸ‡²πŸ‡¦ πŸš€ Moroccan Fast Speech-to-Text Transcription 😍") gr.Markdown("⚠️ **Nota bene**: Make sure to click on **Stop** before hitting the **Transcribe** button") gr.Markdown("The **Large** model is now available! πŸ”₯") with gr.Row(): model_choice = gr.Dropdown( choices=["Nano", "Small", "Medium", "Large"], value="Small", label="Select one of the models" ) with gr.Row(): audio_input = gr.Audio( sources=["microphone"], type="numpy", label="Record Audio", ) with gr.Row(): save_data = gr.Checkbox( label="Contribute to the evaluation benchmark", value=True, ) submit_btn = gr.Button("Transcribe πŸ”₯") output_text = gr.Textbox(label="Transcription", text_align="right") gr.Markdown(""" ### πŸ“„πŸ“Œ Notice to our dearest users πŸ€— (coming soon) - By transcribing your audio, you’re actively contributing to the development of a benchmark evaluation dataset for Moroccan speech-to-text models. - Your transcriptions will be logged into a dedicated Hugging Face dataset, playing a crucial role in advancing research and innovation in speech recognition for Moroccan dialects and languages. - Together, we’re building tools that better understand and serve the unique linguistic landscape of Morocco. - We count on your **thoughtfulness and responsibility** when using the app. Thank you for your contribution! 🌟 """) submit_btn.click( fn=process_audio, inputs=[audio_input, model_choice, save_data], outputs=output_text ) gr.Markdown("
") return app