import base64
import os
from datetime import datetime

import gradio as gr
import librosa
import numpy as np
import torch
import spaces
from transformers import (
    pipeline,
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
)
from datasets import (
    load_dataset,
    concatenate_datasets,
    Dataset,
    Features,
    Value,
    Audio,
)

# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #

# Hugging Face evaluation dataset
HF_DATASET_NAME = "BounharAbdelaziz/Moroccan-STT-Eval-Dataset"

# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #

# Model paths
MODEL_PATHS = {
    "NANO": "BounharAbdelaziz/Morocco-Darija-STT-tiny-v1.4",
    "SMALL": "BounharAbdelaziz/Moroccan-Darija-STT-small-v1.6.14",  # previously "BounharAbdelaziz/Morocco-Darija-STT-small-v1.4"
    "MEDIUM": "BounharAbdelaziz/Morocco-Darija-STT-large-turbo-v1.4",
    "LARGE": "BounharAbdelaziz/Morocco-Darija-STT-large-v1.4",  # 1.6.13 25/01; previously "BounharAbdelaziz/Morocco-Darija-STT-large-v1.2"
}

# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #

# Access token for the models
STT_MODEL_TOKEN = os.environ.get("STT_MODEL_TOKEN")

# Access token for the dataset
STT_EVAL_DATASET_TOKEN = os.environ.get("STT_EVAL_DATASET_TOKEN")

# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #

def encode_image_to_base64(image_path):
    """Read an image file and return its contents as a base64-encoded string."""
    with open(image_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode()
    return encoded_string
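# Usage sketch (illustrative only, not part of the app flow): the returned
# string is a raw base64 payload, which create_html_image below wraps into a
# data URI for embedding in HTML:
#   encoded = encode_image_to_base64("logo_image.png")
#   uri = f"data:image/png;base64,{encoded}"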
# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #

def create_html_image(image_path):
    """Embed a local image as a base64 data URI inside a centered HTML block."""
    img_base64 = encode_image_to_base64(image_path)
    html_string = f"""
    <div style="display: flex; justify-content: center; width: 100%;">
        <img src="data:image/png;base64,{img_base64}" alt="Displayed Image"
             style="max-width: 300px; height: auto;"/>
    </div>
    """
    return html_string
""" return html_string # ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- # def save_to_hf_dataset(audio_signal, model_choice, transcription): print("[INFO] Loading dataset...") dataset = load_dataset(HF_DATASET_NAME, token=STT_EVAL_DATASET_TOKEN) print("[INFO] Dataset loaded successfully.") timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") new_entry = { "audio": [{"array": audio_signal, "sampling_rate": 16000}], "transcription": [transcription], "model_used": [model_choice], "timestamp": [timestamp], } new_dataset = Dataset.from_dict( new_entry, features=Features({ "audio": Audio(sampling_rate=16000), "transcription": Value("string"), "model_used": Value("string"), "timestamp": Value("string"), }) ) print("[INFO] Adding the new entry to the dataset...") train_dataset = dataset["train"] updated_train_dataset = concatenate_datasets([train_dataset, new_dataset]) dataset["train"] = updated_train_dataset print("[INFO] Pushing the updated dataset...") dataset.push_to_hub(HF_DATASET_NAME, token=STT_EVAL_DATASET_TOKEN) print("[INFO] Dataset updated and pushed successfully.") # ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- # def load_model(model_name): device = "cuda:0" if torch.cuda.is_available() else "cpu" torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 model_id = MODEL_PATHS[model_name.upper()] print("[INFO] Loading processor and model...") model = AutoModelForSpeechSeq2Seq.from_pretrained( model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, token=STT_MODEL_TOKEN ) model.to(device) processor = AutoProcessor.from_pretrained(model_id, token=STT_MODEL_TOKEN) pipe = pipeline( "automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, max_new_tokens=128, chunk_length_s=30, return_timestamps=False, #num_beams=4, torch_dtype=torch_dtype, device=device, generate_kwargs = {"task": "transcribe"}, # to make sure it always do transcription # "language":"<|ar|>", ) return pipe # return pipeline("automatic-speech-recognition", model=model_id, token=STT_MODEL_TOKEN, generate_kwargs = {"language":"<|ar|>","task": "transcribe"}) # ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- # @spaces.GPU def process_audio(audio, model_choice, save_data): # Force to false for now, issue with dataset # save_data = False pipe = load_model(model_choice) audio_signal = audio[1] sample_rate = audio[0] audio_signal = audio_signal.astype(np.float32) if np.abs(audio_signal).max() > 1.0: audio_signal = audio_signal / 32768.0 if sample_rate != 16000: print(f"[INFO] Resampling audio from {sample_rate}Hz to 16000Hz") audio_signal = librosa.resample( y=audio_signal, orig_sr=sample_rate, target_sr=16000 ) result = pipe(audio_signal) transcription = result["text"] if save_data: print(f"[INFO] Saving data to eval dataset...") save_to_hf_dataset(audio_signal, model_choice, transcription) return transcription # ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- # def create_interface(): with gr.Blocks(css="footer{display:none !important}") as app: base_path = os.path.dirname(__file__) 
# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #

def create_interface():
    with gr.Blocks(css="footer{display:none !important}") as app:
        base_path = os.path.dirname(__file__)
        local_image_path = os.path.join(base_path, "logo_image.png")
        gr.HTML(create_html_image(local_image_path))

        gr.Markdown("# 🇲🇦 🚀 Moroccan Fast Speech-to-Text Transcription 😍")
        gr.Markdown("⚠️ **Nota bene**: Make sure to click on **Stop** before hitting the **Transcribe** button")
        gr.Markdown("The **Large** model is now available! 🔥")

        with gr.Row():
            model_choice = gr.Dropdown(
                choices=["Nano", "Small", "Medium", "Large"],
                value="Small",
                label="Select one of the models",
            )

        with gr.Row():
            audio_input = gr.Audio(
                sources=["microphone"],
                type="numpy",
                label="Record Audio",
            )

        with gr.Row():
            save_data = gr.Checkbox(
                label="Contribute to the evaluation benchmark",
                value=True,
            )

        submit_btn = gr.Button("Transcribe 🔥")
        output_text = gr.Textbox(label="Transcription", text_align="right")

        gr.Markdown("""
        ### 📄📌 Notice to our dearest users 🤗 (coming soon)
        - By transcribing your audio, you’re actively contributing to the development of a benchmark evaluation dataset for Moroccan speech-to-text models.
        - Your transcriptions will be logged into a dedicated Hugging Face dataset, playing a crucial role in advancing research and innovation in speech recognition for Moroccan dialects and languages.
        - Together, we’re building tools that better understand and serve the unique linguistic landscape of Morocco.
        - We count on your **thoughtfulness and responsibility** when using the app. Thank you for your contribution! 🌟
        """)

        submit_btn.click(
            fn=process_audio,
            inputs=[audio_input, model_choice, save_data],
            outputs=output_text,
        )

        gr.Markdown("")

    return app