import argparse
import os
import subprocess
import sys
from pathlib import Path

import gradio as gr

from STT.sst import speech_to_text
from LLM.llm import generate_reply
from TTS_X.tts import generate_voice
from FantasyTalking.infer import load_models, main

# Download the model checkpoints if they are not present yet.
if not os.path.exists("./models/fantasytalking_model.ckpt"):
    subprocess.run(["python", "download_models.py"], check=True)

sys.path.append(os.path.abspath("."))

# Shared inference arguments; the per-request fields (image, audio, prompt)
# are filled in by generate_video().
args_template = argparse.Namespace(
    fantasytalking_model_path="./models/fantasytalking_model.ckpt",
    wav2vec_model_dir="./models/wav2vec2-base-960h",
    wan_model_dir="./models/Wan2.1-I2V-14B-720P",
    image_path="",
    audio_path="",
    prompt="",
    output_dir="./output",
    image_size=512,
    audio_scale=1.0,
    prompt_cfg_scale=5.0,
    audio_cfg_scale=5.0,
    max_num_frames=81,
    inference_steps=20,
    fps=23,
    num_persistent_param_in_dit=None,
    seed=1111,
)

# Load the heavy models once at startup so each request only runs inference.
pipe, fantasytalking, wav2vec_processor, wav2vec = load_models(args_template)
print("✅ Models loaded")


def generate_video(image_path, audio_path, prompt, output_dir="./output"):
    """Run FantasyTalking inference with the preloaded models."""
    args_dict = vars(args_template).copy()
    args_dict.update({
        "image_path": image_path,
        "audio_path": audio_path,
        "prompt": prompt,
        "output_dir": output_dir,
    })
    args = argparse.Namespace(**args_dict)
    return main(args, pipe, fantasytalking, wav2vec_processor, wav2vec)


def full_pipeline(user_audio, user_image):
    Path("./output").mkdir(parents=True, exist_ok=True)

    # Speech -> text -> LLM reply -> spoken reply, then animate the image
    # with the spoken reply. NOTE: the exact signatures of speech_to_text,
    # generate_reply and generate_voice are assumed from their names;
    # adjust to match the STT / LLM / TTS_X modules in this repo.
    user_text = speech_to_text(user_audio)
    reply_text = generate_reply(user_text)
    reply_audio = generate_voice(reply_text)

    video_path = generate_video(
        image_path=user_image,
        audio_path=reply_audio,
        prompt="..."  # can be left empty, or set to any short description
    )
    return user_text, reply_text, reply_audio, video_path


with gr.Blocks() as demo:
    gr.Markdown("## Realtime Interactive Avatar 🎭")

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(label="Upload Voice", type="filepath")
            image_input = gr.Image(label="Upload Image", type="filepath")
            btn = gr.Button("Generate")
        with gr.Column():
            user_text = gr.Textbox(label="Transcribed Text (Speech to Text)")
            reply_text = gr.Textbox(label="Assistant Response (LLM)")
            reply_audio = gr.Audio(label="Spoken Response (Text to Speech)")
            video_output = gr.Video(label="Final Generated Video")

    btn.click(
        fn=full_pipeline,
        inputs=[audio_input, image_input],
        outputs=[user_text, reply_text, reply_audio, video_output],
    )

demo.launch(inbrowser=True, share=True)
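
# Usage sketch: generate_video() can also be called directly, bypassing the
# Gradio UI, which is handy for testing the FantasyTalking stage in isolation.
# Assumption: the module name ("app") and the example paths below are
# illustrative placeholders, not files shipped with this repo.
#
#   from app import generate_video
#   video_path = generate_video(
#       image_path="./examples/portrait.png",
#       audio_path="./examples/speech.wav",
#       prompt="a person speaking to the camera",
#   )
#   print(video_path)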