import argparse
import os
import subprocess
import sys
from pathlib import Path

import gradio as gr

from STT.sst import speech_to_text
from LLM.llm import generate_reply
from TTS_X.tts import generate_voice
from FantasyTalking.infer import load_models, main

# Download the model checkpoints if they are not present yet.
if not os.path.exists("./models/fantasytalking_model.ckpt"):
    subprocess.run(["python", "download_models.py"], check=True)

sys.path.append(os.path.abspath("."))

# Shared inference arguments; the per-request fields (image, audio, prompt)
# are filled in by generate_video().
args_template = argparse.Namespace(
    fantasytalking_model_path="./models/fantasytalking_model.ckpt",
    wav2vec_model_dir="./models/wav2vec2-base-960h",
    wan_model_dir="./models/Wan2.1-I2V-14B-720P",
    image_path="",
    audio_path="",
    prompt="",
    output_dir="./output",
    image_size=512,
    audio_scale=1.0,
    prompt_cfg_scale=5.0,
    audio_cfg_scale=5.0,
    max_num_frames=81,
    inference_steps=20,
    fps=23,
    num_persistent_param_in_dit=None,
    seed=1111,
)

# Load the heavy models once at startup so each request only runs inference.
pipe, fantasytalking, wav2vec_processor, wav2vec = load_models(args_template)
print("✅ Models loaded")


def generate_video(image_path, audio_path, prompt, output_dir="./output"):
    """Run FantasyTalking inference with the preloaded models."""
    args_dict = vars(args_template).copy()
    args_dict.update({
        "image_path": image_path,
        "audio_path": audio_path,
        "prompt": prompt,
        "output_dir": output_dir,
    })
    args = argparse.Namespace(**args_dict)
    return main(args, pipe, fantasytalking, wav2vec_processor, wav2vec)


def full_pipeline(user_audio, user_image):
    Path("./output").mkdir(parents=True, exist_ok=True)

    # Speech -> text -> LLM reply -> spoken reply, then animate the image
    # with the spoken reply. NOTE: the exact signatures of speech_to_text,
    # generate_reply and generate_voice are assumed from their names;
    # adjust to match the STT / LLM / TTS_X modules in this repo.
    user_text = speech_to_text(user_audio)
    reply_text = generate_reply(user_text)
    reply_audio = generate_voice(reply_text)

    video_path = generate_video(
        image_path=user_image,
        audio_path=reply_audio,
        prompt="..."  # can be left empty, or set to any short description
    )
    return user_text, reply_text, reply_audio, video_path


with gr.Blocks() as demo:
    gr.Markdown("## Realtime Interactive Avatar 🎭")

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(label="Upload Voice", type="filepath")
            image_input = gr.Image(label="Upload Image", type="filepath")
            btn = gr.Button("Generate")
        with gr.Column():
            user_text = gr.Textbox(label="Transcribed Text (Speech to Text)")
            reply_text = gr.Textbox(label="Assistant Response (LLM)")
            reply_audio = gr.Audio(label="Spoken Response (Text to Speech)")
            video_output = gr.Video(label="Final Generated Video")

    btn.click(
        fn=full_pipeline,
        inputs=[audio_input, image_input],
        outputs=[user_text, reply_text, reply_audio, video_output],
    )

demo.launch(inbrowser=True, share=True)
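
# Usage sketch: generate_video() can also be called directly, bypassing the
# Gradio UI, which is handy for testing the FantasyTalking stage in isolation.
# Assumption: the module name ("app") and the example paths below are
# illustrative placeholders, not files shipped with this repo.
#
#   from app import generate_video
#   video_path = generate_video(
#       image_path="./examples/portrait.png",
#       audio_path="./examples/speech.wav",
#       prompt="a person speaking to the camera",
#   )
#   print(video_path)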