wedyanessam committed
Commit 3402d0b · verified · 1 Parent(s): a073983

Update app.py

Files changed (1)
  1. app.py +77 -15
app.py CHANGED
@@ -2,27 +2,89 @@ import gradio as gr
  from STT.sst import speech_to_text
  from LLM.llm import generate_reply
  from TTS_X.tts import generate_voice
- import download_models

- def process(audio):
-     user_text = speech_to_text(audio)
      reply = generate_reply(user_text)
-     reply_audio = generate_voice(reply)
-
-     return user_text, reply, reply_audio

- with gr.Blocks() as demo:
-     gr.Markdown("## 🗣️➡️💬➡️🔊 من صوتك إلى رد منطوق!")
-
-     audio_input = gr.Audio(label="🎤 ارفع صوتك", type="filepath")
-     user_text = gr.Textbox(label="📜 النص المسموع")
-     reply_text = gr.Textbox(label="🤖 رد المساعد")
-     reply_audio = gr.Audio(label="🔊 الرد بالصوت")
-
-     btn = gr.Button("ابدأ")
-
-     btn.click(process, inputs=audio_input, outputs=[user_text, reply_text, reply_audio])

- demo.launch()
  from STT.sst import speech_to_text
  from LLM.llm import generate_reply
  from TTS_X.tts import generate_voice
+ from fantasy_talking.infer import load_models, main
+ from pathlib import Path
+ import argparse

+ # Load FantasyTalking models
+ pipe, fantasytalking, wav2vec_processor, wav2vec = load_models(
+     argparse.Namespace(
+         wan_model_dir="./models/Wan2.1-I2V-14B-720P",
+         fantasytalking_model_path="./models/fantasytalking_model.ckpt",
+         wav2vec_model_dir="./models/wav2vec2-base-960h",
+         image_path="",
+         audio_path="",
+         prompt="",
+         output_dir="./output",
+         image_size=512,
+         audio_scale=1.0,
+         prompt_cfg_scale=5.0,
+         audio_cfg_scale=5.0,
+         max_num_frames=81,
+         inference_steps=20,
+         fps=23,
+         num_persistent_param_in_dit=None,
+         seed=1111
+     )
+ )

+ def generate_video(image_path, audio_path, prompt, output_dir):
+     args = argparse.Namespace(
+         wan_model_dir="./models/Wan2.1-I2V-14B-720P",
+         fantasytalking_model_path="./models/fantasytalking_model.ckpt",
+         wav2vec_model_dir="./models/wav2vec2-base-960h",
+         image_path=image_path,
+         audio_path=audio_path,
+         prompt=prompt,
+         output_dir=output_dir,
+         image_size=512,
+         audio_scale=1.0,
+         prompt_cfg_scale=5.0,
+         audio_cfg_scale=5.0,
+         max_num_frames=81,
+         inference_steps=20,
+         fps=23,
+         num_persistent_param_in_dit=None,
+         seed=1111
+     )
+     return main(args, pipe, fantasytalking, wav2vec_processor, wav2vec)
+
+ def full_pipeline(user_audio, user_image):
+     user_text = speech_to_text(user_audio)
      reply = generate_reply(user_text)
+     reply_audio_path = generate_voice(reply)
+
+     # Generate video from reply voice + user image
+     output_dir = "./output"
+     Path(output_dir).mkdir(parents=True, exist_ok=True)
+
+     video_path = generate_video(
+         image_path=user_image,
+         audio_path=reply_audio_path,
+         prompt=reply,
+         output_dir=output_dir
+     )
+
+     return user_text, reply, reply_audio_path, video_path
+

+ with gr.Blocks(title="🧠 صوتك يحرك صورة!") as demo:
+     gr.Markdown("## 🎤➡️💬➡️🔊➡️📽️ من صوتك إلى فيديو متكلم!")

+     with gr.Row():
+         with gr.Column():
+             audio_input = gr.Audio(label="🎙️ ارفع صوتك", type="filepath")
+             image_input = gr.Image(label="🖼️ صورة المتحدث", type="filepath")
+             btn = gr.Button("🎬 شغل")

+         with gr.Column():
+             user_text = gr.Textbox(label="📝 النص المسموع")
+             reply_text = gr.Textbox(label="🤖 رد المساعد")
+             reply_audio = gr.Audio(label="🔊 الرد المنطوق")
+             video_output = gr.Video(label="📽️ الفيديو الناتج")

+     btn.click(fn=full_pipeline, inputs=[audio_input, image_input],
+               outputs=[user_text, reply_text, reply_audio, video_output])

+ demo.launch(inbrowser=True, share=True)
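
Since the update removes `import download_models`, the checkpoints referenced above (./models/Wan2.1-I2V-14B-720P, ./models/fantasytalking_model.ckpt, ./models/wav2vec2-base-960h) must already exist on disk before app.py is imported, because `load_models()` now runs at module import time. The sketch below shows one possible pre-fetch script using `huggingface_hub`; the repo IDs are assumptions and are not confirmed by this commit. (The Arabic UI labels read roughly: "upload your voice", "speaker image", "run", "transcribed text", "assistant reply", "spoken reply", "resulting video".)

# download_models.py -- hypothetical pre-fetch script; repo IDs are assumptions
from huggingface_hub import snapshot_download, hf_hub_download

# Full model repos downloaded into the paths app.py expects
snapshot_download("Wan-AI/Wan2.1-I2V-14B-720P", local_dir="./models/Wan2.1-I2V-14B-720P")
snapshot_download("facebook/wav2vec2-base-960h", local_dir="./models/wav2vec2-base-960h")

# Single FantasyTalking checkpoint file, saved as ./models/fantasytalking_model.ckpt
hf_hub_download("acvlab/FantasyTalking", "fantasytalking_model.ckpt", local_dir="./models")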