wedyanessam committed on
Commit
97e8796
·
verified ·
1 Parent(s): aa3c3a8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -45
app.py CHANGED
@@ -1,74 +1,68 @@
1
  import gradio as gr
 
 
 
2
  from STT.sst import speech_to_text
3
  from LLM.llm import generate_reply
4
  from TTS_X.tts import generate_voice
5
- from FantasyTalking.inference import generate_video
6
  from FantasyTalking.infer import load_models, main
7
- from pathlib import Path
8
- import argparse
9
 
10
- # Load FantasyTalking models
11
- pipe, fantasytalking, wav2vec_processor, wav2vec = load_models(
12
- argparse.Namespace(
13
- wan_model_dir="./models/Wan2.1-I2V-14B-720P",
14
- fantasytalking_model_path="./models/fantasytalking_model.ckpt",
15
- wav2vec_model_dir="./models/wav2vec2-base-960h",
16
- image_path="",
17
- audio_path="",
18
- prompt="",
19
- output_dir="./output",
20
- image_size=512,
21
- audio_scale=1.0,
22
- prompt_cfg_scale=5.0,
23
- audio_cfg_scale=5.0,
24
- max_num_frames=81,
25
- inference_steps=20,
26
- fps=23,
27
- num_persistent_param_in_dit=None,
28
- seed=1111
29
- )
30
  )
31
 
32
- def generate_video(image_path, audio_path, prompt, output_dir):
 
 
 
33
  args = argparse.Namespace(
34
- wan_model_dir="./models/Wan2.1-I2V-14B-720P",
35
- fantasytalking_model_path="./models/fantasytalking_model.ckpt",
36
- wav2vec_model_dir="./models/wav2vec2-base-960h",
37
  image_path=image_path,
38
  audio_path=audio_path,
39
  prompt=prompt,
40
- output_dir=output_dir,
41
- image_size=512,
42
- audio_scale=1.0,
43
- prompt_cfg_scale=5.0,
44
- audio_cfg_scale=5.0,
45
- max_num_frames=81,
46
- inference_steps=20,
47
- fps=23,
48
- num_persistent_param_in_dit=None,
49
- seed=1111
50
  )
51
  return main(args, pipe, fantasytalking, wav2vec_processor, wav2vec)
52
 
 
53
  def full_pipeline(user_audio, user_image):
 
54
  user_text = speech_to_text(user_audio)
 
 
55
  reply = generate_reply(user_text)
56
- reply_audio_path = generate_voice(reply)
57
 
58
- # Generate video from reply voice + user image
59
- output_dir = "./output"
60
- Path(output_dir).mkdir(parents=True, exist_ok=True)
61
 
 
 
62
  video_path = generate_video(
63
  image_path=user_image,
64
  audio_path=reply_audio_path,
65
- prompt=reply,
66
- output_dir=output_dir
67
  )
68
 
69
  return user_text, reply, reply_audio_path, video_path
70
 
71
 
 
72
  with gr.Blocks(title="๐Ÿง  ุตูˆุชูƒ ูŠุญุฑูƒ ุตูˆุฑุฉ!") as demo:
73
  gr.Markdown("## ๐ŸŽคโžก๏ธ๐Ÿ’ฌโžก๏ธ๐Ÿ”Šโžก๏ธ๐Ÿ“ฝ๏ธ ู…ู† ุตูˆุชูƒ ุฅู„ู‰ ููŠุฏูŠูˆ ู…ุชูƒู„ู…!")
74
 
@@ -84,8 +78,8 @@ with gr.Blocks(title="๐Ÿง  ุตูˆุชูƒ ูŠุญุฑูƒ ุตูˆุฑุฉ!") as demo:
84
  reply_audio = gr.Audio(label="๐Ÿ”Š ุงู„ุฑุฏ ุงู„ู…ู†ุทูˆู‚")
85
  video_output = gr.Video(label="๐Ÿ“ฝ๏ธ ุงู„ููŠุฏูŠูˆ ุงู„ู†ุงุชุฌ")
86
 
87
- btn.click(fn=full_pipeline, inputs=[audio_input, image_input],
 
88
  outputs=[user_text, reply_text, reply_audio, video_output])
89
 
90
  demo.launch(inbrowser=True, share=True)
91
-
 
1
  import gradio as gr
2
+ from pathlib import Path
3
+ import argparse
4
+
5
  from STT.sst import speech_to_text
6
  from LLM.llm import generate_reply
7
  from TTS_X.tts import generate_voice
 
8
  from FantasyTalking.infer import load_models, main
 
 
9
 
10
+ # ุซุงุจุชุงุช ุชุญู…ูŠู„ ุงู„ู†ู…ูˆุฐุฌ
11
+ args_template = argparse.Namespace(
12
+ wan_model_dir="./models/Wan2.1-I2V-14B-720P",
13
+ fantasytalking_model_path="./models/fantasytalking_model.ckpt",
14
+ wav2vec_model_dir="./models/wav2vec2-base-960h",
15
+ image_path="",
16
+ audio_path="",
17
+ prompt="",
18
+ output_dir="./output",
19
+ image_size=512,
20
+ audio_scale=1.0,
21
+ prompt_cfg_scale=5.0,
22
+ audio_cfg_scale=5.0,
23
+ max_num_frames=81,
24
+ inference_steps=20,
25
+ fps=23,
26
+ num_persistent_param_in_dit=None,
27
+ seed=1111
 
 
28
  )
29
 
30
+ # ุชุญู…ูŠู„ ุงู„ู†ู…ุงุฐุฌ ู…ุฑุฉ ูˆุญุฏุฉ ูู‚ุท
31
+ pipe, fantasytalking, wav2vec_processor, wav2vec = load_models(args_template)
32
+
33
+ def generate_video(image_path, audio_path, prompt, output_dir="./output"):
34
  args = argparse.Namespace(
35
+ **vars(args_template),
 
 
36
  image_path=image_path,
37
  audio_path=audio_path,
38
  prompt=prompt,
39
+ output_dir=output_dir
 
 
 
 
 
 
 
 
 
40
  )
41
  return main(args, pipe, fantasytalking, wav2vec_processor, wav2vec)
42
 
43
+
44
  def full_pipeline(user_audio, user_image):
45
+ # 1. ุชุญูˆูŠู„ ุงู„ุตูˆุช ุฅู„ู‰ ู†ุต
46
  user_text = speech_to_text(user_audio)
47
+
48
+ # 2. ุชูˆู„ูŠุฏ ุงู„ุฑุฏ ู…ู† LLM
49
  reply = generate_reply(user_text)
 
50
 
51
+ # 3. ุชุญูˆูŠู„ ุงู„ุฑุฏ ุฅู„ู‰ ุตูˆุช
52
+ reply_audio_path = generate_voice(reply)
 
53
 
54
+ # 4. ุชูˆู„ูŠุฏ ููŠุฏูŠูˆ ู…ู† ุงู„ุตูˆุฑุฉ ูˆุงู„ุตูˆุช
55
+ Path("./output").mkdir(parents=True, exist_ok=True)
56
  video_path = generate_video(
57
  image_path=user_image,
58
  audio_path=reply_audio_path,
59
+ prompt=reply
 
60
  )
61
 
62
  return user_text, reply, reply_audio_path, video_path
63
 
64
 
65
+ # ูˆุงุฌู‡ุฉ Gradio
66
  with gr.Blocks(title="๐Ÿง  ุตูˆุชูƒ ูŠุญุฑูƒ ุตูˆุฑุฉ!") as demo:
67
  gr.Markdown("## ๐ŸŽคโžก๏ธ๐Ÿ’ฌโžก๏ธ๐Ÿ”Šโžก๏ธ๐Ÿ“ฝ๏ธ ู…ู† ุตูˆุชูƒ ุฅู„ู‰ ููŠุฏูŠูˆ ู…ุชูƒู„ู…!")
68
 
 
78
  reply_audio = gr.Audio(label="๐Ÿ”Š ุงู„ุฑุฏ ุงู„ู…ู†ุทูˆู‚")
79
  video_output = gr.Video(label="๐Ÿ“ฝ๏ธ ุงู„ููŠุฏูŠูˆ ุงู„ู†ุงุชุฌ")
80
 
81
+ btn.click(fn=full_pipeline,
82
+ inputs=[audio_input, image_input],
83
  outputs=[user_text, reply_text, reply_audio, video_output])
84
 
85
  demo.launch(inbrowser=True, share=True)