pillaryao committed
Commit c3c5a50 · verified · 1 Parent(s): b207b64

Upload folder using huggingface_hub

Files changed (1):
  1. gradio_app.py +152 -202

gradio_app.py CHANGED
@@ -1,219 +1,169 @@
 import gradio as gr
-import os
-import sys
-import shutil
-import uuid
-import subprocess
-from glob import glob
-from huggingface_hub import snapshot_download

 # Download models
 os.makedirs("checkpoints", exist_ok=True)
-
 snapshot_download(
     repo_id = "chunyu-li/LatentSync",
     local_dir = "./checkpoints"
 )

-import tempfile
-from moviepy.editor import VideoFileClip
-from pydub import AudioSegment
-
-def process_video(input_video_path, temp_dir="temp_dir"):
-    """
-    Crop a given MP4 video to a maximum duration of 10 seconds if it is longer than 10 seconds.
-    Save the new video in the specified folder (default is temp_dir).
-
-    Args:
-        input_video_path (str): Path to the input video file.
-        temp_dir (str): Directory where the processed video will be saved.
-
-    Returns:
-        str: Path to the cropped video file.
-    """
-    # Ensure the temp_dir exists
-    os.makedirs(temp_dir, exist_ok=True)
-
-    # Load the video
-    video = VideoFileClip(input_video_path)
-
-    # Determine the output path
-    input_file_name = os.path.basename(input_video_path)
-    output_video_path = os.path.join(temp_dir, f"cropped_{input_file_name}")
-
-    # Crop the video to 10 seconds if necessary
-    if video.duration > 10:
-        video = video.subclip(0, 10)
-
-    # Write the cropped video to the output path
-    video.write_videofile(output_video_path, codec="libx264", audio_codec="aac")
-
-    # Return the path to the cropped video
-    return output_video_path
-
-def process_audio(file_path, temp_dir):
-    # Load the audio file
-    audio = AudioSegment.from_file(file_path)
-
-    # Check and cut the audio if longer than 8 seconds
-    max_duration = 8 * 1000  # 8 seconds in milliseconds
-    if len(audio) > max_duration:
-        audio = audio[:max_duration]
-
-    # Save the processed audio in the temporary directory
-    output_path = os.path.join(temp_dir, "trimmed_audio.wav")
-    audio.export(output_path, format="wav")
-
-    # Return the path to the trimmed file
-    print(f"Processed audio saved at: {output_path}")
-    return output_path

-import argparse
-from omegaconf import OmegaConf
-import torch
-from diffusers import AutoencoderKL, DDIMScheduler
-from latentsync.models.unet import UNet3DConditionModel
-from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
-from diffusers.utils.import_utils import is_xformers_available
-from accelerate.utils import set_seed
-from latentsync.whisper.audio2feature import Audio2Feature
-
-
-def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
-    inference_ckpt_path = "checkpoints/latentsync_unet.pt"
-    unet_config_path = "configs/unet/second_stage.yaml"
-    config = OmegaConf.load(unet_config_path)
-
-    print(f"Input video path: {video_path}")
-    print(f"Input audio path: {audio_path}")
-    print(f"Loaded checkpoint path: {inference_ckpt_path}")
-
-    is_shared_ui = True if "fffiloni/LatentSync" in os.environ['SPACE_ID'] else False
-    temp_dir = None
-    if is_shared_ui:
-        temp_dir = tempfile.mkdtemp()
-        cropped_video_path = process_video(video_path)
-        print(f"Cropped video saved to: {cropped_video_path}")
-        video_path = cropped_video_path
-
-        trimmed_audio_path = process_audio(audio_path, temp_dir)
-        print(f"Processed file was stored temporarily at: {trimmed_audio_path}")
-        audio_path = trimmed_audio_path
-
-    scheduler = DDIMScheduler.from_pretrained("configs")
-
-    if config.model.cross_attention_dim == 768:
-        whisper_model_path = "checkpoints/whisper/small.pt"
-    elif config.model.cross_attention_dim == 384:
-        whisper_model_path = "checkpoints/whisper/tiny.pt"
-    else:
-        raise NotImplementedError("cross_attention_dim must be 768 or 384")
-
-    audio_encoder = Audio2Feature(model_path=whisper_model_path, device="cuda", num_frames=config.data.num_frames)
-
-    vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)
-    vae.config.scaling_factor = 0.18215
-    vae.config.shift_factor = 0
-
-    unet, _ = UNet3DConditionModel.from_pretrained(
-        OmegaConf.to_container(config.model),
-        inference_ckpt_path,  # load checkpoint
-        device="cpu",
     )

-    unet = unet.to(dtype=torch.float16)
-
-    # set xformers
-    if is_xformers_available():
-        unet.enable_xformers_memory_efficient_attention()
-
-    pipeline = LipsyncPipeline(
-        vae=vae,
-        audio_encoder=audio_encoder,
-        unet=unet,
-        scheduler=scheduler,
-    ).to("cuda")
-
-    seed = -1
-    if seed != -1:
-        set_seed(seed)
-    else:
-        torch.seed()
-
-    print(f"Initial seed: {torch.initial_seed()}")
-
-    unique_id = str(uuid.uuid4())
-    video_out_path = f"video_out{unique_id}.mp4"
-
-    pipeline(
-        video_path=video_path,
-        audio_path=audio_path,
-        video_out_path=video_out_path,
-        video_mask_path=video_out_path.replace(".mp4", "_mask.mp4"),
-        num_frames=config.data.num_frames,
-        num_inference_steps=config.run.inference_steps,
-        guidance_scale=1.0,
-        weight_dtype=torch.float16,
-        width=config.data.resolution,
-        height=config.data.resolution,
     )

-    if is_shared_ui:
-        # Clean up the temporary directory
-        if os.path.exists(temp_dir):
-            shutil.rmtree(temp_dir)
-            print(f"Temporary directory {temp_dir} deleted.")
-
-    return video_out_path
-
-
-css="""
-div#col-container{
-    margin: 0 auto;
-    max-width: 982px;
-}
-"""
-with gr.Blocks(css=css) as demo:
-    with gr.Column(elem_id="col-container"):
-        gr.Markdown("# LatentSync: Audio Conditioned Latent Diffusion Models for Lip Sync")
-        gr.Markdown("LatentSync is an end-to-end lip sync framework based on audio conditioned latent diffusion models without any intermediate motion representation, diverging from previous diffusion-based lip sync methods based on pixel space diffusion or two-stage generation.")
-        gr.HTML("""
-        <div style="display:flex;column-gap:4px;">
-            <a href="https://github.com/bytedance/LatentSync">
-                <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
-            </a>
-            <a href="https://arxiv.org/abs/2412.09262">
-                <img src='https://img.shields.io/badge/ArXiv-Paper-red'>
-            </a>
-            <a href="https://huggingface.co/spaces/fffiloni/LatentSync?duplicate=true">
-                <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-sm.svg" alt="Duplicate this Space">
-            </a>
-            <a href="https://huggingface.co/fffiloni">
-                <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/follow-me-on-HF-sm-dark.svg" alt="Follow me on HF">
-            </a>
-        </div>
-        """)
-        with gr.Row():
-            with gr.Column():
-                video_input = gr.Video(label="Video Control", format="mp4")
-                audio_input = gr.Audio(label="Audio Input", type="filepath")
-                submit_btn = gr.Button("Submit")
-            with gr.Column():
-                video_result = gr.Video(label="Result")
-
-        gr.Examples(
-            examples = [
-                ["assets/demo1_video.mp4", "assets/demo1_audio.wav"],
-                ["assets/demo2_video.mp4", "assets/demo2_audio.wav"],
-                ["assets/demo3_video.mp4", "assets/demo3_audio.wav"],
-            ],
-            inputs = [video_input, audio_input]
         )
-
-        submit_btn.click(
-            fn = main,
-            inputs = [video_input, audio_input],
-            outputs = [video_result]
         )

-demo.queue().launch(show_api=False, show_error=True)
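Note on the removed helpers: when running on the shared Space, the old app trimmed inputs before inference (at most 10 seconds of video via moviepy, 8 seconds of audio via pydub), while the rewritten app below passes the uploaded files through untrimmed. For reference, here is a minimal, self-contained sketch of that trimming step, assuming moviepy < 2.0 (the moviepy.editor API used above) and pydub are installed; the helper names trim_video and trim_audio are illustrative, not part of the repository.

import os

from moviepy.editor import VideoFileClip
from pydub import AudioSegment


def trim_video(input_video_path, temp_dir="temp_dir", max_seconds=10):
    # Crop the video to at most max_seconds and save it under temp_dir.
    os.makedirs(temp_dir, exist_ok=True)
    clip = VideoFileClip(input_video_path)
    if clip.duration > max_seconds:
        clip = clip.subclip(0, max_seconds)
    out_path = os.path.join(temp_dir, f"cropped_{os.path.basename(input_video_path)}")
    clip.write_videofile(out_path, codec="libx264", audio_codec="aac")
    return out_path


def trim_audio(input_audio_path, temp_dir="temp_dir", max_seconds=8):
    # Cut the audio to at most max_seconds and export it as WAV under temp_dir.
    os.makedirs(temp_dir, exist_ok=True)
    audio = AudioSegment.from_file(input_audio_path)
    max_ms = max_seconds * 1000  # pydub slices in milliseconds
    if len(audio) > max_ms:
        audio = audio[:max_ms]
    out_path = os.path.join(temp_dir, "trimmed_audio.wav")
    audio.export(out_path, format="wav")
    return out_path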
 
 
 import gradio as gr
+from pathlib import Path
+from scripts.inference import main
+from omegaconf import OmegaConf
+import argparse
+from datetime import datetime

 # Download models
+from huggingface_hub import snapshot_download
+import os
 os.makedirs("checkpoints", exist_ok=True)
 snapshot_download(
     repo_id = "chunyu-li/LatentSync",
     local_dir = "./checkpoints"
 )

+CONFIG_PATH = Path("configs/unet/second_stage.yaml")
+CHECKPOINT_PATH = Path("checkpoints/latentsync_unet.pt")
+
+
+def process_video(
+    video_path,
+    audio_path,
+    guidance_scale,
+    inference_steps,
+    seed,
+):
+    # Create the temp directory if it doesn't exist
+    output_dir = Path("./temp")
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Convert paths to absolute Path objects and normalize them
+    video_file_path = Path(video_path)
+    video_path = video_file_path.absolute().as_posix()
+    audio_path = Path(audio_path).absolute().as_posix()
+
+    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
+    # Set the output path for the processed video
+    output_path = str(output_dir / f"{video_file_path.stem}_{current_time}.mp4")  # Change the filename as needed
+
+    config = OmegaConf.load(CONFIG_PATH)
+
+    config["run"].update(
+        {
+            "guidance_scale": guidance_scale,
+            "inference_steps": inference_steps,
+        }
+    )
+
+    # Parse the arguments
+    args = create_args(video_path, audio_path, output_path, inference_steps, guidance_scale, seed)
+
+    try:
+        result = main(
+            config=config,
+            args=args,
+        )
+        print("Processing completed successfully.")
+        return output_path  # Ensure the output path is returned
+    except Exception as e:
+        print(f"Error during processing: {str(e)}")
+        raise gr.Error(f"Error during processing: {str(e)}")
+
+
+def create_args(
+    video_path: str, audio_path: str, output_path: str, inference_steps: int, guidance_scale: float, seed: int
+) -> argparse.Namespace:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--inference_ckpt_path", type=str, required=True)
+    parser.add_argument("--video_path", type=str, required=True)
+    parser.add_argument("--audio_path", type=str, required=True)
+    parser.add_argument("--video_out_path", type=str, required=True)
+    parser.add_argument("--inference_steps", type=int, default=20)
+    parser.add_argument("--guidance_scale", type=float, default=1.0)
+    parser.add_argument("--seed", type=int, default=1247)
+
+    return parser.parse_args(
+        [
+            "--inference_ckpt_path",
+            CHECKPOINT_PATH.absolute().as_posix(),
+            "--video_path",
+            video_path,
+            "--audio_path",
+            audio_path,
+            "--video_out_path",
+            output_path,
+            "--inference_steps",
+            str(inference_steps),
+            "--guidance_scale",
+            str(guidance_scale),
+            "--seed",
+            str(seed),
+        ]
     )

+
+# Create Gradio interface
+with gr.Blocks(title="LatentSync Video Processing") as demo:
+    gr.Markdown(
+        """
+        # LatentSync: Audio Conditioned Latent Diffusion Models for Lip Sync
+        Upload a video and audio file to process with the LatentSync model.
+
+        <div align="center">
+        <strong>Chunyu Li1,2 Chao Zhang1 Weikai Xu1 Jinghui Xie1,† Weiguo Feng1
+        Bingyue Peng1 Weiwei Xing2,†</strong>
+        </div>
+
+        <div align="center">
+        <strong>1ByteDance 2Beijing Jiaotong University</strong>
+        </div>
+
+        <div style="display:flex;justify-content:center;column-gap:4px;">
+        <a href="https://github.com/bytedance/LatentSync">
+            <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
+        </a>
+        <a href="https://arxiv.org/pdf/2412.09262">
+            <img src='https://img.shields.io/badge/ArXiv-Paper-red'>
+        </a>
+        </div>
+        """
     )

+    with gr.Row():
+        with gr.Column():
+            video_input = gr.Video(label="Input Video")
+            audio_input = gr.Audio(label="Input Audio", type="filepath")
+
+            with gr.Row():
+                guidance_scale = gr.Slider(
+                    minimum=1.0,
+                    maximum=3.5,
+                    value=1.5,
+                    step=0.5,
+                    label="Guidance Scale",
                 )
+                inference_steps = gr.Slider(minimum=10, maximum=50, value=20, step=1, label="Inference Steps")
+
+            with gr.Row():
+                seed = gr.Number(value=1247, label="Random Seed", precision=0)
+
+            process_btn = gr.Button("Process Video")
+
+        with gr.Column():
+            video_output = gr.Video(label="Output Video")
+
+            gr.Examples(
+                examples=[
+                    ["assets/demo1_video.mp4", "assets/demo1_audio.wav"],
+                    ["assets/demo2_video.mp4", "assets/demo2_audio.wav"],
+                    ["assets/demo3_video.mp4", "assets/demo3_audio.wav"],
+                ],
+                inputs=[video_input, audio_input],
+            )
+
+    process_btn.click(
+        fn=process_video,
+        inputs=[
+            video_input,
+            audio_input,
+            guidance_scale,
+            inference_steps,
+            seed,
+        ],
+        outputs=video_output,
     )

+if __name__ == "__main__":
+    demo.launch(inbrowser=True, share=True)
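
The new entry point delegates inference to scripts.inference.main, driven by the argparse.Namespace that create_args builds, so the same function can be called without the Gradio UI. Below is a minimal sketch of a batch driver, assuming it runs from the repository root with the checkpoints already downloaded and a CUDA-capable GPU; note that importing gradio_app also executes the module-level snapshot_download call.

# Hypothetical batch driver, not part of this commit; it reuses process_video()
# exactly as defined in gradio_app.py above.
from gradio_app import process_video

pairs = [
    ("assets/demo1_video.mp4", "assets/demo1_audio.wav"),
    ("assets/demo2_video.mp4", "assets/demo2_audio.wav"),
]

for video, audio in pairs:
    # Same knobs the UI exposes: guidance scale, step count, and seed.
    out = process_video(video, audio, guidance_scale=1.5, inference_steps=20, seed=1247)
    print(f"{video} -> {out}")  # e.g. temp/demo1_video_<timestamp>.mp4

Since gr.Error subclasses Exception, a failure inside process_video propagates as a normal exception here, so the loop stops at the first error.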