pillaryao committed
Commit b207b64 · verified · 1 Parent(s): 3c524e6

Upload folder using huggingface_hub

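For context, the commit title above is the default message written by the Hugging Face Hub client's folder-upload helper, which is likely also what the `gradio deploy` step in the workflow below ends up calling. A minimal sketch of that kind of upload is shown here; the `repo_id` and `folder_path` values are illustrative placeholders, not values recorded in this commit.

```python
# Illustrative sketch only: roughly how a Space folder gets pushed with
# huggingface_hub. repo_id and folder_path are placeholders, not taken
# from this commit.
from huggingface_hub import HfApi

api = HfApi()  # picks up a token from a prior login() or the HF_TOKEN env var
api.upload_folder(
    folder_path=".",                  # local directory to upload
    repo_id="pillaryao/demo",         # placeholder Space id
    repo_type="space",
    commit_message="Upload folder using huggingface_hub",
)
```
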
.github/workflows/update_space.yml ADDED
@@ -0,0 +1,28 @@
+ name: Run Python script
+
+ on:
+   push:
+     branches:
+       - dev
+
+ jobs:
+   build:
+     runs-on: ubuntu-latest
+
+     steps:
+       - name: Checkout
+         uses: actions/checkout@v2
+
+       - name: Set up Python
+         uses: actions/setup-python@v2
+         with:
+           python-version: '3.9'
+
+       - name: Install Gradio
+         run: python -m pip install gradio
+
+       - name: Log in to Hugging Face
+         run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'
+
+       - name: Deploy to Spaces
+         run: gradio deploy
README.md CHANGED
@@ -2,7 +2,7 @@
  title: demo
  app_file: gradio_app.py
  sdk: gradio
- sdk_version: 5.12.0
+ sdk_version: 5.14.0
  ---
  # LatentSync: Audio Conditioned Latent Diffusion Models for Lip Sync
assets/demo1_video.mp4 CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:092ff3cc8d8bc60490cfd4632745ea80ddb80e6e53e73fb9158aa9cdc6dee585
- size 2278221
+ oid sha256:ed2dd1e2001aa605c3f2d77672a8af4ed55e427a85c55d408adfc3d5076bc872
+ size 1240008
assets/demo2_video.mp4 CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b275b97fea803c888da66c9a9815492edfb4545f2a7a85263946f11a6a4ae7b6
- size 3688891
+ oid sha256:8c3f10288e0642e587a95c0040e6966f8f6b7e003c3a17b572f72472b896d8ff
+ size 1772492
assets/demo3_video.mp4 CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:fee4988f803bea01deaaa8983a1ff42f6a83e30497ae53acd7decefb57de7a7d
- size 4710464
+ oid sha256:cfa177b2a44f7809f606285c120e270d526caa50d708ec95e0f614d220970e0f
+ size 2112370
gradio_app.py CHANGED
@@ -1,160 +1,219 @@
  import gradio as gr
- from pathlib import Path
- from scripts.inference import main
- from omegaconf import OmegaConf
- import argparse
- from datetime import datetime
-
- CONFIG_PATH = Path("configs/unet/second_stage.yaml")
- CHECKPOINT_PATH = Path("checkpoints/latentsync_unet.pt")
-
-
- def process_video(
-     video_path,
-     audio_path,
-     guidance_scale,
-     inference_steps,
-     seed,
- ):
-     # Create the temp directory if it doesn't exist
-     output_dir = Path("./temp")
-     output_dir.mkdir(parents=True, exist_ok=True)
-
-     # Convert paths to absolute Path objects and normalize them
-     video_file_path = Path(video_path)
-     video_path = video_file_path.absolute().as_posix()
-     audio_path = Path(audio_path).absolute().as_posix()
-
-     current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
-     # Set the output path for the processed video
-     output_path = str(output_dir / f"{video_file_path.stem}_{current_time}.mp4")  # Change the filename as needed
-
-     config = OmegaConf.load(CONFIG_PATH)
-
-     config["run"].update(
-         {
-             "guidance_scale": guidance_scale,
-             "inference_steps": inference_steps,
-         }
-     )

-     # Parse the arguments
-     args = create_args(video_path, audio_path, output_path, inference_steps, guidance_scale, seed)
-
-     try:
-         result = main(
-             config=config,
-             args=args,
-         )
-         print("Processing completed successfully.")
-         return output_path  # Ensure the output path is returned
-     except Exception as e:
-         print(f"Error during processing: {str(e)}")
-         raise gr.Error(f"Error during processing: {str(e)}")
-
-
- def create_args(
-     video_path: str, audio_path: str, output_path: str, inference_steps: int, guidance_scale: float, seed: int
- ) -> argparse.Namespace:
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--inference_ckpt_path", type=str, required=True)
-     parser.add_argument("--video_path", type=str, required=True)
-     parser.add_argument("--audio_path", type=str, required=True)
-     parser.add_argument("--video_out_path", type=str, required=True)
-     parser.add_argument("--inference_steps", type=int, default=20)
-     parser.add_argument("--guidance_scale", type=float, default=1.0)
-     parser.add_argument("--seed", type=int, default=1247)
-
-     return parser.parse_args(
-         [
-             "--inference_ckpt_path",
-             CHECKPOINT_PATH.absolute().as_posix(),
-             "--video_path",
-             video_path,
-             "--audio_path",
-             audio_path,
-             "--video_out_path",
-             output_path,
-             "--inference_steps",
-             str(inference_steps),
-             "--guidance_scale",
-             str(guidance_scale),
-             "--seed",
-             str(seed),
-         ]
      )

-
- # Create Gradio interface
- with gr.Blocks(title="LatentSync Video Processing") as demo:
-     gr.Markdown(
-         """
-         # LatentSync: Audio Conditioned Latent Diffusion Models for Lip Sync
-         Upload a video and audio file to process with LatentSync model.
-
-         <div align="center">
-             <strong>Chunyu Li1,2 Chao Zhang1 Weikai Xu1 Jinghui Xie1,† Weiguo Feng1
-             Bingyue Peng1 Weiwei Xing2,†</strong>
-         </div>
-
-         <div align="center">
-             <strong>1ByteDance 2Beijing Jiaotong University</strong>
-         </div>
-
-         <div style="display:flex;justify-content:center;column-gap:4px;">
-             <a href="https://github.com/bytedance/LatentSync">
-                 <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
-             </a>
-             <a href="https://arxiv.org/pdf/2412.09262">
-                 <img src='https://img.shields.io/badge/ArXiv-Paper-red'>
-             </a>
-         </div>
-         """
      )

-     with gr.Row():
-         with gr.Column():
-             video_input = gr.Video(label="Input Video")
-             audio_input = gr.Audio(label="Input Audio", type="filepath")
-
-             with gr.Row():
-                 guidance_scale = gr.Slider(
-                     minimum=1.0,
-                     maximum=3.5,
-                     value=1.5,
-                     step=0.5,
-                     label="Guidance Scale",
                  )
-             inference_steps = gr.Slider(minimum=10, maximum=50, value=20, step=1, label="Inference Steps")
-
-             with gr.Row():
-                 seed = gr.Number(value=1247, label="Random Seed", precision=0)
-
-             process_btn = gr.Button("Process Video")
-
-         with gr.Column():
-             video_output = gr.Video(label="Output Video")
-
-     gr.Examples(
-         examples=[
-             ["assets/demo1_video.mp4", "assets/demo1_audio.wav"],
-             ["assets/demo2_video.mp4", "assets/demo2_audio.wav"],
-             ["assets/demo3_video.mp4", "assets/demo3_audio.wav"],
-         ],
-         inputs=[video_input, audio_input],
-     )
-
-     process_btn.click(
-         fn=process_video,
-         inputs=[
-             video_input,
-             audio_input,
-             guidance_scale,
-             inference_steps,
-             seed,
-         ],
-         outputs=video_output,
      )

- if __name__ == "__main__":
-     demo.launch(inbrowser=True, share=True)
  import gradio as gr
+ import os
+ import sys
+ import shutil
+ import uuid
+ import subprocess
+ from glob import glob
+ from huggingface_hub import snapshot_download
+
+ # Download models
+ os.makedirs("checkpoints", exist_ok=True)
+
+ snapshot_download(
+     repo_id = "chunyu-li/LatentSync",
+     local_dir = "./checkpoints"
+ )
+
+ import tempfile
+ from moviepy.editor import VideoFileClip
+ from pydub import AudioSegment
+
+ def process_video(input_video_path, temp_dir="temp_dir"):
+     """
+     Crop a given MP4 video to a maximum duration of 10 seconds if it is longer than 10 seconds.
+     Save the new video in the specified folder (default is temp_dir).
+
+     Args:
+         input_video_path (str): Path to the input video file.
+         temp_dir (str): Directory where the processed video will be saved.
+
+     Returns:
+         str: Path to the cropped video file.
+     """
+     # Ensure the temp_dir exists
+     os.makedirs(temp_dir, exist_ok=True)
+
+     # Load the video
+     video = VideoFileClip(input_video_path)
+
+     # Determine the output path
+     input_file_name = os.path.basename(input_video_path)
+     output_video_path = os.path.join(temp_dir, f"cropped_{input_file_name}")
+
+     # Crop the video to 10 seconds if necessary
+     if video.duration > 10:
+         video = video.subclip(0, 10)
+
+     # Write the cropped video to the output path
+     video.write_videofile(output_video_path, codec="libx264", audio_codec="aac")
+
+     # Return the path to the cropped video
+     return output_video_path
+
+ def process_audio(file_path, temp_dir):
+     # Load the audio file
+     audio = AudioSegment.from_file(file_path)
+
+     # Check and cut the audio if longer than 8 seconds
+     max_duration = 8 * 1000  # 8 seconds in milliseconds
+     if len(audio) > max_duration:
+         audio = audio[:max_duration]
+
+     # Save the processed audio in the temporary directory
+     output_path = os.path.join(temp_dir, "trimmed_audio.wav")
+     audio.export(output_path, format="wav")
+
+     # Return the path to the trimmed file
+     print(f"Processed audio saved at: {output_path}")
+     return output_path

+ import argparse
+ from omegaconf import OmegaConf
+ import torch
+ from diffusers import AutoencoderKL, DDIMScheduler
+ from latentsync.models.unet import UNet3DConditionModel
+ from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
+ from diffusers.utils.import_utils import is_xformers_available
+ from accelerate.utils import set_seed
+ from latentsync.whisper.audio2feature import Audio2Feature
+
+
+ def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
+     inference_ckpt_path = "checkpoints/latentsync_unet.pt"
+     unet_config_path = "configs/unet/second_stage.yaml"
+     config = OmegaConf.load(unet_config_path)
+
+     print(f"Input video path: {video_path}")
+     print(f"Input audio path: {audio_path}")
+     print(f"Loaded checkpoint path: {inference_ckpt_path}")
+
+     is_shared_ui = True if "fffiloni/LatentSync" in os.environ['SPACE_ID'] else False
+     temp_dir = None
+     if is_shared_ui:
+         temp_dir = tempfile.mkdtemp()
+         cropped_video_path = process_video(video_path)
+         print(f"Cropped video saved to: {cropped_video_path}")
+         video_path=cropped_video_path
+
+         trimmed_audio_path = process_audio(audio_path, temp_dir)
+         print(f"Processed file was stored temporarily at: {trimmed_audio_path}")
+         audio_path=trimmed_audio_path
+
+     scheduler = DDIMScheduler.from_pretrained("configs")
+
+     if config.model.cross_attention_dim == 768:
+         whisper_model_path = "checkpoints/whisper/small.pt"
+     elif config.model.cross_attention_dim == 384:
+         whisper_model_path = "checkpoints/whisper/tiny.pt"
+     else:
+         raise NotImplementedError("cross_attention_dim must be 768 or 384")
+
+     audio_encoder = Audio2Feature(model_path=whisper_model_path, device="cuda", num_frames=config.data.num_frames)
+
+     vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)
+     vae.config.scaling_factor = 0.18215
+     vae.config.shift_factor = 0
+
+     unet, _ = UNet3DConditionModel.from_pretrained(
+         OmegaConf.to_container(config.model),
+         inference_ckpt_path,  # load checkpoint
+         device="cpu",
      )

+     unet = unet.to(dtype=torch.float16)
+
+     # set xformers
+     if is_xformers_available():
+         unet.enable_xformers_memory_efficient_attention()
+
+     pipeline = LipsyncPipeline(
+         vae=vae,
+         audio_encoder=audio_encoder,
+         unet=unet,
+         scheduler=scheduler,
+     ).to("cuda")
+
+     seed = -1
+     if seed != -1:
+         set_seed(seed)
+     else:
+         torch.seed()
+
+     print(f"Initial seed: {torch.initial_seed()}")
+
+     unique_id = str(uuid.uuid4())
+     video_out_path = f"video_out{unique_id}.mp4"
+
+     pipeline(
+         video_path=video_path,
+         audio_path=audio_path,
+         video_out_path=video_out_path,
+         video_mask_path=video_out_path.replace(".mp4", "_mask.mp4"),
+         num_frames=config.data.num_frames,
+         num_inference_steps=config.run.inference_steps,
+         guidance_scale=1.0,
+         weight_dtype=torch.float16,
+         width=config.data.resolution,
+         height=config.data.resolution,
      )

+     if is_shared_ui:
+         # Clean up the temporary directory
+         if os.path.exists(temp_dir):
+             shutil.rmtree(temp_dir)
+             print(f"Temporary directory {temp_dir} deleted.")
+
+     return video_out_path
+
+
+ css="""
+ div#col-container{
+     margin: 0 auto;
+     max-width: 982px;
+ }
+ """
+ with gr.Blocks(css=css) as demo:
+     with gr.Column(elem_id="col-container"):
+         gr.Markdown("# LatentSync: Audio Conditioned Latent Diffusion Models for Lip Sync")
+         gr.Markdown("LatentSync, an end-to-end lip sync framework based on audio conditioned latent diffusion models without any intermediate motion representation, diverging from previous diffusion-based lip sync methods based on pixel space diffusion or two-stage generation.")
+         gr.HTML("""
+         <div style="display:flex;column-gap:4px;">
+             <a href="https://github.com/bytedance/LatentSync">
+                 <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
+             </a>
+             <a href="https://arxiv.org/abs/2412.09262">
+                 <img src='https://img.shields.io/badge/ArXiv-Paper-red'>
+             </a>
+             <a href="https://huggingface.co/spaces/fffiloni/LatentSync?duplicate=true">
+                 <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-sm.svg" alt="Duplicate this Space">
+             </a>
+             <a href="https://huggingface.co/fffiloni">
+                 <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/follow-me-on-HF-sm-dark.svg" alt="Follow me on HF">
+             </a>
+         </div>
+         """)
+         with gr.Row():
+             with gr.Column():
+                 video_input = gr.Video(label="Video Control", format="mp4")
+                 audio_input = gr.Audio(label="Audio Input", type="filepath")
+                 submit_btn = gr.Button("Submit")
+             with gr.Column():
+                 video_result = gr.Video(label="Result")
+
+                 gr.Examples(
+                     examples = [
+                         ["assets/demo1_video.mp4", "assets/demo1_audio.wav"],
+                         ["assets/demo2_video.mp4", "assets/demo2_audio.wav"],
+                         ["assets/demo3_video.mp4", "assets/demo3_audio.wav"],
+                     ],
+                     inputs = [video_input, audio_input]
                  )
+
+     submit_btn.click(
+         fn = main,
+         inputs = [video_input, audio_input],
+         outputs = [video_result]
      )

+ demo.queue().launch(show_api=False, show_error=True)
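
A side note on the model download introduced in the new `gradio_app.py`: calling `snapshot_download` with only `repo_id` and `local_dir` fetches the entire `chunyu-li/LatentSync` repository, while the app afterwards only reads the UNet and Whisper checkpoints. A possible refinement, not part of this commit and based only on the checkpoint paths referenced in the code, would be to narrow the transfer with `allow_patterns`:

```python
# Sketch of a narrower download (assumption: only these files are needed,
# inferred from the paths used later in gradio_app.py; not part of the commit).
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="chunyu-li/LatentSync",
    local_dir="./checkpoints",
    allow_patterns=["latentsync_unet.pt", "whisper/*.pt"],
)
```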