# demo/gradio_app.py
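# Gradio demo for LatentSync: downloads the pretrained checkpoint from the
# Hugging Face Hub and exposes scripts.inference.main through a simple web UI.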
import gradio as gr
from pathlib import Path
from scripts.inference import main
from omegaconf import OmegaConf
import argparse
from datetime import datetime
# Download models
from huggingface_hub import snapshot_download
import os
os.makedirs("checkpoints", exist_ok=True)
snapshot_download(
    repo_id="chunyu-li/LatentSync",
    local_dir="./checkpoints",
)
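
# Second-stage UNet config and the LatentSync checkpoint downloaded into ./checkpoints above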
CONFIG_PATH = Path("configs/unet/second_stage.yaml")
CHECKPOINT_PATH = Path("checkpoints/latentsync_unet.pt")
def process_video(
    video_path,
    audio_path,
    guidance_scale,
    inference_steps,
    seed,
):
    # Create the temp directory if it doesn't exist
    output_dir = Path("./temp")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Convert paths to absolute Path objects and normalize them
    video_file_path = Path(video_path)
    video_path = video_file_path.absolute().as_posix()
    audio_path = Path(audio_path).absolute().as_posix()

    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Set the output path for the processed video
    output_path = str(output_dir / f"{video_file_path.stem}_{current_time}.mp4")  # Change the filename as needed
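
    # The guidance scale and step count chosen in the UI override the defaults in the second-stage config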
    config = OmegaConf.load(CONFIG_PATH)
    config["run"].update(
        {
            "guidance_scale": guidance_scale,
            "inference_steps": inference_steps,
        }
    )

    # Parse the arguments
    args = create_args(video_path, audio_path, output_path, inference_steps, guidance_scale, seed)

    try:
        result = main(
            config=config,
            args=args,
        )
        print("Processing completed successfully.")
        return output_path  # Ensure the output path is returned
    except Exception as e:
        print(f"Error during processing: {str(e)}")
        raise gr.Error(f"Error during processing: {str(e)}")
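

# Build the argparse.Namespace that scripts.inference.main expects, populated
# from the UI values, so inference can run in-process instead of via the CLI.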
def create_args(
    video_path: str, audio_path: str, output_path: str, inference_steps: int, guidance_scale: float, seed: int
) -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument("--inference_ckpt_path", type=str, required=True)
    parser.add_argument("--video_path", type=str, required=True)
    parser.add_argument("--audio_path", type=str, required=True)
    parser.add_argument("--video_out_path", type=str, required=True)
    parser.add_argument("--inference_steps", type=int, default=20)
    parser.add_argument("--guidance_scale", type=float, default=1.0)
    parser.add_argument("--seed", type=int, default=1247)

    return parser.parse_args(
        [
            "--inference_ckpt_path",
            CHECKPOINT_PATH.absolute().as_posix(),
            "--video_path",
            video_path,
            "--audio_path",
            audio_path,
            "--video_out_path",
            output_path,
            "--inference_steps",
            str(inference_steps),
            "--guidance_scale",
            str(guidance_scale),
            "--seed",
            str(seed),
        ]
    )
# Create Gradio interface
with gr.Blocks(title="LatentSync Video Processing") as demo:
    gr.Markdown(
        """
        # LatentSync: Audio Conditioned Latent Diffusion Models for Lip Sync

        Upload a video and an audio file to process with the LatentSync model.

        <div align="center">
        <strong>Chunyu Li<sup>1,2</sup> Chao Zhang<sup>1</sup> Weikai Xu<sup>1</sup> Jinghui Xie<sup>1,†</sup> Weiguo Feng<sup>1</sup>
        Bingyue Peng<sup>1</sup> Weiwei Xing<sup>2,†</sup></strong>
        </div>

        <div align="center">
        <strong><sup>1</sup>ByteDance  <sup>2</sup>Beijing Jiaotong University</strong>
        </div>

        <div style="display:flex;justify-content:center;column-gap:4px;">
            <a href="https://github.com/bytedance/LatentSync">
                <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
            </a>
            <a href="https://arxiv.org/pdf/2412.09262">
                <img src='https://img.shields.io/badge/ArXiv-Paper-red'>
            </a>
        </div>
        """
    )
    with gr.Row():
        with gr.Column():
            video_input = gr.Video(label="Input Video")
            audio_input = gr.Audio(label="Input Audio", type="filepath")

            with gr.Row():
                guidance_scale = gr.Slider(
                    minimum=1.0,
                    maximum=3.5,
                    value=1.5,
                    step=0.5,
                    label="Guidance Scale",
                )
                inference_steps = gr.Slider(minimum=10, maximum=50, value=20, step=1, label="Inference Steps")

            with gr.Row():
                seed = gr.Number(value=1247, label="Random Seed", precision=0)

            process_btn = gr.Button("Process Video")

        with gr.Column():
            video_output = gr.Video(label="Output Video")

            gr.Examples(
                examples=[
                    ["assets/demo1_video.mp4", "assets/demo1_audio.wav"],
                    ["assets/demo2_video.mp4", "assets/demo2_audio.wav"],
                    ["assets/demo3_video.mp4", "assets/demo3_audio.wav"],
                ],
                inputs=[video_input, audio_input],
            )
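
    # Clicking the button runs process_video; the returned file path populates the output player.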
    process_btn.click(
        fn=process_video,
        inputs=[
            video_input,
            audio_input,
            guidance_scale,
            inference_steps,
            seed,
        ],
        outputs=video_output,
    )
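
# share=True requests a temporary public Gradio link in addition to the local
# server; inbrowser=True opens the UI in a browser tab automatically.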
if __name__ == "__main__":
    demo.launch(inbrowser=True, share=True)