pillaryao committed
Commit c3c5a50 · verified · 1 Parent(s): b207b64

Upload folder using huggingface_hub

Files changed (1):
  1. gradio_app.py +152 -202

gradio_app.py CHANGED
@@ -1,219 +1,169 @@
 import gradio as gr
-import os
-import sys
-import shutil
-import uuid
-import subprocess
-from glob import glob
-from huggingface_hub import snapshot_download

 # Download models
 os.makedirs("checkpoints", exist_ok=True)
-
 snapshot_download(
     repo_id = "chunyu-li/LatentSync",
     local_dir = "./checkpoints"
 )

-import tempfile
-from moviepy.editor import VideoFileClip
-from pydub import AudioSegment
-
-def process_video(input_video_path, temp_dir="temp_dir"):
-    """
-    Crop a given MP4 video to a maximum duration of 10 seconds if it is longer than 10 seconds.
-    Save the new video in the specified folder (default is temp_dir).
-
-    Args:
-        input_video_path (str): Path to the input video file.
-        temp_dir (str): Directory where the processed video will be saved.
-
-    Returns:
-        str: Path to the cropped video file.
-    """
-    # Ensure the temp_dir exists
-    os.makedirs(temp_dir, exist_ok=True)
-
-    # Load the video
-    video = VideoFileClip(input_video_path)
-
-    # Determine the output path
-    input_file_name = os.path.basename(input_video_path)
-    output_video_path = os.path.join(temp_dir, f"cropped_{input_file_name}")
-
-    # Crop the video to 10 seconds if necessary
-    if video.duration > 10:
-        video = video.subclip(0, 10)
-
-    # Write the cropped video to the output path
-    video.write_videofile(output_video_path, codec="libx264", audio_codec="aac")
-
-    # Return the path to the cropped video
-    return output_video_path
-
-def process_audio(file_path, temp_dir):
-    # Load the audio file
-    audio = AudioSegment.from_file(file_path)
-
-    # Check and cut the audio if longer than 8 seconds
-    max_duration = 8 * 1000  # 8 seconds in milliseconds
-    if len(audio) > max_duration:
-        audio = audio[:max_duration]
-
-    # Save the processed audio in the temporary directory
-    output_path = os.path.join(temp_dir, "trimmed_audio.wav")
-    audio.export(output_path, format="wav")
-
-    # Return the path to the trimmed file
-    print(f"Processed audio saved at: {output_path}")
-    return output_path

-import argparse
-from omegaconf import OmegaConf
-import torch
-from diffusers import AutoencoderKL, DDIMScheduler
-from latentsync.models.unet import UNet3DConditionModel
-from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
-from diffusers.utils.import_utils import is_xformers_available
-from accelerate.utils import set_seed
-from latentsync.whisper.audio2feature import Audio2Feature
-
-
-def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
-    inference_ckpt_path = "checkpoints/latentsync_unet.pt"
-    unet_config_path = "configs/unet/second_stage.yaml"
-    config = OmegaConf.load(unet_config_path)
-
-    print(f"Input video path: {video_path}")
-    print(f"Input audio path: {audio_path}")
-    print(f"Loaded checkpoint path: {inference_ckpt_path}")
-
-    is_shared_ui = True if "fffiloni/LatentSync" in os.environ['SPACE_ID'] else False
-    temp_dir = None
-    if is_shared_ui:
-        temp_dir = tempfile.mkdtemp()
-        cropped_video_path = process_video(video_path)
-        print(f"Cropped video saved to: {cropped_video_path}")
-        video_path = cropped_video_path
-
-        trimmed_audio_path = process_audio(audio_path, temp_dir)
-        print(f"Processed file was stored temporarily at: {trimmed_audio_path}")
-        audio_path = trimmed_audio_path
-
-    scheduler = DDIMScheduler.from_pretrained("configs")
-
-    if config.model.cross_attention_dim == 768:
-        whisper_model_path = "checkpoints/whisper/small.pt"
-    elif config.model.cross_attention_dim == 384:
-        whisper_model_path = "checkpoints/whisper/tiny.pt"
-    else:
-        raise NotImplementedError("cross_attention_dim must be 768 or 384")
-
-    audio_encoder = Audio2Feature(model_path=whisper_model_path, device="cuda", num_frames=config.data.num_frames)
-
-    vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)
-    vae.config.scaling_factor = 0.18215
-    vae.config.shift_factor = 0
-
-    unet, _ = UNet3DConditionModel.from_pretrained(
-        OmegaConf.to_container(config.model),
-        inference_ckpt_path,  # load checkpoint
-        device="cpu",
     )

-    unet = unet.to(dtype=torch.float16)
-
-    # set xformers
-    if is_xformers_available():
-        unet.enable_xformers_memory_efficient_attention()
-
-    pipeline = LipsyncPipeline(
-        vae=vae,
-        audio_encoder=audio_encoder,
-        unet=unet,
-        scheduler=scheduler,
-    ).to("cuda")
-
-    seed = -1
-    if seed != -1:
-        set_seed(seed)
-    else:
-        torch.seed()
-
-    print(f"Initial seed: {torch.initial_seed()}")
-
-    unique_id = str(uuid.uuid4())
-    video_out_path = f"video_out{unique_id}.mp4"
-
-    pipeline(
-        video_path=video_path,
-        audio_path=audio_path,
-        video_out_path=video_out_path,
-        video_mask_path=video_out_path.replace(".mp4", "_mask.mp4"),
-        num_frames=config.data.num_frames,
-        num_inference_steps=config.run.inference_steps,
-        guidance_scale=1.0,
-        weight_dtype=torch.float16,
-        width=config.data.resolution,
-        height=config.data.resolution,
     )

-    if is_shared_ui:
-        # Clean up the temporary directory
-        if os.path.exists(temp_dir):
-            shutil.rmtree(temp_dir)
-            print(f"Temporary directory {temp_dir} deleted.")
-
-    return video_out_path
-
-
-css="""
-div#col-container{
-    margin: 0 auto;
-    max-width: 982px;
-}
-"""
-with gr.Blocks(css=css) as demo:
-    with gr.Column(elem_id="col-container"):
-        gr.Markdown("# LatentSync: Audio Conditioned Latent Diffusion Models for Lip Sync")
-        gr.Markdown("LatentSync is an end-to-end lip sync framework based on audio conditioned latent diffusion models without any intermediate motion representation, diverging from previous diffusion-based lip sync methods based on pixel space diffusion or two-stage generation.")
-        gr.HTML("""
-        <div style="display:flex;column-gap:4px;">
-            <a href="https://github.com/bytedance/LatentSync">
-                <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
-            </a>
-            <a href="https://arxiv.org/abs/2412.09262">
-                <img src='https://img.shields.io/badge/ArXiv-Paper-red'>
-            </a>
-            <a href="https://huggingface.co/spaces/fffiloni/LatentSync?duplicate=true">
-                <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-sm.svg" alt="Duplicate this Space">
-            </a>
-            <a href="https://huggingface.co/fffiloni">
-                <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/follow-me-on-HF-sm-dark.svg" alt="Follow me on HF">
-            </a>
-        </div>
-        """)
-        with gr.Row():
-            with gr.Column():
-                video_input = gr.Video(label="Video Control", format="mp4")
-                audio_input = gr.Audio(label="Audio Input", type="filepath")
-                submit_btn = gr.Button("Submit")
-            with gr.Column():
-                video_result = gr.Video(label="Result")
-
-        gr.Examples(
-            examples = [
-                ["assets/demo1_video.mp4", "assets/demo1_audio.wav"],
-                ["assets/demo2_video.mp4", "assets/demo2_audio.wav"],
-                ["assets/demo3_video.mp4", "assets/demo3_audio.wav"],
-            ],
-            inputs = [video_input, audio_input]
         )
-
-        submit_btn.click(
-            fn = main,
-            inputs = [video_input, audio_input],
-            outputs = [video_result]
         )

-demo.queue().launch(show_api=False, show_error=True)
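Note on the removed helpers: when running on the shared Space, the old app trimmed inputs before inference (at most 10 seconds of video via moviepy, 8 seconds of audio via pydub), while the rewritten app below passes the uploaded files through untrimmed. For reference, here is a minimal, self-contained sketch of that trimming step, assuming moviepy < 2.0 (the moviepy.editor API used above) and pydub are installed; the helper names trim_video and trim_audio are illustrative, not part of the repository.

import os

from moviepy.editor import VideoFileClip
from pydub import AudioSegment


def trim_video(input_video_path, temp_dir="temp_dir", max_seconds=10):
    # Crop the video to at most max_seconds and save it under temp_dir.
    os.makedirs(temp_dir, exist_ok=True)
    clip = VideoFileClip(input_video_path)
    if clip.duration > max_seconds:
        clip = clip.subclip(0, max_seconds)
    out_path = os.path.join(temp_dir, f"cropped_{os.path.basename(input_video_path)}")
    clip.write_videofile(out_path, codec="libx264", audio_codec="aac")
    return out_path


def trim_audio(input_audio_path, temp_dir="temp_dir", max_seconds=8):
    # Cut the audio to at most max_seconds and export it as WAV under temp_dir.
    os.makedirs(temp_dir, exist_ok=True)
    audio = AudioSegment.from_file(input_audio_path)
    max_ms = max_seconds * 1000  # pydub slices in milliseconds
    if len(audio) > max_ms:
        audio = audio[:max_ms]
    out_path = os.path.join(temp_dir, "trimmed_audio.wav")
    audio.export(out_path, format="wav")
    return out_path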
 
 
 import gradio as gr
+from pathlib import Path
+from scripts.inference import main
+from omegaconf import OmegaConf
+import argparse
+from datetime import datetime

 # Download models
+from huggingface_hub import snapshot_download
+import os
 os.makedirs("checkpoints", exist_ok=True)
 snapshot_download(
     repo_id = "chunyu-li/LatentSync",
     local_dir = "./checkpoints"
 )

+CONFIG_PATH = Path("configs/unet/second_stage.yaml")
+CHECKPOINT_PATH = Path("checkpoints/latentsync_unet.pt")
+
+
+def process_video(
+    video_path,
+    audio_path,
+    guidance_scale,
+    inference_steps,
+    seed,
+):
+    # Create the temp directory if it doesn't exist
+    output_dir = Path("./temp")
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Convert paths to absolute Path objects and normalize them
+    video_file_path = Path(video_path)
+    video_path = video_file_path.absolute().as_posix()
+    audio_path = Path(audio_path).absolute().as_posix()
+
+    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
+    # Set the output path for the processed video
+    output_path = str(output_dir / f"{video_file_path.stem}_{current_time}.mp4")  # Change the filename as needed
+
+    config = OmegaConf.load(CONFIG_PATH)
+
+    config["run"].update(
+        {
+            "guidance_scale": guidance_scale,
+            "inference_steps": inference_steps,
+        }
+    )
+
+    # Parse the arguments
+    args = create_args(video_path, audio_path, output_path, inference_steps, guidance_scale, seed)
+
+    try:
+        result = main(
+            config=config,
+            args=args,
+        )
+        print("Processing completed successfully.")
+        return output_path  # Ensure the output path is returned
+    except Exception as e:
+        print(f"Error during processing: {str(e)}")
+        raise gr.Error(f"Error during processing: {str(e)}")
+
+
+def create_args(
+    video_path: str, audio_path: str, output_path: str, inference_steps: int, guidance_scale: float, seed: int
+) -> argparse.Namespace:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--inference_ckpt_path", type=str, required=True)
+    parser.add_argument("--video_path", type=str, required=True)
+    parser.add_argument("--audio_path", type=str, required=True)
+    parser.add_argument("--video_out_path", type=str, required=True)
+    parser.add_argument("--inference_steps", type=int, default=20)
+    parser.add_argument("--guidance_scale", type=float, default=1.0)
+    parser.add_argument("--seed", type=int, default=1247)
+
+    return parser.parse_args(
+        [
+            "--inference_ckpt_path",
+            CHECKPOINT_PATH.absolute().as_posix(),
+            "--video_path",
+            video_path,
+            "--audio_path",
+            audio_path,
+            "--video_out_path",
+            output_path,
+            "--inference_steps",
+            str(inference_steps),
+            "--guidance_scale",
+            str(guidance_scale),
+            "--seed",
+            str(seed),
+        ]
     )

+
+# Create Gradio interface
+with gr.Blocks(title="LatentSync Video Processing") as demo:
+    gr.Markdown(
+        """
+        # LatentSync: Audio Conditioned Latent Diffusion Models for Lip Sync
+        Upload a video and audio file to process with the LatentSync model.
+
+        <div align="center">
+        <strong>Chunyu Li1,2 Chao Zhang1 Weikai Xu1 Jinghui Xie1,† Weiguo Feng1
+        Bingyue Peng1 Weiwei Xing2,†</strong>
+        </div>
+
+        <div align="center">
+        <strong>1ByteDance 2Beijing Jiaotong University</strong>
+        </div>
+
+        <div style="display:flex;justify-content:center;column-gap:4px;">
+        <a href="https://github.com/bytedance/LatentSync">
+            <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
+        </a>
+        <a href="https://arxiv.org/pdf/2412.09262">
+            <img src='https://img.shields.io/badge/ArXiv-Paper-red'>
+        </a>
+        </div>
+        """
     )

+    with gr.Row():
+        with gr.Column():
+            video_input = gr.Video(label="Input Video")
+            audio_input = gr.Audio(label="Input Audio", type="filepath")
+
+            with gr.Row():
+                guidance_scale = gr.Slider(
+                    minimum=1.0,
+                    maximum=3.5,
+                    value=1.5,
+                    step=0.5,
+                    label="Guidance Scale",
                 )
+                inference_steps = gr.Slider(minimum=10, maximum=50, value=20, step=1, label="Inference Steps")
+
+            with gr.Row():
+                seed = gr.Number(value=1247, label="Random Seed", precision=0)
+
+            process_btn = gr.Button("Process Video")
+
+        with gr.Column():
+            video_output = gr.Video(label="Output Video")
+
+            gr.Examples(
+                examples=[
+                    ["assets/demo1_video.mp4", "assets/demo1_audio.wav"],
+                    ["assets/demo2_video.mp4", "assets/demo2_audio.wav"],
+                    ["assets/demo3_video.mp4", "assets/demo3_audio.wav"],
+                ],
+                inputs=[video_input, audio_input],
+            )
+
+    process_btn.click(
+        fn=process_video,
+        inputs=[
+            video_input,
+            audio_input,
+            guidance_scale,
+            inference_steps,
+            seed,
+        ],
+        outputs=video_output,
     )

+if __name__ == "__main__":
+    demo.launch(inbrowser=True, share=True)
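
The new entry point delegates inference to scripts.inference.main, driven by the argparse.Namespace that create_args builds, so the same function can be called without the Gradio UI. Below is a minimal sketch of a batch driver, assuming it runs from the repository root with the checkpoints already downloaded and a CUDA-capable GPU; note that importing gradio_app also executes the module-level snapshot_download call.

# Hypothetical batch driver, not part of this commit; it reuses process_video()
# exactly as defined in gradio_app.py above.
from gradio_app import process_video

pairs = [
    ("assets/demo1_video.mp4", "assets/demo1_audio.wav"),
    ("assets/demo2_video.mp4", "assets/demo2_audio.wav"),
]

for video, audio in pairs:
    # Same knobs the UI exposes: guidance scale, step count, and seed.
    out = process_video(video, audio, guidance_scale=1.5, inference_steps=20, seed=1247)
    print(f"{video} -> {out}")  # e.g. temp/demo1_video_<timestamp>.mp4

Since gr.Error subclasses Exception, a failure inside process_video propagates as a normal exception here, so the loop stops at the first error.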