pillaryao committed
Commit b207b64 · verified · 1 Parent(s): 3c524e6

Upload folder using huggingface_hub

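For context, the commit title above is the default message written by the Hugging Face Hub client's folder-upload helper, which is likely also what the `gradio deploy` step in the workflow below ends up calling. A minimal sketch of that kind of upload is shown here; the `repo_id` and `folder_path` values are illustrative placeholders, not values recorded in this commit.

```python
# Illustrative sketch only: roughly how a Space folder gets pushed with
# huggingface_hub. repo_id and folder_path are placeholders, not taken
# from this commit.
from huggingface_hub import HfApi

api = HfApi()  # picks up a token from a prior login() or the HF_TOKEN env var
api.upload_folder(
    folder_path=".",                  # local directory to upload
    repo_id="pillaryao/demo",         # placeholder Space id
    repo_type="space",
    commit_message="Upload folder using huggingface_hub",
)
```
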
.github/workflows/update_space.yml ADDED
@@ -0,0 +1,28 @@
+ name: Run Python script
+
+ on:
+   push:
+     branches:
+       - dev
+
+ jobs:
+   build:
+     runs-on: ubuntu-latest
+
+     steps:
+       - name: Checkout
+         uses: actions/checkout@v2
+
+       - name: Set up Python
+         uses: actions/setup-python@v2
+         with:
+           python-version: '3.9'
+
+       - name: Install Gradio
+         run: python -m pip install gradio
+
+       - name: Log in to Hugging Face
+         run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'
+
+       - name: Deploy to Spaces
+         run: gradio deploy
README.md CHANGED
@@ -2,7 +2,7 @@
  title: demo
  app_file: gradio_app.py
  sdk: gradio
- sdk_version: 5.12.0
+ sdk_version: 5.14.0
  ---
  # LatentSync: Audio Conditioned Latent Diffusion Models for Lip Sync
assets/demo1_video.mp4 CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:092ff3cc8d8bc60490cfd4632745ea80ddb80e6e53e73fb9158aa9cdc6dee585
- size 2278221
+ oid sha256:ed2dd1e2001aa605c3f2d77672a8af4ed55e427a85c55d408adfc3d5076bc872
+ size 1240008
assets/demo2_video.mp4 CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b275b97fea803c888da66c9a9815492edfb4545f2a7a85263946f11a6a4ae7b6
- size 3688891
+ oid sha256:8c3f10288e0642e587a95c0040e6966f8f6b7e003c3a17b572f72472b896d8ff
+ size 1772492
assets/demo3_video.mp4 CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:fee4988f803bea01deaaa8983a1ff42f6a83e30497ae53acd7decefb57de7a7d
- size 4710464
+ oid sha256:cfa177b2a44f7809f606285c120e270d526caa50d708ec95e0f614d220970e0f
+ size 2112370
gradio_app.py CHANGED
@@ -1,160 +1,219 @@
  import gradio as gr
- from pathlib import Path
- from scripts.inference import main
- from omegaconf import OmegaConf
- import argparse
- from datetime import datetime
-
- CONFIG_PATH = Path("configs/unet/second_stage.yaml")
- CHECKPOINT_PATH = Path("checkpoints/latentsync_unet.pt")
-
-
- def process_video(
-     video_path,
-     audio_path,
-     guidance_scale,
-     inference_steps,
-     seed,
- ):
-     # Create the temp directory if it doesn't exist
-     output_dir = Path("./temp")
-     output_dir.mkdir(parents=True, exist_ok=True)
-
-     # Convert paths to absolute Path objects and normalize them
-     video_file_path = Path(video_path)
-     video_path = video_file_path.absolute().as_posix()
-     audio_path = Path(audio_path).absolute().as_posix()
-
-     current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
-     # Set the output path for the processed video
-     output_path = str(output_dir / f"{video_file_path.stem}_{current_time}.mp4")  # Change the filename as needed
-
-     config = OmegaConf.load(CONFIG_PATH)
-
-     config["run"].update(
-         {
-             "guidance_scale": guidance_scale,
-             "inference_steps": inference_steps,
-         }
-     )

-     # Parse the arguments
-     args = create_args(video_path, audio_path, output_path, inference_steps, guidance_scale, seed)
-
-     try:
-         result = main(
-             config=config,
-             args=args,
-         )
-         print("Processing completed successfully.")
-         return output_path  # Ensure the output path is returned
-     except Exception as e:
-         print(f"Error during processing: {str(e)}")
-         raise gr.Error(f"Error during processing: {str(e)}")
-
-
- def create_args(
-     video_path: str, audio_path: str, output_path: str, inference_steps: int, guidance_scale: float, seed: int
- ) -> argparse.Namespace:
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--inference_ckpt_path", type=str, required=True)
-     parser.add_argument("--video_path", type=str, required=True)
-     parser.add_argument("--audio_path", type=str, required=True)
-     parser.add_argument("--video_out_path", type=str, required=True)
-     parser.add_argument("--inference_steps", type=int, default=20)
-     parser.add_argument("--guidance_scale", type=float, default=1.0)
-     parser.add_argument("--seed", type=int, default=1247)
-
-     return parser.parse_args(
-         [
-             "--inference_ckpt_path",
-             CHECKPOINT_PATH.absolute().as_posix(),
-             "--video_path",
-             video_path,
-             "--audio_path",
-             audio_path,
-             "--video_out_path",
-             output_path,
-             "--inference_steps",
-             str(inference_steps),
-             "--guidance_scale",
-             str(guidance_scale),
-             "--seed",
-             str(seed),
-         ]
      )

-
- # Create Gradio interface
- with gr.Blocks(title="LatentSync Video Processing") as demo:
-     gr.Markdown(
-         """
-         # LatentSync: Audio Conditioned Latent Diffusion Models for Lip Sync
-         Upload a video and audio file to process with LatentSync model.
-
-         <div align="center">
-             <strong>Chunyu Li1,2 Chao Zhang1 Weikai Xu1 Jinghui Xie1,† Weiguo Feng1
-             Bingyue Peng1 Weiwei Xing2,†</strong>
-         </div>
-
-         <div align="center">
-             <strong>1ByteDance 2Beijing Jiaotong University</strong>
-         </div>
-
-         <div style="display:flex;justify-content:center;column-gap:4px;">
-             <a href="https://github.com/bytedance/LatentSync">
-                 <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
-             </a>
-             <a href="https://arxiv.org/pdf/2412.09262">
-                 <img src='https://img.shields.io/badge/ArXiv-Paper-red'>
-             </a>
-         </div>
-         """
      )

-     with gr.Row():
-         with gr.Column():
-             video_input = gr.Video(label="Input Video")
-             audio_input = gr.Audio(label="Input Audio", type="filepath")
-
-             with gr.Row():
-                 guidance_scale = gr.Slider(
-                     minimum=1.0,
-                     maximum=3.5,
-                     value=1.5,
-                     step=0.5,
-                     label="Guidance Scale",
                  )
-             inference_steps = gr.Slider(minimum=10, maximum=50, value=20, step=1, label="Inference Steps")
-
-             with gr.Row():
-                 seed = gr.Number(value=1247, label="Random Seed", precision=0)
-
-             process_btn = gr.Button("Process Video")
-
-         with gr.Column():
-             video_output = gr.Video(label="Output Video")
-
-     gr.Examples(
-         examples=[
-             ["assets/demo1_video.mp4", "assets/demo1_audio.wav"],
-             ["assets/demo2_video.mp4", "assets/demo2_audio.wav"],
-             ["assets/demo3_video.mp4", "assets/demo3_audio.wav"],
-         ],
-         inputs=[video_input, audio_input],
-     )
-
-     process_btn.click(
-         fn=process_video,
-         inputs=[
-             video_input,
-             audio_input,
-             guidance_scale,
-             inference_steps,
-             seed,
-         ],
-         outputs=video_output,
      )

- if __name__ == "__main__":
-     demo.launch(inbrowser=True, share=True)
  import gradio as gr
+ import os
+ import sys
+ import shutil
+ import uuid
+ import subprocess
+ from glob import glob
+ from huggingface_hub import snapshot_download
+
+ # Download models
+ os.makedirs("checkpoints", exist_ok=True)
+
+ snapshot_download(
+     repo_id = "chunyu-li/LatentSync",
+     local_dir = "./checkpoints"
+ )
+
+ import tempfile
+ from moviepy.editor import VideoFileClip
+ from pydub import AudioSegment
+
+ def process_video(input_video_path, temp_dir="temp_dir"):
+     """
+     Crop a given MP4 video to a maximum duration of 10 seconds if it is longer than 10 seconds.
+     Save the new video in the specified folder (default is temp_dir).
+
+     Args:
+         input_video_path (str): Path to the input video file.
+         temp_dir (str): Directory where the processed video will be saved.
+
+     Returns:
+         str: Path to the cropped video file.
+     """
+     # Ensure the temp_dir exists
+     os.makedirs(temp_dir, exist_ok=True)
+
+     # Load the video
+     video = VideoFileClip(input_video_path)
+
+     # Determine the output path
+     input_file_name = os.path.basename(input_video_path)
+     output_video_path = os.path.join(temp_dir, f"cropped_{input_file_name}")
+
+     # Crop the video to 10 seconds if necessary
+     if video.duration > 10:
+         video = video.subclip(0, 10)
+
+     # Write the cropped video to the output path
+     video.write_videofile(output_video_path, codec="libx264", audio_codec="aac")
+
+     # Return the path to the cropped video
+     return output_video_path
+
+ def process_audio(file_path, temp_dir):
+     # Load the audio file
+     audio = AudioSegment.from_file(file_path)
+
+     # Check and cut the audio if longer than 8 seconds
+     max_duration = 8 * 1000  # 8 seconds in milliseconds
+     if len(audio) > max_duration:
+         audio = audio[:max_duration]
+
+     # Save the processed audio in the temporary directory
+     output_path = os.path.join(temp_dir, "trimmed_audio.wav")
+     audio.export(output_path, format="wav")
+
+     # Return the path to the trimmed file
+     print(f"Processed audio saved at: {output_path}")
+     return output_path

+ import argparse
+ from omegaconf import OmegaConf
+ import torch
+ from diffusers import AutoencoderKL, DDIMScheduler
+ from latentsync.models.unet import UNet3DConditionModel
+ from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
+ from diffusers.utils.import_utils import is_xformers_available
+ from accelerate.utils import set_seed
+ from latentsync.whisper.audio2feature import Audio2Feature
+
+
+ def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
+     inference_ckpt_path = "checkpoints/latentsync_unet.pt"
+     unet_config_path = "configs/unet/second_stage.yaml"
+     config = OmegaConf.load(unet_config_path)
+
+     print(f"Input video path: {video_path}")
+     print(f"Input audio path: {audio_path}")
+     print(f"Loaded checkpoint path: {inference_ckpt_path}")
+
+     is_shared_ui = True if "fffiloni/LatentSync" in os.environ['SPACE_ID'] else False
+     temp_dir = None
+     if is_shared_ui:
+         temp_dir = tempfile.mkdtemp()
+         cropped_video_path = process_video(video_path)
+         print(f"Cropped video saved to: {cropped_video_path}")
+         video_path=cropped_video_path
+
+         trimmed_audio_path = process_audio(audio_path, temp_dir)
+         print(f"Processed file was stored temporarily at: {trimmed_audio_path}")
+         audio_path=trimmed_audio_path
+
+     scheduler = DDIMScheduler.from_pretrained("configs")
+
+     if config.model.cross_attention_dim == 768:
+         whisper_model_path = "checkpoints/whisper/small.pt"
+     elif config.model.cross_attention_dim == 384:
+         whisper_model_path = "checkpoints/whisper/tiny.pt"
+     else:
+         raise NotImplementedError("cross_attention_dim must be 768 or 384")
+
+     audio_encoder = Audio2Feature(model_path=whisper_model_path, device="cuda", num_frames=config.data.num_frames)
+
+     vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)
+     vae.config.scaling_factor = 0.18215
+     vae.config.shift_factor = 0
+
+     unet, _ = UNet3DConditionModel.from_pretrained(
+         OmegaConf.to_container(config.model),
+         inference_ckpt_path,  # load checkpoint
+         device="cpu",
      )

+     unet = unet.to(dtype=torch.float16)
+
+     # set xformers
+     if is_xformers_available():
+         unet.enable_xformers_memory_efficient_attention()
+
+     pipeline = LipsyncPipeline(
+         vae=vae,
+         audio_encoder=audio_encoder,
+         unet=unet,
+         scheduler=scheduler,
+     ).to("cuda")
+
+     seed = -1
+     if seed != -1:
+         set_seed(seed)
+     else:
+         torch.seed()
+
+     print(f"Initial seed: {torch.initial_seed()}")
+
+     unique_id = str(uuid.uuid4())
+     video_out_path = f"video_out{unique_id}.mp4"
+
+     pipeline(
+         video_path=video_path,
+         audio_path=audio_path,
+         video_out_path=video_out_path,
+         video_mask_path=video_out_path.replace(".mp4", "_mask.mp4"),
+         num_frames=config.data.num_frames,
+         num_inference_steps=config.run.inference_steps,
+         guidance_scale=1.0,
+         weight_dtype=torch.float16,
+         width=config.data.resolution,
+         height=config.data.resolution,
      )

+     if is_shared_ui:
+         # Clean up the temporary directory
+         if os.path.exists(temp_dir):
+             shutil.rmtree(temp_dir)
+             print(f"Temporary directory {temp_dir} deleted.")
+
+     return video_out_path
+
+
+ css="""
+ div#col-container{
+     margin: 0 auto;
+     max-width: 982px;
+ }
+ """
+ with gr.Blocks(css=css) as demo:
+     with gr.Column(elem_id="col-container"):
+         gr.Markdown("# LatentSync: Audio Conditioned Latent Diffusion Models for Lip Sync")
+         gr.Markdown("LatentSync, an end-to-end lip sync framework based on audio conditioned latent diffusion models without any intermediate motion representation, diverging from previous diffusion-based lip sync methods based on pixel space diffusion or two-stage generation.")
+         gr.HTML("""
+         <div style="display:flex;column-gap:4px;">
+             <a href="https://github.com/bytedance/LatentSync">
+                 <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
+             </a>
+             <a href="https://arxiv.org/abs/2412.09262">
+                 <img src='https://img.shields.io/badge/ArXiv-Paper-red'>
+             </a>
+             <a href="https://huggingface.co/spaces/fffiloni/LatentSync?duplicate=true">
+                 <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-sm.svg" alt="Duplicate this Space">
+             </a>
+             <a href="https://huggingface.co/fffiloni">
+                 <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/follow-me-on-HF-sm-dark.svg" alt="Follow me on HF">
+             </a>
+         </div>
+         """)
+         with gr.Row():
+             with gr.Column():
+                 video_input = gr.Video(label="Video Control", format="mp4")
+                 audio_input = gr.Audio(label="Audio Input", type="filepath")
+                 submit_btn = gr.Button("Submit")
+             with gr.Column():
+                 video_result = gr.Video(label="Result")
+
+                 gr.Examples(
+                     examples = [
+                         ["assets/demo1_video.mp4", "assets/demo1_audio.wav"],
+                         ["assets/demo2_video.mp4", "assets/demo2_audio.wav"],
+                         ["assets/demo3_video.mp4", "assets/demo3_audio.wav"],
+                     ],
+                     inputs = [video_input, audio_input]
                  )
+
+     submit_btn.click(
+         fn = main,
+         inputs = [video_input, audio_input],
+         outputs = [video_result]
      )

+ demo.queue().launch(show_api=False, show_error=True)
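
A side note on the model download introduced in the new `gradio_app.py`: calling `snapshot_download` with only `repo_id` and `local_dir` fetches the entire `chunyu-li/LatentSync` repository, while the app afterwards only reads the UNet and Whisper checkpoints. A possible refinement, not part of this commit and based only on the checkpoint paths referenced in the code, would be to narrow the transfer with `allow_patterns`:

```python
# Sketch of a narrower download (assumption: only these files are needed,
# inferred from the paths used later in gradio_app.py; not part of the commit).
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="chunyu-li/LatentSync",
    local_dir="./checkpoints",
    allow_patterns=["latentsync_unet.pt", "whisper/*.pt"],
)
```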