wan-2-2-first-last-frame

Paused

App Files Files Community

rahul7star committed 21 days ago

Commit

8dfdf4d

verified ·

1 Parent(s): 9ca902e

Update app.py

Browse files

Files changed (1) hide show

app.py +122 -60

app.py CHANGED Viewed

@@ -35,22 +35,7 @@ hf_hub_download_local(repo_id="Kijai/WanVideo_comfy", filename="Wan22-Lightning/
 hf_hub_download_local(repo_id="Kijai/WanVideo_comfy", filename="Wan22-Lightning/Wan2.2-Lightning_I2V-A14B-4steps-lora_LOW_fp16.safetensors", local_dir="models/loras")
 print("Downloads complete.")
-LANDSCAPE_WIDTH = 832
-LANDSCAPE_HEIGHT = 480
-MAX_SEED = np.iinfo(np.int32).max
-FIXED_FPS = 16
-MIN_FRAMES_MODEL = 8
-MAX_FRAMES_MODEL = 81
-MIN_DURATION = round(MIN_FRAMES_MODEL/FIXED_FPS,1)
-MAX_DURATION = round(MAX_FRAMES_MODEL/FIXED_FPS,1)
 # --- Image Processing Functions ---
 def calculate_video_dimensions(width, height, max_size=832, min_size=480):
@@ -283,65 +268,142 @@ model_management.load_models_gpu([
     loader[0].patcher if hasattr(loader[0], 'patcher') else loader[0] for loader in model_loaders
 ])
 print("All models loaded successfully!")
-import time
-import gradio as gr
-import tempfile
-import torch
-import random
-import spaces
-# --- Dynamic GPU duration logic ---
-def get_duration(
-   start_image_pil,
-    end_image_pil,
-    prompt,
-    negative_prompt,
-    duration_seconds,
-    progress,
-):
-    # 15ms per step → just an example
-    calc_time = steps * 15
-    print(f"[GPU Duration Estimate] {calc_time} sec for {steps} steps")
-    return min(calc_time, 300)  # hard cap for safety
 # --- Main Video Generation Logic ---
-@spaces.GPU(duration=get_duration)
 def generate_video(
     start_image_pil,
     end_image_pil,
     prompt,
     negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走,过曝，",
-    duration_seconds=duration_seconds,
     progress=gr.Progress(track_tqdm=True)
 ):
     """
     The main function to generate a video based on user inputs.
     This function is called every time the user clicks the 'Generate' button.
     """
-    start_time = time.time()
     FPS = 16
-    duration = int(FPS * duration_seconds)  # convert seconds → frames
-    # --- Your existing video gen code continues here ---
-    # (I trimmed it for brevity, leave all nodes/patches/workflow unchanged)
-    # final save video logic...
-    elapsed = time.time() - start_time
-    print(f"[GPU Time Log] Video generated in {elapsed:.2f} sec")
-    return f"output/{save_result['ui']['images'][0]['filename']}"
-# --- Gradio UI ---
 css = '''
 .fillable{max-width: 1100px !important}
 .dark .progress-text {color: white}
 '''
 with gr.Blocks(theme=gr.themes.Citrus(), css=css) as app:
     gr.Markdown("# Wan 2.2 First/Last Frame Video Fast")
-    gr.Markdown("GPU time is dynamically calculated. Max video duration: **5 seconds**.")
     with gr.Row():
         with gr.Column():
             with gr.Group():
@@ -350,14 +412,14 @@ with gr.Blocks(theme=gr.themes.Citrus(), css=css) as app:
                     end_image = gr.Image(type="pil", label="End Frame")
                 prompt = gr.Textbox(label="Prompt", info="Describe the transition between the two images")
-                # Duration bar (1–5 seconds)
-                duration_seconds = gr.Slider(
-                    minimum=1, maximum=5, value=2, step=1,
-                    label="Video Duration (seconds)"
-                )
-                with gr.Accordion("Advanced Settings", open=False, visible=False):
                     negative_prompt = gr.Textbox(
                         label="Negative Prompt",
                         value="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走,过曝，",
@@ -371,7 +433,7 @@ with gr.Blocks(theme=gr.themes.Citrus(), css=css) as app:
     generate_button.click(
         fn=generate_video,
-        inputs=[start_image, end_image, prompt, negative_prompt, duration_seconds],
         outputs=output_video
     )
@@ -388,4 +450,4 @@ with gr.Blocks(theme=gr.themes.Citrus(), css=css) as app:
     )
 if __name__ == "__main__":
-    app.launch(share=True)

 hf_hub_download_local(repo_id="Kijai/WanVideo_comfy", filename="Wan22-Lightning/Wan2.2-Lightning_I2V-A14B-4steps-lora_LOW_fp16.safetensors", local_dir="models/loras")
 print("Downloads complete.")
+model_management.vram_state = model_management.VRAMState.HIGH_VRAM
 # --- Image Processing Functions ---
 def calculate_video_dimensions(width, height, max_size=832, min_size=480):
     loader[0].patcher if hasattr(loader[0], 'patcher') else loader[0] for loader in model_loaders
 ])
 print("All models loaded successfully!")
 # --- Main Video Generation Logic ---
+@spaces.GPU(duration=120)
 def generate_video(
     start_image_pil,
     end_image_pil,
     prompt,
     negative_prompt="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走,过曝，",
+    duration=33,
     progress=gr.Progress(track_tqdm=True)
 ):
     """
     The main function to generate a video based on user inputs.
     This function is called every time the user clicks the 'Generate' button.

     Pipeline, in the order the body runs it: match the end image to the start
     image, derive the video size from the start image, write both images to
     temporary PNG files, encode the prompts and both images, build the initial
     latents, patch the two models, run two KSamplerAdvanced passes
     (high-noise model for steps 0-4, then low-noise model from step 4 on),
     decode with the VAE, assemble the frames at 16 FPS and save an h264 mp4.

     Args:
         start_image_pil: First frame; it is copied and its width/height drive
             the video dimensions.
         end_image_pil: Last frame; passed to resize_and_crop_to_match together
             with the start image.
         prompt: Text encoded as the positive conditioning.
         negative_prompt: Text encoded as the negative conditioning.
         duration: Forwarded unchanged as `length` to the WanFirstLastFrameToVideo
             node (presumably a frame count, not seconds -- TODO confirm).
         progress: Progress callback, invoked with a fraction and a `desc` label.

     Returns:
         The string "output/<filename>", where <filename> is taken from the
         SaveVideo node's result.
     """
     FPS = 16
+    # Process images: the end image is resized/cropped to match the start image
+    # The start image is used as-is (copied) and determines the dimensions
+    processed_start_image = start_image_pil.copy()
+    processed_end_image = resize_and_crop_to_match(end_image_pil, start_image_pil)
+    # Calculate video dimensions based on the first image
+    video_width, video_height = calculate_video_dimensions(
+        processed_start_image.width,
+        processed_start_image.height
+    )
+    print(f"Input image size: {processed_start_image.width}x{processed_start_image.height}")
+    print(f"Video dimensions: {video_width}x{video_height}")
     # Look up the already-loaded models and node objects by key from MODELS_AND_NODES.
+    clip = MODELS_AND_NODES["clip"]
+    vae = MODELS_AND_NODES["vae"]
+    model_low_noise = MODELS_AND_NODES["model_low_noise"]
+    model_high_noise = MODELS_AND_NODES["model_high_noise"]
+    clip_vision = MODELS_AND_NODES["clip_vision"]
+    cliptextencode = MODELS_AND_NODES["CLIPTextEncode"]
+    loadimage = MODELS_AND_NODES["LoadImage"]
+    clipvisionencode = MODELS_AND_NODES["CLIPVisionEncode"]
+    modelsamplingsd3 = MODELS_AND_NODES["ModelSamplingSD3"]
+    pathchsageattentionkj = MODELS_AND_NODES["PathchSageAttentionKJ"]
+    wanfirstlastframetovideo = MODELS_AND_NODES["WanFirstLastFrameToVideo"]
+    ksampleradvanced = MODELS_AND_NODES["KSamplerAdvanced"]
+    vaedecode = MODELS_AND_NODES["VAEDecode"]
+    createvideo = MODELS_AND_NODES["CreateVideo"]
+    savevideo = MODELS_AND_NODES["SaveVideo"]
+    # Save processed images to temporary files so the LoadImage node can read them by path
     # NOTE(review): delete=False and nothing in this function removes the two
     # PNG files afterwards, so they accumulate across calls.
+    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as start_file, \
+         tempfile.NamedTemporaryFile(suffix=".png", delete=False) as end_file:
+        processed_start_image.save(start_file.name)
+        processed_end_image.save(end_file.name)
+        start_image_path = start_file.name
+        end_image_path = end_file.name
+    with torch.inference_mode():
+        progress(0.1, desc="Encoding text and images...")
+        # --- Workflow execution ---
         # Each node call's result is indexed with get_value_at_index before being passed on.
+        positive_conditioning = cliptextencode.encode(text=prompt, clip=get_value_at_index(clip, 0))
+        negative_conditioning = cliptextencode.encode(text=negative_prompt, clip=get_value_at_index(clip, 0))
+        start_image_loaded = loadimage.load_image(image=start_image_path)
+        end_image_loaded = loadimage.load_image(image=end_image_path)
+        clip_vision_encoded_start = clipvisionencode.encode(
+            crop="none", clip_vision=get_value_at_index(clip_vision, 0), image=get_value_at_index(start_image_loaded, 0)
+        )
+        clip_vision_encoded_end = clipvisionencode.encode(
+            crop="none", clip_vision=get_value_at_index(clip_vision, 0), image=get_value_at_index(end_image_loaded, 0)
+        )
+        progress(0.2, desc="Preparing initial latents...")
         # The result is read below at index 0 (positive), 1 (negative) and 2 (latent image).
+        initial_latents = wanfirstlastframetovideo.EXECUTE_NORMALIZED(
+            width=video_width, height=video_height, length=duration, batch_size=1,
+            positive=get_value_at_index(positive_conditioning, 0),
+            negative=get_value_at_index(negative_conditioning, 0),
+            vae=get_value_at_index(vae, 0),
+            clip_vision_start_image=get_value_at_index(clip_vision_encoded_start, 0),
+            clip_vision_end_image=get_value_at_index(clip_vision_encoded_end, 0),
+            start_image=get_value_at_index(start_image_loaded, 0),
+            end_image=get_value_at_index(end_image_loaded, 0),
+        )
+        progress(0.3, desc="Patching models...")
         # Both models get the same two patches: ModelSamplingSD3 (shift=8), then the sage-attention patch.
+        model_low_patched = modelsamplingsd3.patch(shift=8, model=get_value_at_index(model_low_noise, 0))
+        model_low_final = pathchsageattentionkj.patch(sage_attention="auto", model=get_value_at_index(model_low_patched, 0))
+        model_high_patched = modelsamplingsd3.patch(shift=8, model=get_value_at_index(model_high_noise, 0))
+        model_high_final = pathchsageattentionkj.patch(sage_attention="auto", model=get_value_at_index(model_high_patched, 0))
+        progress(0.5, desc="Running KSampler (Step 1/2)...")
         # Pass 1: high-noise model, steps 0-4 of 8, noise added, leftover noise returned.
         # NOTE(review): random.randint is inclusive on both ends, so 2**64 itself
         # can be drawn -- confirm the sampler accepts a seed that large.
+        latent_step1 = ksampleradvanced.sample(
+            add_noise="enable", noise_seed=random.randint(1, 2**64), steps=8, cfg=1,
+            sampler_name="euler", scheduler="simple", start_at_step=0, end_at_step=4,
+            return_with_leftover_noise="enable", model=get_value_at_index(model_high_final, 0),
+            positive=get_value_at_index(initial_latents, 0),
+            negative=get_value_at_index(initial_latents, 1),
+            latent_image=get_value_at_index(initial_latents, 2),
+        )
+        progress(0.7, desc="Running KSampler (Step 2/2)...")
         # Pass 2: low-noise model continues from step 4 on the pass-1 latent, with
         # add_noise="disable"; a separate random seed is still drawn for this call.
+        latent_step2 = ksampleradvanced.sample(
+            add_noise="disable", noise_seed=random.randint(1, 2**64), steps=8, cfg=1,
+            sampler_name="euler", scheduler="simple", start_at_step=4, end_at_step=10000,
+            return_with_leftover_noise="disable", model=get_value_at_index(model_low_final, 0),
+            positive=get_value_at_index(initial_latents, 0),
+            negative=get_value_at_index(initial_latents, 1),
+            latent_image=get_value_at_index(latent_step1, 0),
+        )
+        progress(0.8, desc="Decoding VAE...")
+        decoded_images = vaedecode.decode(samples=get_value_at_index(latent_step2, 0), vae=get_value_at_index(vae, 0))
+        progress(0.9, desc="Creating and saving video...")
+        video_data = createvideo.create_video(fps=FPS, images=get_value_at_index(decoded_images, 0))
+        # Save the video to ComfyUI's output directory
+        save_result = savevideo.save_video(
+            filename_prefix="GradioVideo", format="mp4", codec="h264",
+            video=get_value_at_index(video_data, 0),
+        )
+        progress(1.0, desc="Done!")
         # The returned path is built from the first entry of the save result's 'ui' -> 'images' list.
+        return f"output/{save_result['ui']['images'][0]['filename']}"
 css = '''
 .fillable{max-width: 1100px !important}
 .dark .progress-text {color: white}
 '''
 with gr.Blocks(theme=gr.themes.Citrus(), css=css) as app:
     gr.Markdown("# Wan 2.2 First/Last Frame Video Fast")
+    gr.Markdown("Running the [Wan 2.2 First/Last Frame ComfyUI workflow](https://www.reddit.com/r/StableDiffusion/comments/1me4306/psa_wan_22_does_first_frame_last_frame_out_of_the/) and the [lightx2v/Wan2.2-Lightning](https://huggingface.co/lightx2v/Wan2.2-Lightning) 8-step LoRA on ZeroGPU")
     with gr.Row():
         with gr.Column():
             with gr.Group():
                     end_image = gr.Image(type="pil", label="End Frame")
                 prompt = gr.Textbox(label="Prompt", info="Describe the transition between the two images")
+                with gr.Accordion("Advanced Settings", open=False, visible=True):
+                    duration = gr.Radio(
+                        [("Short (2s)", 33), ("Mid (4s)", 66)],
+                        value=33,
+                        label="Video Duration",
+                        visible=False
+                    )
                     negative_prompt = gr.Textbox(
                         label="Negative Prompt",
                         value="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走,过曝，",
     generate_button.click(
         fn=generate_video,
+        inputs=[start_image, end_image, prompt, negative_prompt, duration],
         outputs=output_video
     )
     )
 if __name__ == "__main__":
+    app.launch(share=True)