multimodalart (HF Staff) committed · Commit 12d6cf5 · verified · 1 Parent(s): a2858d9

Update app.py

Files changed (1):
  1. app.py (+33, -31)
app.py CHANGED
@@ -9,7 +9,7 @@ import spaces # Assuming this is for Hugging Face Spaces GPU decorator
 from huggingface_hub import hf_hub_download
 import logging
 import numpy as np
-from PIL import Image # Added for type hinting
+from PIL import Image
 
 # --- Global Model Loading & LoRA Handling ---
 MODEL_ID = "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"
@@ -57,22 +57,22 @@ pipe.load_lora_weights(causvid_path, adapter_name="causvid_lora")
 logger.info("Setting LoRA adapter...")
 pipe.set_adapters(["causvid_lora"], adapter_weights=[1.0])
 
-# MOD_VALUE for height/width constraints
-# From WanImageToVideoPipeline docs: height/width must be multiple of vae_scale_factor * transformer.config.patch_size[1 or 2]
-MOD_VALUE = pipe.vae_scale_factor * pipe.transformer.config.patch_size[1] # e.g., 8 * 16 = 128
-logger.info(f"Derived MOD_VALUE for dimensions: {MOD_VALUE}")
+MOD_VALUE = 128
 
+logger.info(f"VAE spatial scale factor: {vae_spatial_scale_factor}")
+logger.info(f"Transformer patch HxW: {transformer_patch_height}x{transformer_patch_width}")
+logger.info(f"Calculated MOD_VALUE for H step: {MOD_VALUE_H}, for W step: {MOD_VALUE_W}")
+logger.info(f"Using general MOD_VALUE for slider steps: {MOD_VALUE}")
 
-# --- Helper functions and constants for automatic dimension adjustment ---
-# These constants must match the Gradio slider definitions below
-DEFAULT_H_SLIDER_VALUE = 384
-DEFAULT_W_SLIDER_VALUE = 640
+
+DEFAULT_H_SLIDER_VALUE = 384  # (3 * 128)
+DEFAULT_W_SLIDER_VALUE = 640  # (5 * 128)
 DEFAULT_TARGET_AREA = float(DEFAULT_H_SLIDER_VALUE * DEFAULT_W_SLIDER_VALUE)
 
 SLIDER_MIN_H = 128
-SLIDER_MAX_H = 512
+SLIDER_MAX_H = 720
 SLIDER_MIN_W = 128
-SLIDER_MAX_W = 1024
+SLIDER_MAX_W = 1280
 
 def _calculate_new_dimensions_wan(pil_image: Image.Image, mod_val: int, target_area: float,
                                   min_h: int, max_h: int, min_w: int, max_w: int,
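
Note: the new code hardcodes MOD_VALUE = 128 but logs vae_spatial_scale_factor, transformer_patch_height/transformer_patch_width, and MOD_VALUE_H/MOD_VALUE_W, whose definitions sit outside the visible hunks. A minimal sketch of how those values could be derived, reconstructed from the removed line (the (temporal, height, width) ordering of patch_size is an assumption suggested by the old index [1]):

    # Sketch only — reconstructs the removed derivation; patch_size layout is assumed.
    vae_spatial_scale_factor = pipe.vae_scale_factor                   # e.g. 8
    transformer_patch_height = pipe.transformer.config.patch_size[1]   # e.g. 16
    transformer_patch_width = pipe.transformer.config.patch_size[2]    # e.g. 16
    MOD_VALUE_H = vae_spatial_scale_factor * transformer_patch_height  # e.g. 8 * 16 = 128
    MOD_VALUE_W = vae_spatial_scale_factor * transformer_patch_width   # e.g. 8 * 16 = 128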
@@ -85,19 +85,15 @@ def _calculate_new_dimensions_wan(pil_image: Image.Image, mod_val: int, target_a
 
     aspect_ratio = orig_h / orig_w
 
-    # Calculate ideal dimensions for the target area, maintaining aspect ratio
     ideal_h = np.sqrt(target_area * aspect_ratio)
     ideal_w = np.sqrt(target_area / aspect_ratio)
 
-    # Round to nearest multiple of mod_val
     calc_h = round(ideal_h / mod_val) * mod_val
     calc_w = round(ideal_w / mod_val) * mod_val
 
-    # Ensure dimensions are at least mod_val (smallest valid multiple)
-    calc_h = mod_val if calc_h == 0 else calc_h
-    calc_w = mod_val if calc_w == 0 else calc_w
+    calc_h = mod_val if calc_h < mod_val else calc_h  # Ensure at least one mod_val unit
+    calc_w = mod_val if calc_w < mod_val else calc_w  # Ensure at least one mod_val unit
 
-    # Clamp to slider limits
     new_h = int(np.clip(calc_h, min_h, max_h))
     new_w = int(np.clip(calc_w, min_w, max_w))
 
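Note: a self-contained worked example of the sizing logic above (the upload size is hypothetical; the constants are the ones set in this commit):

    import numpy as np

    orig_w, orig_h = 1024, 768                     # hypothetical 4:3 upload
    target_area, mod_val = 384.0 * 640.0, 128      # defaults from this commit
    aspect_ratio = orig_h / orig_w                 # 0.75
    ideal_h = np.sqrt(target_area * aspect_ratio)  # ~429.3
    ideal_w = np.sqrt(target_area / aspect_ratio)  # ~572.4
    calc_h = round(ideal_h / mod_val) * mod_val    # 384
    calc_w = round(ideal_w / mod_val) * mod_val    # 512
    print(calc_h, calc_w)                          # 384 512 — already within the slider limits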
@@ -108,14 +104,13 @@ def handle_image_upload_for_dims_wan(uploaded_pil_image: Image.Image | None, cur
     return new_h, new_w
 
 def handle_image_upload_for_dims_wan(uploaded_pil_image: Image.Image | None, current_h_val: int, current_w_val: int):
-    if uploaded_pil_image is None: # Image cleared by user
+    if uploaded_pil_image is None:
         logger.info("Image cleared. Resetting dimensions to default slider values.")
         return gr.update(value=DEFAULT_H_SLIDER_VALUE), gr.update(value=DEFAULT_W_SLIDER_VALUE)
-
     try:
         new_h, new_w = _calculate_new_dimensions_wan(
             uploaded_pil_image,
-            MOD_VALUE,
+            MOD_VALUE, # Use the globally determined MOD_VALUE
             DEFAULT_TARGET_AREA,
             SLIDER_MIN_H, SLIDER_MAX_H,
             SLIDER_MIN_W, SLIDER_MAX_W,
@@ -124,7 +119,6 @@ def handle_image_upload_for_dims_wan(uploaded_pil_image: Image.Image | None, cur
         return gr.update(value=new_h), gr.update(value=new_w)
     except Exception as e:
         logger.error(f"Error auto-adjusting H/W from image: {e}", exc_info=True)
-        # On error, revert to defaults or keep current. Defaults are safer.
         return gr.update(value=DEFAULT_H_SLIDER_VALUE), gr.update(value=DEFAULT_W_SLIDER_VALUE)
 
 
@@ -145,7 +139,6 @@ def generate_video(input_image: Image.Image, prompt: str, negative_prompt: str,
     logger.info(f" Num Frames: {num_frames}, FPS for conditioning & export: {fps_for_conditioning_and_export}")
     logger.info(f" Guidance Scale: {guidance_scale}, Steps: {steps}")
 
-
     target_height = int(height)
     target_width = int(width)
     num_frames = int(num_frames)
@@ -153,11 +146,21 @@ def generate_video(input_image: Image.Image, prompt: str, negative_prompt: str,
     guidance_scale_val = float(guidance_scale)
     steps_val = int(steps)
 
-    # Resize the input PIL image to the target dimensions for the pipeline
+    # Ensure dimensions are compatible (already handled by slider steps and auto-adjustment)
+    if target_height % MOD_VALUE_H != 0:
+        logger.warning(f"Height {target_height} is not a multiple of {MOD_VALUE_H}. Adjusting...")
+        target_height = (target_height // MOD_VALUE_H) * MOD_VALUE_H
+    if target_width % MOD_VALUE_W != 0:
+        logger.warning(f"Width {target_width} is not a multiple of {MOD_VALUE_W}. Adjusting...")
+        target_width = (target_width // MOD_VALUE_W) * MOD_VALUE_W
+
+    target_height = max(MOD_VALUE_H, target_height)  # Ensure minimum size
+    target_width = max(MOD_VALUE_W, target_width)  # Ensure minimum size
+
     resized_image = input_image.resize((target_width, target_height))
     logger.info(f" Input image resized to: {resized_image.size} for pipeline input.")
 
-
     with torch.inference_mode():
         output_frames_list = pipe(
             image=resized_image,
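
Note: the added floor-and-clamp guard is not redundant, because the new SLIDER_MAX_H = 720 is not a multiple of 128 (SLIDER_MAX_W = 1280 is 10 × 128). A height at the slider maximum therefore gets snapped down to 640 at generation time. The same logic as a tiny self-checking helper (the helper name is hypothetical):

    def snap_down_to_multiple(value: int, mod: int) -> int:
        # Floor to the nearest multiple of mod, never below one mod unit.
        return max(mod, (value // mod) * mod)

    assert snap_down_to_multiple(720, 128) == 640    # slider max height snaps down
    assert snap_down_to_multiple(1280, 128) == 1280  # slider max width passes through
    assert snap_down_to_multiple(100, 128) == 128    # never below one mod unit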
@@ -168,14 +171,14 @@ def generate_video(input_image: Image.Image, prompt: str, negative_prompt: str,
             num_frames=num_frames,
             guidance_scale=guidance_scale_val,
             num_inference_steps=steps_val,
-            fps=fps_val, # For conditioning
-            generator=torch.Generator(device="cuda").manual_seed(0) # For reproducibility
+            fps=fps_val,
+            generator=torch.Generator(device="cuda").manual_seed(0)
         ).frames[0]
 
     with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
         video_path = tmpfile.name
 
-    export_to_video(output_frames_list, video_path, fps=fps_val) # For export
+    export_to_video(output_frames_list, video_path, fps=fps_val)
     logger.info(f"Video successfully generated and saved to {video_path}")
     return video_path
 
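Note: the generator stays seeded with a fixed 0, so identical inputs always produce the same video. A sketch of how seeding could be made user-configurable instead (helper and parameter names are hypothetical, not part of this commit):

    import torch

    def make_generator(seed: int = 0, device: str = "cuda") -> torch.Generator:
        # Hypothetical helper: a Gradio seed input could be threaded through
        # generate_video and passed here instead of the hardcoded 0.
        return torch.Generator(device=device).manual_seed(int(seed))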
@@ -184,7 +187,6 @@ default_prompt_i2v = "make this image come alive, cinematic motion, smooth anima
 default_negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards, watermark, text, signature"
 penguin_image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/penguin.png"
 
-
 with gr.Blocks() as demo:
     gr.Markdown(f"""
     # Image-to-Video with Wan 2.1 I2V (14B) + CausVid LoRA
@@ -205,6 +207,7 @@ with gr.Blocks() as demo:
                 lines=3
             )
             with gr.Row():
+                # Sliders use the general MOD_VALUE since MOD_VALUE_H and MOD_VALUE_W are the same here
                 height_input = gr.Slider(minimum=SLIDER_MIN_H, maximum=SLIDER_MAX_H, step=MOD_VALUE, value=DEFAULT_H_SLIDER_VALUE, label=f"Output Height (multiple of {MOD_VALUE})")
                 width_input = gr.Slider(minimum=SLIDER_MIN_W, maximum=SLIDER_MAX_W, step=MOD_VALUE, value=DEFAULT_W_SLIDER_VALUE, label=f"Output Width (multiple of {MOD_VALUE})")
             with gr.Row():
@@ -218,10 +221,9 @@ with gr.Blocks() as demo:
         with gr.Column(scale=3):
             video_output = gr.Video(label="Generated Video", interactive=False)
 
-    # Event handler for image upload/clear to adjust H/W sliders
     input_image_component.change(
         fn=handle_image_upload_for_dims_wan,
-        inputs=[input_image_component, height_input, width_input], # Pass current slider values
+        inputs=[input_image_component, height_input, width_input],
         outputs=[height_input, width_input]
     )
 
@@ -245,7 +247,7 @@ with gr.Blocks() as demo:
 
     gr.Examples(
         examples=[
-            [penguin_image_url, "a penguin playfully dancing in the snow, Antarctica", default_negative_prompt, DEFAULT_H_SLIDER_VALUE, DEFAULT_W_SLIDER_VALUE, 25, 1.0, 4, 16]
+            [penguin_image_url, "a penguin playfully dancing in the snow, Antarctica", default_negative_prompt, DEFAULT_H_SLIDER_VALUE, DEFAULT_W_SLIDER_VALUE, 25, 1.0, 4, 16],
         ],
         inputs=inputs_for_click_and_examples,
         outputs=video_output,
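
Note: inputs_for_click_and_examples is defined outside the visible hunks. From the example row and generate_video's logging, the trailing values presumably map to num_frames=25, guidance_scale=1.0, steps=4, fps=16. A sketch of the assumed wiring (component names are hypothetical except those shown in this diff):

    inputs_for_click_and_examples = [
        input_image_component,   # shown in this diff
        prompt_input,            # hypothetical name
        negative_prompt_input,   # hypothetical name
        height_input,            # shown in this diff
        width_input,             # shown in this diff
        num_frames_input,        # hypothetical; 25 in the example row
        guidance_scale_input,    # hypothetical; 1.0 in the example row
        steps_input,             # hypothetical; 4 in the example row
        fps_input,               # hypothetical; 16 in the example row
    ]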
 