Update app.py
app.py CHANGED
@@ -9,7 +9,7 @@ import spaces # Assuming this is for Hugging Face Spaces GPU decorator
 from huggingface_hub import hf_hub_download
 import logging
 import numpy as np
-from PIL import Image
+from PIL import Image
 
 # --- Global Model Loading & LoRA Handling ---
 MODEL_ID = "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"
@@ -57,22 +57,22 @@ pipe.load_lora_weights(causvid_path, adapter_name="causvid_lora")
 logger.info("Setting LoRA adapter...")
 pipe.set_adapters(["causvid_lora"], adapter_weights=[1.0])
 
-
-# From WanImageToVideoPipeline docs: height/width must be multiple of vae_scale_factor * transformer.config.patch_size[1 or 2]
-MOD_VALUE = pipe.vae_scale_factor * pipe.transformer.config.patch_size[1] # e.g., 8 * 16 = 128
-logger.info(f"Derived MOD_VALUE for dimensions: {MOD_VALUE}")
+MOD_VALUE = 128
 
+logger.info(f"VAE spatial scale factor: {vae_spatial_scale_factor}")
+logger.info(f"Transformer patch HxW: {transformer_patch_height}x{transformer_patch_width}")
+logger.info(f"Calculated MOD_VALUE for H step: {MOD_VALUE_H}, for W step: {MOD_VALUE_W}")
+logger.info(f"Using general MOD_VALUE for slider steps: {MOD_VALUE}")
 
-
-
-
-DEFAULT_W_SLIDER_VALUE = 640
+
+DEFAULT_H_SLIDER_VALUE = 384 # (3 * 128)
+DEFAULT_W_SLIDER_VALUE = 640 # (5 * 128)
 DEFAULT_TARGET_AREA = float(DEFAULT_H_SLIDER_VALUE * DEFAULT_W_SLIDER_VALUE)
 
 SLIDER_MIN_H = 128
-SLIDER_MAX_H =
+SLIDER_MAX_H = 720
 SLIDER_MIN_W = 128
-SLIDER_MAX_W =
+SLIDER_MAX_W = 1280
 
 def _calculate_new_dimensions_wan(pil_image: Image.Image, mod_val: int, target_area: float,
                                   min_h: int, max_h: int, min_w: int, max_w: int,
@@ -85,19 +85,15 @@ def _calculate_new_dimensions_wan(pil_image: Image.Image, mod_val: int, target_a
 
     aspect_ratio = orig_h / orig_w
 
-    # Calculate ideal dimensions for the target area, maintaining aspect ratio
     ideal_h = np.sqrt(target_area * aspect_ratio)
     ideal_w = np.sqrt(target_area / aspect_ratio)
 
-    # Round to nearest multiple of mod_val
     calc_h = round(ideal_h / mod_val) * mod_val
     calc_w = round(ideal_w / mod_val) * mod_val
 
-    # Ensure
-
-    calc_w = mod_val if calc_w == 0 else calc_w
+    calc_h = mod_val if calc_h < mod_val else calc_h # Ensure at least one mod_val unit
+    calc_w = mod_val if calc_w < mod_val else calc_w # Ensure at least one mod_val unit
 
-    # Clamp to slider limits
     new_h = int(np.clip(calc_h, min_h, max_h))
     new_w = int(np.clip(calc_w, min_w, max_w))
 
@@ -108,14 +104,13 @@ def _calculate_new_dimensions_wan(pil_image: Image.Image, mod_val: int, target_a
     return new_h, new_w
 
 def handle_image_upload_for_dims_wan(uploaded_pil_image: Image.Image | None, current_h_val: int, current_w_val: int):
-    if uploaded_pil_image is None:
+    if uploaded_pil_image is None:
         logger.info("Image cleared. Resetting dimensions to default slider values.")
         return gr.update(value=DEFAULT_H_SLIDER_VALUE), gr.update(value=DEFAULT_W_SLIDER_VALUE)
-
     try:
         new_h, new_w = _calculate_new_dimensions_wan(
            uploaded_pil_image,
-           MOD_VALUE,
+           MOD_VALUE, # Use the globally determined MOD_VALUE
            DEFAULT_TARGET_AREA,
            SLIDER_MIN_H, SLIDER_MAX_H,
            SLIDER_MIN_W, SLIDER_MAX_W,
@@ -124,7 +119,6 @@ def handle_image_upload_for_dims_wan(uploaded_pil_image: Image.Image | None, cur
         return gr.update(value=new_h), gr.update(value=new_w)
     except Exception as e:
         logger.error(f"Error auto-adjusting H/W from image: {e}", exc_info=True)
-        # On error, revert to defaults or keep current. Defaults are safer.
         return gr.update(value=DEFAULT_H_SLIDER_VALUE), gr.update(value=DEFAULT_W_SLIDER_VALUE)
 
 
@@ -145,7 +139,6 @@ def generate_video(input_image: Image.Image, prompt: str, negative_prompt: str,
     logger.info(f" Num Frames: {num_frames}, FPS for conditioning & export: {fps_for_conditioning_and_export}")
     logger.info(f" Guidance Scale: {guidance_scale}, Steps: {steps}")
 
-
     target_height = int(height)
     target_width = int(width)
     num_frames = int(num_frames)
@@ -153,11 +146,21 @@ def generate_video(input_image: Image.Image, prompt: str, negative_prompt: str,
     guidance_scale_val = float(guidance_scale)
     steps_val = int(steps)
 
-    #
+    # Ensure dimensions are compatible (already handled by slider steps and auto-adjustment)
+    if target_height % MOD_VALUE_H != 0:
+        logger.warning(f"Height {target_height} is not a multiple of {MOD_VALUE_H}. Adjusting...")
+        target_height = (target_height // MOD_VALUE_H) * MOD_VALUE_H
+    if target_width % MOD_VALUE_W != 0:
+        logger.warning(f"Width {target_width} is not a multiple of {MOD_VALUE_W}. Adjusting...")
+        target_width = (target_width // MOD_VALUE_W) * MOD_VALUE_W
+
+    target_height = max(MOD_VALUE_H, target_height) # Ensure minimum size
+    target_width = max(MOD_VALUE_W, target_width) # Ensure minimum size
+
+
     resized_image = input_image.resize((target_width, target_height))
     logger.info(f" Input image resized to: {resized_image.size} for pipeline input.")
 
-
     with torch.inference_mode():
         output_frames_list = pipe(
             image=resized_image,
@@ -168,14 +171,14 @@ def generate_video(input_image: Image.Image, prompt: str, negative_prompt: str,
             num_frames=num_frames,
             guidance_scale=guidance_scale_val,
             num_inference_steps=steps_val,
-            fps=fps_val,
-            generator=torch.Generator(device="cuda").manual_seed(0)
+            fps=fps_val,
+            generator=torch.Generator(device="cuda").manual_seed(0)
         ).frames[0]
 
     with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
         video_path = tmpfile.name
 
-    export_to_video(output_frames_list, video_path, fps=fps_val)
+    export_to_video(output_frames_list, video_path, fps=fps_val)
     logger.info(f"Video successfully generated and saved to {video_path}")
     return video_path
 
@@ -184,7 +187,6 @@ default_prompt_i2v = "make this image come alive, cinematic motion, smooth anima
 default_negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards, watermark, text, signature"
 penguin_image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/penguin.png"
 
-
 with gr.Blocks() as demo:
     gr.Markdown(f"""
     # Image-to-Video with Wan 2.1 I2V (14B) + CausVid LoRA
@@ -205,6 +207,7 @@ with gr.Blocks() as demo:
                 lines=3
             )
             with gr.Row():
+                # Sliders use the general MOD_VALUE since MOD_VALUE_H and MOD_VALUE_W are the same here
                 height_input = gr.Slider(minimum=SLIDER_MIN_H, maximum=SLIDER_MAX_H, step=MOD_VALUE, value=DEFAULT_H_SLIDER_VALUE, label=f"Output Height (multiple of {MOD_VALUE})")
                 width_input = gr.Slider(minimum=SLIDER_MIN_W, maximum=SLIDER_MAX_W, step=MOD_VALUE, value=DEFAULT_W_SLIDER_VALUE, label=f"Output Width (multiple of {MOD_VALUE})")
             with gr.Row():
@@ -218,10 +221,9 @@ with gr.Blocks() as demo:
         with gr.Column(scale=3):
             video_output = gr.Video(label="Generated Video", interactive=False)
 
-    # Event handler for image upload/clear to adjust H/W sliders
     input_image_component.change(
         fn=handle_image_upload_for_dims_wan,
-        inputs=[input_image_component, height_input, width_input],
+        inputs=[input_image_component, height_input, width_input],
         outputs=[height_input, width_input]
     )
 
@@ -245,7 +247,7 @@ with gr.Blocks() as demo:
 
     gr.Examples(
         examples=[
-            [penguin_image_url, "a penguin playfully dancing in the snow, Antarctica", default_negative_prompt, DEFAULT_H_SLIDER_VALUE, DEFAULT_W_SLIDER_VALUE, 25, 1.0, 4, 16]
+            [penguin_image_url, "a penguin playfully dancing in the snow, Antarctica", default_negative_prompt, DEFAULT_H_SLIDER_VALUE, DEFAULT_W_SLIDER_VALUE, 25, 1.0, 4, 16],
         ],
        inputs=inputs_for_click_and_examples,
        outputs=video_output,
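Taken together, the commit replaces the derived MOD_VALUE with a fixed 128-pixel step and snaps the height/width sliders to that step within a 128-720 by 128-1280 range around a 384x640 target area. Below is a minimal, standalone sketch of that dimension-snapping idea; the constants mirror the new code, but the helper name snap_dimensions and the example input are illustrative only and not part of the Space.

import numpy as np

MOD_VALUE = 128                          # fixed step used by the new code
SLIDER_MIN_H, SLIDER_MAX_H = 128, 720
SLIDER_MIN_W, SLIDER_MAX_W = 128, 1280
DEFAULT_TARGET_AREA = float(384 * 640)   # default H x W from the new code

def snap_dimensions(orig_w: int, orig_h: int,
                    mod_val: int = MOD_VALUE,
                    target_area: float = DEFAULT_TARGET_AREA) -> tuple[int, int]:
    """Pick an output H/W near target_area that keeps the input aspect ratio,
    is a multiple of mod_val, and stays inside the slider limits."""
    aspect_ratio = orig_h / orig_w
    # Ideal H/W for the target area at this aspect ratio.
    ideal_h = np.sqrt(target_area * aspect_ratio)
    ideal_w = np.sqrt(target_area / aspect_ratio)
    # Snap to the nearest multiple of mod_val, never below one unit.
    calc_h = max(mod_val, round(ideal_h / mod_val) * mod_val)
    calc_w = max(mod_val, round(ideal_w / mod_val) * mod_val)
    # Clamp to the slider range.
    new_h = int(np.clip(calc_h, SLIDER_MIN_H, SLIDER_MAX_H))
    new_w = int(np.clip(calc_w, SLIDER_MIN_W, SLIDER_MAX_W))
    return new_h, new_w

print(snap_dimensions(1024, 768))  # 4:3 input -> (384, 512) with these settings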