multimodalart (HF Staff) committed · Commit 12d6cf5 · verified · 1 Parent(s): a2858d9

Update app.py

Files changed (1):
  1. app.py (+33, -31)
app.py CHANGED
@@ -9,7 +9,7 @@ import spaces # Assuming this is for Hugging Face Spaces GPU decorator
 from huggingface_hub import hf_hub_download
 import logging
 import numpy as np
-from PIL import Image # Added for type hinting
+from PIL import Image
 
 # --- Global Model Loading & LoRA Handling ---
 MODEL_ID = "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"
@@ -57,22 +57,22 @@ pipe.load_lora_weights(causvid_path, adapter_name="causvid_lora")
 logger.info("Setting LoRA adapter...")
 pipe.set_adapters(["causvid_lora"], adapter_weights=[1.0])
 
-# MOD_VALUE for height/width constraints
-# From WanImageToVideoPipeline docs: height/width must be multiple of vae_scale_factor * transformer.config.patch_size[1 or 2]
-MOD_VALUE = pipe.vae_scale_factor * pipe.transformer.config.patch_size[1] # e.g., 8 * 16 = 128
-logger.info(f"Derived MOD_VALUE for dimensions: {MOD_VALUE}")
+MOD_VALUE = 128
 
+logger.info(f"VAE spatial scale factor: {vae_spatial_scale_factor}")
+logger.info(f"Transformer patch HxW: {transformer_patch_height}x{transformer_patch_width}")
+logger.info(f"Calculated MOD_VALUE for H step: {MOD_VALUE_H}, for W step: {MOD_VALUE_W}")
+logger.info(f"Using general MOD_VALUE for slider steps: {MOD_VALUE}")
 
-# --- Helper functions and constants for automatic dimension adjustment ---
-# These constants must match the Gradio slider definitions below
-DEFAULT_H_SLIDER_VALUE = 384
-DEFAULT_W_SLIDER_VALUE = 640
+
+DEFAULT_H_SLIDER_VALUE = 384  # (3 * 128)
+DEFAULT_W_SLIDER_VALUE = 640  # (5 * 128)
 DEFAULT_TARGET_AREA = float(DEFAULT_H_SLIDER_VALUE * DEFAULT_W_SLIDER_VALUE)
 
 SLIDER_MIN_H = 128
-SLIDER_MAX_H = 512
+SLIDER_MAX_H = 720
 SLIDER_MIN_W = 128
-SLIDER_MAX_W = 1024
+SLIDER_MAX_W = 1280
 
 def _calculate_new_dimensions_wan(pil_image: Image.Image, mod_val: int, target_area: float,
                                   min_h: int, max_h: int, min_w: int, max_w: int,
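
Note: the new code hardcodes MOD_VALUE = 128 but logs vae_spatial_scale_factor, transformer_patch_height/transformer_patch_width, and MOD_VALUE_H/MOD_VALUE_W, whose definitions sit outside the visible hunks. A minimal sketch of how those values could be derived, reconstructed from the removed line (the (temporal, height, width) ordering of patch_size is an assumption suggested by the old index [1]):

    # Sketch only — reconstructs the removed derivation; patch_size layout is assumed.
    vae_spatial_scale_factor = pipe.vae_scale_factor                   # e.g. 8
    transformer_patch_height = pipe.transformer.config.patch_size[1]   # e.g. 16
    transformer_patch_width = pipe.transformer.config.patch_size[2]    # e.g. 16
    MOD_VALUE_H = vae_spatial_scale_factor * transformer_patch_height  # e.g. 8 * 16 = 128
    MOD_VALUE_W = vae_spatial_scale_factor * transformer_patch_width   # e.g. 8 * 16 = 128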
@@ -85,19 +85,15 @@ def _calculate_new_dimensions_wan(pil_image: Image.Image, mod_val: int, target_a
 
     aspect_ratio = orig_h / orig_w
 
-    # Calculate ideal dimensions for the target area, maintaining aspect ratio
     ideal_h = np.sqrt(target_area * aspect_ratio)
     ideal_w = np.sqrt(target_area / aspect_ratio)
 
-    # Round to nearest multiple of mod_val
     calc_h = round(ideal_h / mod_val) * mod_val
     calc_w = round(ideal_w / mod_val) * mod_val
 
-    # Ensure dimensions are at least mod_val (smallest valid multiple)
-    calc_h = mod_val if calc_h == 0 else calc_h
-    calc_w = mod_val if calc_w == 0 else calc_w
+    calc_h = mod_val if calc_h < mod_val else calc_h  # Ensure at least one mod_val unit
+    calc_w = mod_val if calc_w < mod_val else calc_w  # Ensure at least one mod_val unit
 
-    # Clamp to slider limits
     new_h = int(np.clip(calc_h, min_h, max_h))
     new_w = int(np.clip(calc_w, min_w, max_w))
 
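Note: a self-contained worked example of the sizing logic above (the upload size is hypothetical; the constants are the ones set in this commit):

    import numpy as np

    orig_w, orig_h = 1024, 768                     # hypothetical 4:3 upload
    target_area, mod_val = 384.0 * 640.0, 128      # defaults from this commit
    aspect_ratio = orig_h / orig_w                 # 0.75
    ideal_h = np.sqrt(target_area * aspect_ratio)  # ~429.3
    ideal_w = np.sqrt(target_area / aspect_ratio)  # ~572.4
    calc_h = round(ideal_h / mod_val) * mod_val    # 384
    calc_w = round(ideal_w / mod_val) * mod_val    # 512
    print(calc_h, calc_w)                          # 384 512 — already within the slider limits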
@@ -108,14 +104,13 @@ def handle_image_upload_for_dims_wan(uploaded_pil_image: Image.Image | None, cur
     return new_h, new_w
 
 def handle_image_upload_for_dims_wan(uploaded_pil_image: Image.Image | None, current_h_val: int, current_w_val: int):
-    if uploaded_pil_image is None: # Image cleared by user
+    if uploaded_pil_image is None:
         logger.info("Image cleared. Resetting dimensions to default slider values.")
         return gr.update(value=DEFAULT_H_SLIDER_VALUE), gr.update(value=DEFAULT_W_SLIDER_VALUE)
-
     try:
         new_h, new_w = _calculate_new_dimensions_wan(
             uploaded_pil_image,
-            MOD_VALUE,
+            MOD_VALUE, # Use the globally determined MOD_VALUE
             DEFAULT_TARGET_AREA,
             SLIDER_MIN_H, SLIDER_MAX_H,
             SLIDER_MIN_W, SLIDER_MAX_W,
@@ -124,7 +119,6 @@ def handle_image_upload_for_dims_wan(uploaded_pil_image: Image.Image | None, cur
         return gr.update(value=new_h), gr.update(value=new_w)
     except Exception as e:
         logger.error(f"Error auto-adjusting H/W from image: {e}", exc_info=True)
-        # On error, revert to defaults or keep current. Defaults are safer.
         return gr.update(value=DEFAULT_H_SLIDER_VALUE), gr.update(value=DEFAULT_W_SLIDER_VALUE)
 
 
@@ -145,7 +139,6 @@ def generate_video(input_image: Image.Image, prompt: str, negative_prompt: str,
     logger.info(f" Num Frames: {num_frames}, FPS for conditioning & export: {fps_for_conditioning_and_export}")
     logger.info(f" Guidance Scale: {guidance_scale}, Steps: {steps}")
 
-
     target_height = int(height)
     target_width = int(width)
     num_frames = int(num_frames)
@@ -153,11 +146,21 @@ def generate_video(input_image: Image.Image, prompt: str, negative_prompt: str,
     guidance_scale_val = float(guidance_scale)
     steps_val = int(steps)
 
-    # Resize the input PIL image to the target dimensions for the pipeline
+    # Ensure dimensions are compatible (already handled by slider steps and auto-adjustment)
+    if target_height % MOD_VALUE_H != 0:
+        logger.warning(f"Height {target_height} is not a multiple of {MOD_VALUE_H}. Adjusting...")
+        target_height = (target_height // MOD_VALUE_H) * MOD_VALUE_H
+    if target_width % MOD_VALUE_W != 0:
+        logger.warning(f"Width {target_width} is not a multiple of {MOD_VALUE_W}. Adjusting...")
+        target_width = (target_width // MOD_VALUE_W) * MOD_VALUE_W
+
+    target_height = max(MOD_VALUE_H, target_height)  # Ensure minimum size
+    target_width = max(MOD_VALUE_W, target_width)  # Ensure minimum size
+
     resized_image = input_image.resize((target_width, target_height))
     logger.info(f" Input image resized to: {resized_image.size} for pipeline input.")
 
-
     with torch.inference_mode():
         output_frames_list = pipe(
             image=resized_image,
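
Note: the added floor-and-clamp guard is not redundant, because the new SLIDER_MAX_H = 720 is not a multiple of 128 (SLIDER_MAX_W = 1280 is 10 × 128). A height at the slider maximum therefore gets snapped down to 640 at generation time. The same logic as a tiny self-checking helper (the helper name is hypothetical):

    def snap_down_to_multiple(value: int, mod: int) -> int:
        # Floor to the nearest multiple of mod, never below one mod unit.
        return max(mod, (value // mod) * mod)

    assert snap_down_to_multiple(720, 128) == 640    # slider max height snaps down
    assert snap_down_to_multiple(1280, 128) == 1280  # slider max width passes through
    assert snap_down_to_multiple(100, 128) == 128    # never below one mod unit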
@@ -168,14 +171,14 @@ def generate_video(input_image: Image.Image, prompt: str, negative_prompt: str,
             num_frames=num_frames,
             guidance_scale=guidance_scale_val,
             num_inference_steps=steps_val,
-            fps=fps_val, # For conditioning
-            generator=torch.Generator(device="cuda").manual_seed(0) # For reproducibility
+            fps=fps_val,
+            generator=torch.Generator(device="cuda").manual_seed(0)
         ).frames[0]
 
     with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
         video_path = tmpfile.name
 
-    export_to_video(output_frames_list, video_path, fps=fps_val) # For export
+    export_to_video(output_frames_list, video_path, fps=fps_val)
     logger.info(f"Video successfully generated and saved to {video_path}")
     return video_path
 
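Note: the generator stays seeded with a fixed 0, so identical inputs always produce the same video. A sketch of how seeding could be made user-configurable instead (helper and parameter names are hypothetical, not part of this commit):

    import torch

    def make_generator(seed: int = 0, device: str = "cuda") -> torch.Generator:
        # Hypothetical helper: a Gradio seed input could be threaded through
        # generate_video and passed here instead of the hardcoded 0.
        return torch.Generator(device=device).manual_seed(int(seed))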
@@ -184,7 +187,6 @@ default_prompt_i2v = "make this image come alive, cinematic motion, smooth anima
 default_negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards, watermark, text, signature"
 penguin_image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/penguin.png"
 
-
 with gr.Blocks() as demo:
     gr.Markdown(f"""
     # Image-to-Video with Wan 2.1 I2V (14B) + CausVid LoRA
@@ -205,6 +207,7 @@ with gr.Blocks() as demo:
                 lines=3
             )
             with gr.Row():
+                # Sliders use the general MOD_VALUE since MOD_VALUE_H and MOD_VALUE_W are the same here
                 height_input = gr.Slider(minimum=SLIDER_MIN_H, maximum=SLIDER_MAX_H, step=MOD_VALUE, value=DEFAULT_H_SLIDER_VALUE, label=f"Output Height (multiple of {MOD_VALUE})")
                 width_input = gr.Slider(minimum=SLIDER_MIN_W, maximum=SLIDER_MAX_W, step=MOD_VALUE, value=DEFAULT_W_SLIDER_VALUE, label=f"Output Width (multiple of {MOD_VALUE})")
             with gr.Row():
@@ -218,10 +221,9 @@ with gr.Blocks() as demo:
         with gr.Column(scale=3):
             video_output = gr.Video(label="Generated Video", interactive=False)
 
-    # Event handler for image upload/clear to adjust H/W sliders
     input_image_component.change(
         fn=handle_image_upload_for_dims_wan,
-        inputs=[input_image_component, height_input, width_input], # Pass current slider values
+        inputs=[input_image_component, height_input, width_input],
         outputs=[height_input, width_input]
     )
 
@@ -245,7 +247,7 @@ with gr.Blocks() as demo:
 
     gr.Examples(
         examples=[
-            [penguin_image_url, "a penguin playfully dancing in the snow, Antarctica", default_negative_prompt, DEFAULT_H_SLIDER_VALUE, DEFAULT_W_SLIDER_VALUE, 25, 1.0, 4, 16]
+            [penguin_image_url, "a penguin playfully dancing in the snow, Antarctica", default_negative_prompt, DEFAULT_H_SLIDER_VALUE, DEFAULT_W_SLIDER_VALUE, 25, 1.0, 4, 16],
         ],
         inputs=inputs_for_click_and_examples,
         outputs=video_output,
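
Note: inputs_for_click_and_examples is defined outside the visible hunks. From the example row and generate_video's logging, the trailing values presumably map to num_frames=25, guidance_scale=1.0, steps=4, fps=16. A sketch of the assumed wiring (component names are hypothetical except those shown in this diff):

    inputs_for_click_and_examples = [
        input_image_component,   # shown in this diff
        prompt_input,            # hypothetical name
        negative_prompt_input,   # hypothetical name
        height_input,            # shown in this diff
        width_input,             # shown in this diff
        num_frames_input,        # hypothetical; 25 in the example row
        guidance_scale_input,    # hypothetical; 1.0 in the example row
        steps_input,             # hypothetical; 4 in the example row
        fps_input,               # hypothetical; 16 in the example row
    ]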
 