DeepBeepMeep committed
Commit 6d57bc7 · 1 Parent(s): 26a6613

fixed vace bugs

wan/diffusion_forcing.py CHANGED
@@ -15,6 +15,7 @@ from .modules.model import WanModel
 from .modules.t5 import T5EncoderModel
 from .modules.vae import WanVAE
 from wan.modules.posemb_layers import get_rotary_pos_embed
+from wan.utils.utils import calculate_new_dimensions
 from .utils.fm_solvers import (FlowDPMSolverMultistepScheduler,
                                get_sampling_sigmas, retrieve_timesteps)
 from .utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
@@ -190,6 +191,7 @@ class DTT2V:
         input_video = None,
         height: int = 480,
         width: int = 832,
+        fit_into_canvas = True,
         num_frames: int = 97,
         num_inference_steps: int = 50,
         shift: float = 1.0,
@@ -221,15 +223,16 @@ class DTT2V:
         i2v_extra_kwrags = {}
         prefix_video = None
         predix_video_latent_length = 0
+
         if input_video != None:
             _ , _ , height, width = input_video.shape
         elif image != None:
             image = image[0]
             frame_width, frame_height = image.size
-            scale = min(height / frame_height, width / frame_width)
-            height = (int(frame_height * scale) // 16) * 16
-            width = (int(frame_width * scale) // 16) * 16
+            height, width = calculate_new_dimensions(height, width, frame_height, frame_width, fit_into_canvas)
             image = np.array(image.resize((width, height))).transpose(2, 0, 1)
+
+
         latent_length = (num_frames - 1) // 4 + 1
         latent_height = height // 8
         latent_width = width // 8
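
Net effect of this hunk: DTT2V.generate() now accepts fit_into_canvas and delegates image-prompt sizing to the shared calculate_new_dimensions helper instead of an inline min-scale computation. A minimal sketch of the new sizing path, assuming a PIL image prompt and this commit applied (the function name resize_image_prompt is illustrative, not part of the repo):

    import numpy as np
    from PIL import Image
    from wan.utils.utils import calculate_new_dimensions

    def resize_image_prompt(image: Image.Image, height: int, width: int, fit_into_canvas: bool = True):
        frame_width, frame_height = image.size
        # the helper snaps both sides to multiples of 16, as in the diff above
        height, width = calculate_new_dimensions(height, width, frame_height, frame_width, fit_into_canvas)
        return np.array(image.resize((width, height))).transpose(2, 0, 1), height, width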
wan/image2video.py CHANGED
@@ -25,7 +25,7 @@ from .utils.fm_solvers import (FlowDPMSolverMultistepScheduler,
                                get_sampling_sigmas, retrieve_timesteps)
 from .utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
 from wan.modules.posemb_layers import get_rotary_pos_embed
-from wan.utils.utils import resize_lanczos
+from wan.utils.utils import resize_lanczos, calculate_new_dimensions
 
 def optimized_scale(positive_flat, negative_flat):
 
@@ -120,7 +120,7 @@ class WanI2V:
         img2 = None,
         height =720,
         width = 1280,
-        max_area=720 * 1280,
+        fit_into_canvas = True,
         frame_num=81,
         shift=5.0,
         sample_solver='unipc',
@@ -188,22 +188,16 @@ class WanI2V:
         if add_frames_for_end_image:
             frame_num +=1
         lat_frames = int((frame_num - 2) // self.vae_stride[0] + 2)
-
-
+
         h, w = img.shape[1:]
-        # aspect_ratio = h / w
-
-        scale1  = min(height / h, width / w)
-        scale2  = min(height / h, width / w)
-        scale = max(scale1, scale2)
-        new_height = int(h * scale)
-        new_width = int(w * scale)
 
+        h, w = calculate_new_dimensions(height, width, h, w, fit_into_canvas)
+
         lat_h = round(
-            new_height // self.vae_stride[1] //
+            h // self.vae_stride[1] //
             self.patch_size[1] * self.patch_size[1])
         lat_w = round(
-            new_width // self.vae_stride[2] //
+            w // self.vae_stride[2] //
             self.patch_size[2] * self.patch_size[2])
         h = lat_h * self.vae_stride[1]
         w = lat_w * self.vae_stride[2]
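
With max_area removed, WanI2V derives the latent grid directly from the canvas-adjusted h and w. A rough check of the arithmetic, assuming the common vae_stride=(4, 8, 8) and patch_size=(1, 2, 2) values (assumed defaults, not stated in this diff):

    vae_stride = (4, 8, 8)
    patch_size = (1, 2, 2)
    h, w = 480, 832  # already multiples of 16 after calculate_new_dimensions
    lat_h = round(h // vae_stride[1] // patch_size[1] * patch_size[1])  # 60
    lat_w = round(w // vae_stride[2] // patch_size[2] * patch_size[2])  # 104
    h, w = lat_h * vae_stride[1], lat_w * vae_stride[2]                 # 480, 832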
wan/modules/model.py CHANGED
@@ -963,7 +963,7 @@ class WanModel(ModelMixin, ConfigMixin):
             hints_list = [None ] *len(x_list)
         else:
             # Vace embeddings
-            c = [self.vace_patch_embedding(u.unsqueeze(0)) for u in vace_context]
+            c = [self.vace_patch_embedding(u.to(self.vace_patch_embedding.weight.dtype).unsqueeze(0)) for u in vace_context]
             c = [u.flatten(2).transpose(1, 2) for u in c]
             c = c[0]
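
The one-line VACE fix above is a dtype alignment: each context tensor is cast to the patch-embedding weight dtype before the Conv3d runs, so a context produced in another precision no longer trips a dtype-mismatch error. A stand-alone sketch of the same pattern, with placeholder channel sizes rather than the real WanModel configuration:

    import torch
    import torch.nn as nn

    vace_patch_embedding = nn.Conv3d(96, 1536, kernel_size=(1, 2, 2), stride=(1, 2, 2))  # float32 weights
    vace_context = [torch.randn(96, 4, 60, 104, dtype=torch.float64)]  # simulated precision mismatch

    # cast to the embedding's weight dtype before embedding, as in the diff
    c = [vace_patch_embedding(u.to(vace_patch_embedding.weight.dtype).unsqueeze(0)) for u in vace_context]
    c = [u.flatten(2).transpose(1, 2) for u in c]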
 
wan/text2video.py CHANGED
@@ -177,15 +177,16 @@ class WanT2V:
     def vace_latent(self, z, m):
         return [torch.cat([zz, mm], dim=0) for zz, mm in zip(z, m)]
 
-    def prepare_source(self, src_video, src_mask, src_ref_images, total_frames, image_size, device, original_video = False, keep_frames= [], start_frame = 0, pre_src_video = None):
+    def prepare_source(self, src_video, src_mask, src_ref_images, total_frames, image_size, device, original_video = False, keep_frames= [], start_frame = 0, fit_into_canvas = True, pre_src_video = None):
         image_sizes = []
         trim_video = len(keep_frames)
+        canvas_height, canvas_width = image_size
 
         for i, (sub_src_video, sub_src_mask, sub_pre_src_video) in enumerate(zip(src_video, src_mask,pre_src_video)):
             prepend_count = 0 if sub_pre_src_video == None else sub_pre_src_video.shape[1]
             num_frames = total_frames - prepend_count
             if sub_src_mask is not None and sub_src_video is not None:
-                src_video[i], src_mask[i], _, _, _ = self.vid_proc.load_video_pair(sub_src_video, sub_src_mask, max_frames= num_frames, trim_video = trim_video - prepend_count, start_frame = start_frame)
+                src_video[i], src_mask[i], _, _, _ = self.vid_proc.load_video_pair(sub_src_video, sub_src_mask, max_frames= num_frames, trim_video = trim_video - prepend_count, start_frame = start_frame, canvas_height = canvas_height, canvas_width = canvas_width, fit_into_canvas = fit_into_canvas)
                 # src_video is [-1, 1], 0 = inpainting area (in fact 127 in [0, 255])
                 # src_mask is [-1, 1], 0 = preserve original video (in fact 127 in [0, 255]) and 1 = Inpainting (in fact 255 in [0, 255])
                 src_video[i] = src_video[i].to(device)
@@ -208,7 +209,7 @@ class WanT2V:
                 src_mask[i] = torch.ones_like(src_video[i], device=device)
                 image_sizes.append(image_size)
             else:
-                src_video[i], _, _, _ = self.vid_proc.load_video(sub_src_video, max_frames= num_frames, trim_video = trim_video - prepend_count, start_frame = start_frame)
+                src_video[i], _, _, _ = self.vid_proc.load_video(sub_src_video, max_frames= num_frames, trim_video = trim_video - prepend_count, start_frame = start_frame, canvas_height = canvas_height, canvas_width = canvas_width, fit_into_canvas = fit_into_canvas)
                 src_video[i] = src_video[i].to(device)
                 src_mask[i] = torch.zeros_like(src_video[i], device=device) if original_video else torch.ones_like(src_video[i], device=device)
                 if prepend_count > 0:
@@ -277,6 +278,7 @@ class WanT2V:
         target_camera=None,
         context_scale=1.0,
         size=(1280, 720),
+        fit_into_canvas = True,
         frame_num=81,
         shift=5.0,
         sample_solver='unipc',
@@ -430,7 +432,7 @@ class WanT2V:
             kwargs.update({'cam_emb': cam_emb})
 
         if vace:
-            ref_images_count = len(input_ref_images[0]) if input_ref_images != None else 0
+            ref_images_count = len(input_ref_images[0]) if input_ref_images != None and input_ref_images[0] != None else 0
             kwargs.update({'vace_context' : z, 'vace_context_scale' : context_scale})
             if overlapped_latents > 0:
                 z_reactive = [ zz[0:16, ref_images_count:overlapped_latents + ref_images_count].clone() for zz in z]
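
The extra check on ref_images_count is one of the VACE fixes in this commit: when VACE is invoked without reference images, input_ref_images arrives as [None] and the old expression crashed with len(None). A small, self-contained illustration of the changed expression:

    input_ref_images = [None]  # VACE call without reference images

    # old: len(input_ref_images[0]) if input_ref_images != None else 0  -> TypeError
    ref_images_count = len(input_ref_images[0]) if input_ref_images != None and input_ref_images[0] != None else 0
    print(ref_images_count)  # 0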
wan/utils/utils.py CHANGED
@@ -67,7 +67,17 @@ def remove_background(img, session=None):
     return torch.from_numpy(np.array(img).astype(np.float32) / 255.0).movedim(-1, 0)
 
 
+def calculate_new_dimensions(canvas_height, canvas_width, height, width, fit_into_canvas):
+    if fit_into_canvas:
+        scale1 = min(canvas_height / height, canvas_width / width)
+        scale2 = min(canvas_width / height, canvas_height / width)
+        scale = max(scale1, scale2)
+    else:
+        scale = (canvas_height * canvas_width / (height * width))**(1/2)
 
+    new_height = round( height * scale / 16) * 16
+    new_width = round( width * scale / 16) * 16
+    return new_height, new_width
 
 def resize_and_remove_background(img_list, budget_width, budget_height, rm_background, fit_into_canvas = False ):
     if rm_background:
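
calculate_new_dimensions has two modes: with fit_into_canvas set, the source is scaled to fit inside the canvas (taking the larger of the two candidate scales); otherwise only the canvas pixel budget is matched while the source aspect ratio is preserved. Both results are snapped to multiples of 16. A quick numeric check with illustrative values, assuming this commit is applied:

    from wan.utils.utils import calculate_new_dimensions

    # canvas (h, w) = (480, 832), source frame (h, w) = (720, 1280)
    print(calculate_new_dimensions(480, 832, 720, 1280, True))
    # scale1 = min(480/720, 832/1280) = 0.65 wins over scale2 -> (464, 832): fits inside the canvas
    print(calculate_new_dimensions(480, 832, 720, 1280, False))
    # scale = sqrt(480*832 / (720*1280)) ~= 0.658 -> (480, 848): same pixel budget, width may exceed the canvas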
wan/utils/vace_preprocessor.py CHANGED
@@ -5,6 +5,7 @@ from PIL import Image
 import torch
 import torch.nn.functional as F
 import torchvision.transforms.functional as TF
+from .utils import calculate_new_dimensions
 
 
 class VaceImageProcessor(object):
@@ -182,53 +183,22 @@ class VaceVideoProcessor(object):
 
 
 
-    def _get_frameid_bbox_adjust_last(self, fps, video_frames_count, h, w, crop_box, rng, max_frames= 0, start_frame =0):
+    def _get_frameid_bbox_adjust_last(self, fps, video_frames_count, canvas_height, canvas_width, h, w, fit_into_canvas, crop_box, rng, max_frames= 0, start_frame =0):
         from wan.utils.utils import resample
 
         target_fps = self.max_fps
 
-        # video_frames_count = len(frame_timestamps)
-
         frame_ids= resample(fps, video_frames_count, max_frames, target_fps, start_frame )
 
         x1, x2, y1, y2 = [0, w, 0, h] if crop_box is None else crop_box
         h, w = y2 - y1, x2 - x1
-        ratio = h / w
-        df, dh, dw = self.downsample
-        seq_len = self.seq_len
-        # min/max area of the [latent video]
-        min_area_z = self.min_area / (dh * dw)
-        # max_area_z = min(seq_len, self.max_area / (dh * dw), (h // dh) * (w // dw))
-        max_area_z = min_area_z # workaround bug
-        # sample a frame number of the [latent video]
-        rand_area_z = np.square(np.power(2, rng.uniform(
-            np.log2(np.sqrt(min_area_z)),
-            np.log2(np.sqrt(max_area_z))
-        )))
-
-        seq_len = max_area_z * ((max_frames- start_frame - 1) // df +1)
-
-        # of = min(
-        #     (len(frame_ids) - 1) // df + 1,
-        #     int(seq_len / rand_area_z)
-        # )
-        of = (len(frame_ids) - 1) // df + 1
-
-
-        # deduce target shape of the [latent video]
-        # target_area_z = min(max_area_z, int(seq_len / of))
-        target_area_z = max_area_z
-        oh = round(np.sqrt(target_area_z * ratio))
-        ow = int(target_area_z / oh)
-        of = (of - 1) * df + 1
-        oh *= dh
-        ow *= dw
+        oh, ow = calculate_new_dimensions(canvas_height, canvas_width, h, w, fit_into_canvas)
 
         return frame_ids, (x1, x2, y1, y2), (oh, ow), target_fps
 
-    def _get_frameid_bbox(self, fps, video_frames_count, h, w, crop_box, rng, max_frames= 0, start_frame= 0):
+    def _get_frameid_bbox(self, fps, video_frames_count, h, w, crop_box, rng, max_frames= 0, start_frame= 0, canvas_height = 0, canvas_width = 0, fit_into_canvas= True):
         if self.keep_last:
-            return self._get_frameid_bbox_adjust_last(fps, video_frames_count, h, w, crop_box, rng, max_frames= max_frames, start_frame= start_frame)
+            return self._get_frameid_bbox_adjust_last(fps, video_frames_count, canvas_height, canvas_width, h, w, fit_into_canvas, crop_box, rng, max_frames= max_frames, start_frame= start_frame)
         else:
            return self._get_frameid_bbox_default(fps, video_frames_count, h, w, crop_box, rng, max_frames= max_frames)
 
@@ -238,7 +208,7 @@ class VaceVideoProcessor(object):
     def load_video_pair(self, data_key, data_key2, crop_box=None, seed=2024, **kwargs):
         return self.load_video_batch(data_key, data_key2, crop_box=crop_box, seed=seed, **kwargs)
 
-    def load_video_batch(self, *data_key_batch, crop_box=None, seed=2024, max_frames= 0, trim_video =0, start_frame = 0, **kwargs):
+    def load_video_batch(self, *data_key_batch, crop_box=None, seed=2024, max_frames= 0, trim_video =0, start_frame = 0, canvas_height = 0, canvas_width = 0, fit_into_canvas = False, **kwargs):
         rng = np.random.default_rng(seed + hash(data_key_batch[0]) % 10000)
         # read video
         import decord
@@ -269,7 +239,7 @@ class VaceVideoProcessor(object):
             h, w = src_video.shape[1:3]
         else:
             h, w = readers[0].next().shape[:2]
-        frame_ids, (x1, x2, y1, y2), (oh, ow), fps = self._get_frameid_bbox(fps, length, h, w, crop_box, rng, max_frames=max_frames, start_frame = start_frame )
+        frame_ids, (x1, x2, y1, y2), (oh, ow), fps = self._get_frameid_bbox(fps, length, h, w, crop_box, rng, canvas_height = canvas_height, canvas_width = canvas_width, fit_into_canvas = fit_into_canvas, max_frames=max_frames, start_frame = start_frame )
 
         # preprocess video
         videos = [reader.get_batch(frame_ids)[:, y1:y2, x1:x2, :] for reader in readers]
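
The latent-area sampling that used to set (oh, ow) in _get_frameid_bbox_adjust_last is gone; the output size now comes straight from calculate_new_dimensions applied to the (possibly cropped) source frame. A hedged, stand-alone sketch of that sizing step with made-up frame and canvas values (this commit assumed applied):

    from wan.utils.utils import calculate_new_dimensions

    h, w = 1080, 1920            # hypothetical source frame
    crop_box = None
    x1, x2, y1, y2 = [0, w, 0, h] if crop_box is None else crop_box
    h, w = y2 - y1, x2 - x1
    oh, ow = calculate_new_dimensions(480, 832, h, w, True)  # canvas 480x832
    print(oh, ow)  # 464 832 -> the size the selected frames are resized to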
wgp.py CHANGED
@@ -84,7 +84,6 @@ def format_time(seconds):
     hours = int(seconds // 3600)
     minutes = int((seconds % 3600) // 60)
     return f"{hours}h {minutes}m"
-
 def pil_to_base64_uri(pil_image, format="png", quality=75):
     if pil_image is None:
         return None
@@ -275,12 +274,12 @@ def process_prompt_and_add_tasks(state, model_choice):
         video_guide = inputs["video_guide"]
         video_mask = inputs["video_mask"]
 
-        if "1.3B" in model_filename :
-            resolution_reformated = str(height) + "*" + str(width)
-            if not resolution_reformated in VACE_SIZE_CONFIGS:
-                res = (" and ").join(VACE_SIZE_CONFIGS.keys())
-                gr.Info(f"Video Resolution for Vace model is not supported. Only {res} resolutions are allowed.")
-                return
+        # if "1.3B" in model_filename :
+        #     resolution_reformated = str(height) + "*" + str(width)
+        #     if not resolution_reformated in VACE_SIZE_CONFIGS:
+        #         res = (" and ").join(VACE_SIZE_CONFIGS.keys())
+        #         gr.Info(f"Video Resolution for Vace model is not supported. Only {res} resolutions are allowed.")
+        #         return
         if "I" in video_prompt_type:
             if image_refs == None:
                 gr.Info("You must provide at least one Refererence Image")
@@ -1995,7 +1994,8 @@ def apply_changes( state,
                     boost_choice = 1,
                     clear_file_list = 0,
                     preload_model_policy_choice = 1,
-                    UI_theme_choice = "default"
+                    UI_theme_choice = "default",
+                    fit_canvas_choice = 0
 ):
     if args.lock_config:
         return
@@ -2016,7 +2016,8 @@ def apply_changes( state,
         "boost" : boost_choice,
         "clear_file_list" : clear_file_list,
         "preload_model_policy" : preload_model_policy_choice,
-        "UI_theme" : UI_theme_choice
+        "UI_theme" : UI_theme_choice,
+        "fit_canvas": fit_canvas_choice,
     }
 
     if Path(server_config_filename).is_file():
@@ -2050,7 +2051,7 @@ def apply_changes( state,
     transformer_quantization = server_config["transformer_quantization"]
     transformer_types = server_config["transformer_types"]
 
-    if all(change in ["attention_mode", "vae_config", "boost", "save_path", "metadata_type", "clear_file_list"] for change in changes ):
+    if all(change in ["attention_mode", "vae_config", "boost", "save_path", "metadata_type", "clear_file_list", "fit_canvas"] for change in changes ):
        model_choice = gr.Dropdown()
    else:
        reload_needed = True
@@ -2413,7 +2414,7 @@ def generate_video(
    file_list = gen["file_list"]
    prompt_no = gen["prompt_no"]
 
-
+    fit_canvas = server_config.get("fit_canvas", 0)
    # if wan_model == None:
    #     gr.Info("Unable to generate a Video while a new configuration is being applied.")
    #     return
@@ -2555,7 +2556,7 @@
    source_video = None
    target_camera = None
    if "recam" in model_filename:
-        source_video = preprocess_video("", width=width, height=height,video_in=video_source, max_frames= video_length, start_frame = 0, fit_canvas= True)
+        source_video = preprocess_video("", width=width, height=height,video_in=video_source, max_frames= video_length, start_frame = 0, fit_canvas= fit_canvas)
        target_camera = model_mode
 
    audio_proj_split = None
@@ -2646,7 +2647,7 @@
            elif diffusion_forcing:
                if video_source != None and len(video_source) > 0 and window_no == 1:
                    keep_frames_video_source= 1000 if len(keep_frames_video_source) ==0 else int(keep_frames_video_source)
-                    prefix_video = preprocess_video(None, width=width, height=height,video_in=video_source, max_frames= keep_frames_video_source , start_frame = 0, fit_canvas= True, target_fps = fps)
+                    prefix_video = preprocess_video(None, width=width, height=height,video_in=video_source, max_frames= keep_frames_video_source , start_frame = 0, fit_canvas= fit_canvas, target_fps = fps)
                    prefix_video = prefix_video .permute(3, 0, 1, 2)
                    prefix_video = prefix_video .float().div_(127.5).sub_(1.) # c, f, h, w
                    prefix_video_frames_count = prefix_video.shape[1]
@@ -2675,13 +2676,13 @@
 
                if preprocess_type != None :
                    send_cmd("progress", progress_args)
-                    video_guide_copy = preprocess_video(preprocess_type, width=width, height=height,video_in=video_guide, max_frames= video_length if window_no == 1 else video_length - reuse_frames, start_frame = guide_start_frame, fit_canvas = True, target_fps = fps)
+                    video_guide_copy = preprocess_video(preprocess_type, width=width, height=height,video_in=video_guide, max_frames= video_length if window_no == 1 else video_length - reuse_frames, start_frame = guide_start_frame, fit_canvas = fit_canvas, target_fps = fps)
                keep_frames_parsed, error = parse_keep_frames_video_guide(keep_frames_video_guide, max_frames_to_generate)
                if len(error) > 0:
                    raise gr.Error(f"invalid keep frames {keep_frames_video_guide}")
                keep_frames_parsed = keep_frames_parsed[guide_start_frame: guide_start_frame + video_length]
                if window_no == 1:
-                    image_size = VACE_SIZE_CONFIGS[resolution_reformated] # default frame dimensions until it is set by video_src (if there is any)
+                    image_size = (height, width) # VACE_SIZE_CONFIGS[resolution_reformated] # default frame dimensions until it is set by video_src (if there is any)
                src_video, src_mask, src_ref_images = wan_model.prepare_source([video_guide_copy],
                    [video_mask_copy ],
                    [image_refs_copy],
@@ -2689,10 +2690,11 @@
                    original_video= "O" in video_prompt_type,
                    keep_frames=keep_frames_parsed,
                    start_frame = guide_start_frame,
-                    pre_src_video = [pre_video_guide]
+                    pre_src_video = [pre_video_guide],
+                    fit_into_canvas = fit_canvas
                    )
-                if window_no == 1 and src_video != None and len(src_video) > 0:
-                    image_size = src_video[0].shape[-2:]
+                # if window_no == 1 and src_video != None and len(src_video) > 0:
+                #     image_size = src_video[0].shape[-2:]
                prompts_max = gen["prompts_max"]
                status = get_latest_status(state)
@@ -2722,6 +2724,7 @@
                    # max_area=MAX_AREA_CONFIGS[resolution_reformated],
                    height = height,
                    width = width,
+                    fit_into_canvas = fit_canvas,
                    shift=flow_shift,
                    sampling_steps=num_inference_steps,
                    guide_scale=guidance_scale,
@@ -2750,6 +2753,7 @@
                    input_video= pre_video_guide,
                    height = height,
                    width = width,
+                    fit_into_canvas = fit_canvas,
                    seed = seed,
                    num_frames = (video_length // 4)* 4 + 1, #377
                    num_inference_steps = num_inference_steps,
@@ -2777,6 +2781,7 @@
                    target_camera= target_camera,
                    frame_num=(video_length // 4)* 4 + 1,
                    size=(width, height),
+                    fit_into_canvas = fit_canvas,
                    shift=flow_shift,
                    sampling_steps=num_inference_steps,
                    guide_scale=guidance_scale,
@@ -4042,39 +4047,35 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
        wizard_prompt_activated_var = gr.Text(wizard_prompt_activated, visible= False)
        wizard_variables_var = gr.Text(wizard_variables, visible = False)
        with gr.Row():
-            if test_class_i2v(model_filename) and False:
-                resolution = gr.Dropdown(
-                    choices=[
-                        # 720p
-                        ("720p (same amount of pixels)", "1280x720"),
-                        ("480p (same amount of pixels)", "832x480"),
-                    ],
-                    value=ui_defaults.get("resolution","480p"),
-                    label="Resolution (video will have the same height / width ratio than the original image)"
-                )
+            if test_class_i2v(model_filename):
+                if server_config.get("fit_canvas", 0) == 1:
+                    label = "Max Resolution (as it maybe less depending on video width / height ratio)"
+                else:
+                    label = "Max Resolution (as it maybe less depending on video width / height ratio)"
            else:
-                resolution = gr.Dropdown(
-                    choices=[
-                        # 720p
-                        ("1280x720 (16:9, 720p)", "1280x720"),
-                        ("720x1280 (9:16, 720p)", "720x1280"),
-                        ("1024x1024 (4:3, 720p)", "1024x024"),
-                        ("832x1104 (3:4, 720p)", "832x1104"),
-                        ("1104x832 (3:4, 720p)", "1104x832"),
-                        ("960x960 (1:1, 720p)", "960x960"),
-                        # 480p
-                        ("960x544 (16:9, 540p)", "960x544"),
-                        ("544x960 (16:9, 540p)", "544x960"),
-                        ("832x480 (16:9, 480p)", "832x480"),
-                        ("480x832 (9:16, 480p)", "480x832"),
-                        ("832x624 (4:3, 480p)", "832x624"),
-                        ("624x832 (3:4, 480p)", "624x832"),
-                        ("720x720 (1:1, 480p)", "720x720"),
-                        ("512x512 (1:1, 480p)", "512x512"),
-                    ],
-                    value=ui_defaults.get("resolution","832x480"),
-                    label="Max Resolution (as it maybe less depending on video width / height ratio)" if test_class_i2v(model_filename) else "Resolution"
-                )
+                label = "Max Resolution (as it maybe less depending on video width / height ratio)"
+            resolution = gr.Dropdown(
+                choices=[
+                    # 720p
+                    ("1280x720 (16:9, 720p)", "1280x720"),
+                    ("720x1280 (9:16, 720p)", "720x1280"),
+                    ("1024x1024 (4:3, 720p)", "1024x024"),
+                    ("832x1104 (3:4, 720p)", "832x1104"),
+                    ("1104x832 (3:4, 720p)", "1104x832"),
+                    ("960x960 (1:1, 720p)", "960x960"),
+                    # 480p
+                    ("960x544 (16:9, 540p)", "960x544"),
+                    ("544x960 (16:9, 540p)", "544x960"),
+                    ("832x480 (16:9, 480p)", "832x480"),
+                    ("480x832 (9:16, 480p)", "480x832"),
+                    ("832x624 (4:3, 480p)", "832x624"),
+                    ("624x832 (3:4, 480p)", "624x832"),
+                    ("720x720 (1:1, 480p)", "720x720"),
+                    ("512x512 (1:1, 480p)", "512x512"),
+                ],
+                value=ui_defaults.get("resolution","832x480"),
+                label= label
+            )
        with gr.Row():
            if recammaster:
                video_length = gr.Slider(5, 193, value=ui_defaults.get("video_length", 81), step=4, label="Number of frames (16 = 1s), locked", interactive= False)
@@ -4556,156 +4557,181 @@ def generate_configuration_tab(state, blocks, header, model_choice):
        with gr.Column():
            model_list = []
 
-            for model_type in model_types:
-                choice = get_model_filename(model_type, transformer_quantization)
-                model_list.append(choice)
-            dropdown_choices = [ ( get_model_name(choice), get_model_type(choice) ) for choice in model_list]
-            transformer_types_choices = gr.Dropdown(
-                choices= dropdown_choices,
-                value= transformer_types,
-                label= "Selectable Wan Transformer Models (keep empty to get All of them)",
-                scale= 2,
-                multiselect= True
-            )
-
-            quantization_choice = gr.Dropdown(
-                choices=[
-                    ("Scaled Int8 Quantization (recommended)", "int8"),
-                    ("16 bits (no quantization)", "bf16"),
-                ],
-                value= transformer_quantization,
-                label="Wan Transformer Model Quantization Type (if available)",
-            )
-
-            mixed_precision_choice = gr.Dropdown(
-                choices=[
-                    ("16 bits only, requires less VRAM", "0"),
-                    ("Mixed 16 / 32 bits, slightly more VRAM needed but better Quality", "1"),
-                ],
-                value= server_config.get("mixed_precision", "0"),
-                label="Transformer Engine Calculation"
-            )
-
-            index = text_encoder_choices.index(text_encoder_filename)
-            index = 0 if index ==0 else index
-            text_encoder_choice = gr.Dropdown(
-                choices=[
-                    ("UMT5 XXL 16 bits - unquantized text encoder, better quality uses more RAM", 0),
-                    ("UMT5 XXL quantized to 8 bits - quantized text encoder, slightly worse quality but uses less RAM", 1),
-                ],
-                value= index,
-                label="Text Encoder model"
-            )
-
-            VAE_precision_choice = gr.Dropdown(
-                choices=[
-                    ("16 bits, requires less VRAM and faster", "16"),
-                    ("32 bits, requires twice more VRAM and slower but recommended with Window Sliding", "32"),
-                ],
-                value= server_config.get("vae_precision", "16"),
-                label="VAE Encoding / Decoding precision"
-            )
-
-            save_path_choice = gr.Textbox(
-                label="Output Folder for Generated Videos",
-                value=server_config.get("save_path", save_path)
-            )
-            def check(mode):
-                if not mode in attention_modes_installed:
-                    return " (NOT INSTALLED)"
-                elif not mode in attention_modes_supported:
-                    return " (NOT SUPPORTED)"
-                else:
-                    return ""
-            attention_choice = gr.Dropdown(
-                choices=[
-                    ("Auto : pick sage2 > sage > sdpa depending on what is installed", "auto"),
-                    ("Scale Dot Product Attention: default, always available", "sdpa"),
-                    ("Flash" + check("flash")+ ": good quality - requires additional install (usually complex to set up on Windows without WSL)", "flash"),
-                    ("Xformers" + check("xformers")+ ": good quality - requires additional install (usually complex, may consume less VRAM to set up on Windows without WSL)", "xformers"),
-                    ("Sage" + check("sage")+ ": 30% faster but slightly worse quality - requires additional install (usually complex to set up on Windows without WSL)", "sage"),
-                    ("Sage2" + check("sage2")+ ": 40% faster but slightly worse quality - requires additional install (usually complex to set up on Windows without WSL)", "sage2"),
-                ],
-                value= attention_mode,
-                label="Attention Type",
-                interactive= not lock_ui_attention
-            )
-            gr.Markdown("Beware: when restarting the server or changing a resolution or video duration, the first step of generation for a duration / resolution may last a few minutes due to recompilation")
-            compile_choice = gr.Dropdown(
-                choices=[
-                    ("ON: works only on Linux / WSL", "transformer"),
-                    ("OFF: no other choice if you have Windows without using WSL", "" ),
-                ],
-                value= compile,
-                label="Compile Transformer (up to 50% faster and 30% more frames but requires Linux / WSL and Flash or Sage attention)",
-                interactive= not lock_ui_compile
-            )
-            vae_config_choice = gr.Dropdown(
-                choices=[
-                    ("Auto", 0),
-                    ("Disabled (faster but may require up to 22 GB of VRAM)", 1),
-                    ("256 x 256 : If at least 8 GB of VRAM", 2),
-                    ("128 x 128 : If at least 6 GB of VRAM", 3),
-                ],
-                value= vae_config,
-                label="VAE Tiling - reduce the high VRAM requirements for VAE decoding and VAE encoding (if enabled it will be slower)"
-            )
-            boost_choice = gr.Dropdown(
-                choices=[
-                    # ("Auto (ON if Video longer than 5s)", 0),
-                    ("ON", 1),
-                    ("OFF", 2),
-                ],
-                value=boost,
-                label="Boost: Give a 10% speed speedup without losing quality at the cost of a litle VRAM (up to 1GB for max frames and resolution)"
-            )
-            profile_choice = gr.Dropdown(
-                choices=[
-                    ("HighRAM_HighVRAM, profile 1: at least 48 GB of RAM and 24 GB of VRAM, the fastest for short videos a RTX 3090 / RTX 4090", 1),
-                    ("HighRAM_LowVRAM, profile 2 (Recommended): at least 48 GB of RAM and 12 GB of VRAM, the most versatile profile with high RAM, better suited for RTX 3070/3080/4070/4080 or for RTX 3090 / RTX 4090 with large pictures batches or long videos", 2),
-                    ("LowRAM_HighVRAM, profile 3: at least 32 GB of RAM and 24 GB of VRAM, adapted for RTX 3090 / RTX 4090 with limited RAM for good speed short video",3),
-                    ("LowRAM_LowVRAM, profile 4 (Default): at least 32 GB of RAM and 12 GB of VRAM, if you have little VRAM or want to generate longer videos",4),
-                    ("VerylowRAM_LowVRAM, profile 5: (Fail safe): at least 16 GB of RAM and 10 GB of VRAM, if you don't have much it won't be fast but maybe it will work",5)
-                ],
-                value= profile,
-                label="Profile (for power users only, not needed to change it)"
-            )
-
-            metadata_choice = gr.Dropdown(
-                choices=[
-                    ("Export JSON files", "json"),
-                    ("Add metadata to video", "metadata"),
-                    ("Neither", "none")
-                ],
-                value=server_config.get("metadata_type", "metadata"),
-                label="Metadata Handling"
-            )
-            preload_model_policy_choice = gr.CheckboxGroup([("Preload Model while Launching the App","P"), ("Preload Model while Switching Model", "S"), ("Unload Model when Queue is Done", "U")],
-                value=server_config.get("preload_model_policy",[]),
-                label="RAM Loading / Unloading Model Policy (in any case VRAM will be freed once the queue has been processed)"
-            )
-
-            clear_file_list_choice = gr.Dropdown(
-                choices=[
-                    ("None", 0),
-                    ("Keep the last video", 1),
-                    ("Keep the last 5 videos", 5),
-                    ("Keep the last 10 videos", 10),
-                    ("Keep the last 20 videos", 20),
-                    ("Keep the last 30 videos", 30),
-                ],
-                value=server_config.get("clear_file_list", 5),
-                label="Keep Previously Generated Videos when starting a Generation Batch"
-            )
-
-            UI_theme_choice = gr.Dropdown(
-                choices=[
-                    ("Blue Sky", "default"),
-                    ("Classic Gradio", "gradio"),
-                ],
-                value=server_config.get("UI_theme_choice", "default"),
-                label="User Interface Theme. You will need to restart the App the see new Theme."
-            )
+            with gr.Tabs():
+                # with gr.Row(visible=advanced_ui) as advanced_row:
+                with gr.Tab("General"):
+                    for model_type in model_types:
+                        choice = get_model_filename(model_type, transformer_quantization)
+                        model_list.append(choice)
+                    dropdown_choices = [ ( get_model_name(choice), get_model_type(choice) ) for choice in model_list]
+                    transformer_types_choices = gr.Dropdown(
+                        choices= dropdown_choices,
+                        value= transformer_types,
+                        label= "Selectable Wan Transformer Models (keep empty to get All of them)",
+                        scale= 2,
+                        multiselect= True
+                    )
+
+                    fit_canvas_choice = gr.Dropdown(
+                        choices=[
+                            ("Dimensions correspond to the Pixels Budget (as the Prompt Image/Video will be resized to match this pixels budget, output video height or width may exceed the requested dimensions )", 0),
+                            ("Dimensions correspond to the Maximum Width and Height (as the Prompt Image/Video will be resized to fit into these dimensions, the output video may be smaller)", 1),
+                        ],
+                        value= server_config.get("fit_canvas", 0),
+                        label="Generated Video Dimensions when Prompt contains an Image or a Video",
+                        interactive= not lock_ui_attention
+                    )
+
+                    def check(mode):
+                        if not mode in attention_modes_installed:
+                            return " (NOT INSTALLED)"
+                        elif not mode in attention_modes_supported:
+                            return " (NOT SUPPORTED)"
+                        else:
+                            return ""
+                    attention_choice = gr.Dropdown(
+                        choices=[
+                            ("Auto : pick sage2 > sage > sdpa depending on what is installed", "auto"),
+                            ("Scale Dot Product Attention: default, always available", "sdpa"),
+                            ("Flash" + check("flash")+ ": good quality - requires additional install (usually complex to set up on Windows without WSL)", "flash"),
+                            ("Xformers" + check("xformers")+ ": good quality - requires additional install (usually complex, may consume less VRAM to set up on Windows without WSL)", "xformers"),
+                            ("Sage" + check("sage")+ ": 30% faster but slightly worse quality - requires additional install (usually complex to set up on Windows without WSL)", "sage"),
+                            ("Sage2" + check("sage2")+ ": 40% faster but slightly worse quality - requires additional install (usually complex to set up on Windows without WSL)", "sage2"),
+                        ],
+                        value= attention_mode,
+                        label="Attention Type",
+                        interactive= not lock_ui_attention
+                    )
+
+                    metadata_choice = gr.Dropdown(
+                        choices=[
+                            ("Export JSON files", "json"),
+                            ("Add metadata to video", "metadata"),
+                            ("Neither", "none")
+                        ],
+                        value=server_config.get("metadata_type", "metadata"),
+                        label="Metadata Handling"
+                    )
+                    preload_model_policy_choice = gr.CheckboxGroup([("Preload Model while Launching the App","P"), ("Preload Model while Switching Model", "S"), ("Unload Model when Queue is Done", "U")],
+                        value=server_config.get("preload_model_policy",[]),
+                        label="RAM Loading / Unloading Model Policy (in any case VRAM will be freed once the queue has been processed)"
+                    )
+
+                    clear_file_list_choice = gr.Dropdown(
+                        choices=[
+                            ("None", 0),
+                            ("Keep the last video", 1),
+                            ("Keep the last 5 videos", 5),
+                            ("Keep the last 10 videos", 10),
+                            ("Keep the last 20 videos", 20),
+                            ("Keep the last 30 videos", 30),
+                        ],
+                        value=server_config.get("clear_file_list", 5),
+                        label="Keep Previously Generated Videos when starting a new Generation Batch"
+                    )
+
+                    UI_theme_choice = gr.Dropdown(
+                        choices=[
+                            ("Blue Sky", "default"),
+                            ("Classic Gradio", "gradio"),
+                        ],
+                        value=server_config.get("UI_theme_choice", "default"),
+                        label="User Interface Theme. You will need to restart the App the see new Theme."
+                    )
+
+                    save_path_choice = gr.Textbox(
+                        label="Output Folder for Generated Videos",
+                        value=server_config.get("save_path", save_path)
+                    )
+
+                with gr.Tab("Performance"):
+
+                    quantization_choice = gr.Dropdown(
+                        choices=[
+                            ("Scaled Int8 Quantization (recommended)", "int8"),
+                            ("16 bits (no quantization)", "bf16"),
+                        ],
+                        value= transformer_quantization,
+                        label="Wan Transformer Model Quantization Type (if available)",
+                    )
+
+                    mixed_precision_choice = gr.Dropdown(
+                        choices=[
+                            ("16 bits only, requires less VRAM", "0"),
+                            ("Mixed 16 / 32 bits, slightly more VRAM needed but better Quality", "1"),
+                        ],
+                        value= server_config.get("mixed_precision", "0"),
+                        label="Transformer Engine Calculation"
+                    )
+
+                    index = text_encoder_choices.index(text_encoder_filename)
+                    index = 0 if index ==0 else index
+                    text_encoder_choice = gr.Dropdown(
+                        choices=[
+                            ("UMT5 XXL 16 bits - unquantized text encoder, better quality uses more RAM", 0),
+                            ("UMT5 XXL quantized to 8 bits - quantized text encoder, slightly worse quality but uses less RAM", 1),
+                        ],
+                        value= index,
+                        label="Text Encoder model"
+                    )
+
+                    VAE_precision_choice = gr.Dropdown(
+                        choices=[
+                            ("16 bits, requires less VRAM and faster", "16"),
+                            ("32 bits, requires twice more VRAM and slower but recommended with Window Sliding", "32"),
+                        ],
+                        value= server_config.get("vae_precision", "16"),
+                        label="VAE Encoding / Decoding precision"
+                    )
+
+                    gr.Text("Beware: when restarting the server or changing a resolution or video duration, the first step of generation for a duration / resolution may last a few minutes due to recompilation", interactive= False, show_label= False )
+                    compile_choice = gr.Dropdown(
+                        choices=[
+                            ("ON: works only on Linux / WSL", "transformer"),
+                            ("OFF: no other choice if you have Windows without using WSL", "" ),
+                        ],
+                        value= compile,
+                        label="Compile Transformer (up to 50% faster and 30% more frames but requires Linux / WSL and Flash or Sage attention)",
+                        interactive= not lock_ui_compile
+                    )
+
+                    vae_config_choice = gr.Dropdown(
+                        choices=[
+                            ("Auto", 0),
+                            ("Disabled (faster but may require up to 22 GB of VRAM)", 1),
+                            ("256 x 256 : If at least 8 GB of VRAM", 2),
+                            ("128 x 128 : If at least 6 GB of VRAM", 3),
+                        ],
+                        value= vae_config,
+                        label="VAE Tiling - reduce the high VRAM requirements for VAE decoding and VAE encoding (if enabled it will be slower)"
+                    )
+
+                    boost_choice = gr.Dropdown(
+                        choices=[
+                            # ("Auto (ON if Video longer than 5s)", 0),
+                            ("ON", 1),
+                            ("OFF", 2),
+                        ],
+                        value=boost,
+                        label="Boost: Give a 10% speedup without losing quality at the cost of a litle VRAM (up to 1GB at max frames and resolution)"
+                    )
+
+                    profile_choice = gr.Dropdown(
+                        choices=[
+                            ("HighRAM_HighVRAM, profile 1: at least 48 GB of RAM and 24 GB of VRAM, the fastest for short videos a RTX 3090 / RTX 4090", 1),
+                            ("HighRAM_LowVRAM, profile 2 (Recommended): at least 48 GB of RAM and 12 GB of VRAM, the most versatile profile with high RAM, better suited for RTX 3070/3080/4070/4080 or for RTX 3090 / RTX 4090 with large pictures batches or long videos", 2),
+                            ("LowRAM_HighVRAM, profile 3: at least 32 GB of RAM and 24 GB of VRAM, adapted for RTX 3090 / RTX 4090 with limited RAM for good speed short video",3),
+                            ("LowRAM_LowVRAM, profile 4 (Default): at least 32 GB of RAM and 12 GB of VRAM, if you have little VRAM or want to generate longer videos",4),
+                            ("VerylowRAM_LowVRAM, profile 5: (Fail safe): at least 16 GB of RAM and 10 GB of VRAM, if you don't have much it won't be fast but maybe it will work",5)
+                        ],
+                        value= profile,
+                        label="Profile (for power users only, not needed to change it)"
+                    )
 
 
        msg = gr.Markdown()
@@ -4728,7 +4754,8 @@ def generate_configuration_tab(state, blocks, header, model_choice):
            boost_choice,
            clear_file_list_choice,
            preload_model_policy_choice,
-            UI_theme_choice
+            UI_theme_choice,
+            fit_canvas_choice
        ],
        outputs= [msg , header, model_choice]
    )
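
All of the UI changes above reduce to one new persisted option, server_config["fit_canvas"]: 0 treats the selected resolution as a pixel budget, 1 treats it as a maximum width/height. generate_video() reads it once and threads it into preprocess_video() and the model generate() calls as fit_canvas / fit_into_canvas instead of the previous hard-coded True. A minimal sketch of that plumbing with stand-in functions (names mirror wgp.py, bodies are placeholders):

    server_config = {"fit_canvas": 0}  # saved from the new dropdown on the "General" tab

    def preprocess_video_stub(**kwargs):
        print("preprocess_video", kwargs)

    def model_generate_stub(**kwargs):
        print("wan_model.generate", kwargs)

    def generate_video(height=480, width=832):
        fit_canvas = server_config.get("fit_canvas", 0)
        # every sizing consumer receives the same flag
        preprocess_video_stub(width=width, height=height, fit_canvas=fit_canvas)
        model_generate_stub(height=height, width=width, fit_into_canvas=fit_canvas)

    generate_video()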