DeepBeepMeep committed
Commit 7e8fb61 · 1 Parent(s): 660ae45
hyvideo/hunyuan.py CHANGED
@@ -20,6 +20,7 @@ from PIL import Image
 import numpy as np
 import torchvision.transforms as transforms
 import cv2
+from wan.utils.utils import resize_lanczos, calculate_new_dimensions
 
 def pad_image(crop_img, size, color=(255, 255, 255), resize_ratio=1):
     crop_h, crop_w = crop_img.shape[:2]
@@ -606,6 +607,7 @@ class HunyuanVideoSampler(Inference):
         VAE_tile_size = None,
         joint_pass = False,
         cfg_star_switch = False,
+        fit_into_canvas = True,
         **kwargs,
     ):
 
@@ -737,12 +739,13 @@ class HunyuanVideoSampler(Inference):
 
             # semantic_images = [Image.open(i2v_image_path).convert('RGB')]
             semantic_images = [image_start.convert('RGB')] #
-
             origin_size = semantic_images[0].size
-
-            crop_size_list = generate_crop_size_list(bucket_hw_base_size, 32)
-            aspect_ratios = np.array([round(float(h)/float(w), 5) for h, w in crop_size_list])
-            closest_size, closest_ratio = get_closest_ratio(origin_size[1], origin_size[0], aspect_ratios, crop_size_list)
+            h, w = origin_size
+            h, w = calculate_new_dimensions(height, width, h, w, fit_into_canvas)
+            closest_size = (w, h)
+            # crop_size_list = generate_crop_size_list(bucket_hw_base_size, 32)
+            # aspect_ratios = np.array([round(float(h)/float(w), 5) for h, w in crop_size_list])
+            # closest_size, closest_ratio = get_closest_ratio(origin_size[1], origin_size[0], aspect_ratios, crop_size_list)
             ref_image_transform = transforms.Compose([
                 transforms.Resize(closest_size),
                 transforms.CenterCrop(closest_size),
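
For orientation, the hunk above drops the bucket-based crop-size search in favour of a single call to calculate_new_dimensions from wan.utils.utils, driven by the new fit_into_canvas argument. That helper is not part of this diff, so the sketch below is only a hypothetical illustration of what a fit-into-canvas computation typically does (scale the source to fit the requested canvas while keeping its aspect ratio, then snap to a model-friendly multiple); it is not the repository's implementation.

# Hypothetical sketch of a fit-into-canvas dimension helper (assumed behaviour,
# not the real wan.utils.utils.calculate_new_dimensions).
def calculate_new_dimensions_sketch(canvas_height, canvas_width, src_height, src_width,
                                     fit_into_canvas=True, multiple=16):
    if fit_into_canvas:
        # Scale so the source fits entirely inside the canvas, keeping its aspect ratio.
        scale = min(canvas_height / src_height, canvas_width / src_width)
    else:
        # Scale so the source roughly matches the canvas area (cropping may follow).
        scale = (canvas_height * canvas_width / (src_height * src_width)) ** 0.5
    new_h = max(multiple, int(src_height * scale) // multiple * multiple)
    new_w = max(multiple, int(src_width * scale) // multiple * multiple)
    return new_h, new_w

# Example: fitting a 1280x720 landscape image into a 1280x720-pixel portrait canvas.
print(calculate_new_dimensions_sketch(1280, 720, 720, 1280))  # (400, 720)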
ltx_video/utils/prompt_enhance_utils.py CHANGED
@@ -125,14 +125,15 @@ def _generate_t2v_prompt(
         )
         for m in messages
     ]
-    model_inputs = prompt_enhancer_tokenizer(texts, return_tensors="pt").to(
-        prompt_enhancer_model.device
-    )
 
-    return _generate_and_decode_prompts(
-        prompt_enhancer_model, prompt_enhancer_tokenizer, model_inputs, max_new_tokens
-    )
+    out_prompts = []
+    for text in texts:
+        model_inputs = prompt_enhancer_tokenizer(text, return_tensors="pt").to(
+            prompt_enhancer_model.device
+        )
+        out_prompts.append(_generate_and_decode_prompts(prompt_enhancer_model, prompt_enhancer_tokenizer, model_inputs, max_new_tokens)[0])
 
+    return out_prompts
 
 def _generate_i2v_prompt(
     image_caption_model,
@@ -201,7 +202,7 @@ def _generate_and_decode_prompts(
 ) -> List[str]:
     with torch.inference_mode():
         outputs = prompt_enhancer_model.generate(
-            **model_inputs, max_new_tokens=max_new_tokens
+            **model_inputs, max_new_tokens=max_new_tokens
         )
         generated_ids = [
             output_ids[len(input_ids) :]
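
The change above stops tokenizing all enhancer prompts in one batched call and instead tokenizes and generates one prompt at a time, collecting the results in out_prompts. A minimal, self-contained sketch of this per-prompt pattern with a Hugging Face causal LM is shown below; the model name and prompt texts are placeholders for illustration, not values used by the repository.

# Minimal sketch of per-prompt generation with a Hugging Face causal LM.
# "gpt2" and the example prompts are placeholders.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

prompts = ["a cat walking on a beach", "a timelapse of a city at night"]
outputs = []
for text in prompts:
    # Encoding one prompt at a time avoids the padding (and pad-token requirement)
    # that a single batched call over variable-length prompts would need.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.inference_mode():
        generated = model.generate(**inputs, max_new_tokens=32)
    # Strip the prompt tokens and keep only the newly generated continuation.
    new_tokens = generated[0][inputs["input_ids"].shape[1]:]
    outputs.append(tokenizer.decode(new_tokens, skip_special_tokens=True))
print(outputs)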
preprocessing/dwpose/util.py CHANGED
@@ -3,8 +3,8 @@
 import math
 import numpy as np
 import matplotlib
-import cv2
-
+import cv2
+matplotlib.use('TkAgg')
 
 eps = 0.01
 
wgp.py CHANGED
@@ -470,6 +470,9 @@ def get_preview_images(inputs):
         else:
             end_image_data = image
             break
+    if start_image_data != None and len(start_image_data) > 1 and end_image_data == None:
+        end_image_data = start_image_data [1:]
+        start_image_data = start_image_data [:1]
     return start_image_data, end_image_data
 
 def add_video_task(**inputs):
@@ -1544,7 +1547,7 @@ def get_dependent_models(model_filename, quantization, dtype_policy ):
         return [get_model_filename("ltxv_13B", quantization, dtype_policy)]
     else:
         return []
-model_types = [ "t2v_1.3B", "vace_1.3B", "fun_inp_1.3B", "t2v", "i2v", "i2v_720p", "vace_14B", "fun_inp", "recam_1.3B", "flf2v_720p", "sky_df_1.3B", "sky_df_14B", "sky_df_720p_14B", "phantom_1.3B", "fantasy", "ltxv_13B", "ltxv_13B_distilled", "hunyuan", "hunyuan_i2v", "hunyuan_custom"]
+model_types = [ "t2v_1.3B", "t2v", "i2v", "i2v_720p", "flf2v_720p", "vace_1.3B","vace_14B", "phantom_1.3B", "fantasy", "fun_inp_1.3B", "fun_inp", "recam_1.3B", "sky_df_1.3B", "sky_df_14B", "sky_df_720p_14B", "ltxv_13B", "ltxv_13B_distilled", "hunyuan", "hunyuan_i2v", "hunyuan_custom"]
 model_signatures = {"t2v": "text2video_14B", "t2v_1.3B" : "text2video_1.3B", "fun_inp_1.3B" : "Fun_InP_1.3B", "fun_inp" : "Fun_InP_14B",
                     "i2v" : "image2video_480p", "i2v_720p" : "image2video_720p" , "vace_1.3B" : "Vace_1.3B", "vace_14B" : "Vace_14B","recam_1.3B": "recammaster_1.3B",
                     "flf2v_720p" : "FLF2V_720p", "sky_df_1.3B" : "sky_reels2_diffusion_forcing_1.3B", "sky_df_14B" : "sky_reels2_diffusion_forcing_14B",
@@ -1583,6 +1586,7 @@ def get_model_name(model_filename, description_container = [""]):
     elif "image" in model_filename:
         model_name = "Wan2.1 image2video"
         model_name += " 720p" if "720p" in model_filename else " 480p"
+        model_name += " 14B"
         if "720p" in model_filename:
             description = "The standard Wan Image 2 Video specialized to generate 720p images. It also offers Start and End Image support (End Image is not supported in the original model but seems to work well)"
         else:
@@ -1594,6 +1598,7 @@ def get_model_name(model_filename, description_container = [""]):
     elif "FLF2V" in model_filename:
         model_name = "Wan2.1 FLF2V"
         model_name += " 720p" if "720p" in model_filename else " 480p"
+        model_name += " 14B"
         description = "The First Last Frame 2 Video model is the official model Image 2 Video model that support Start and End frames."
     elif "sky_reels2_diffusion_forcing" in model_filename:
         model_name = "SkyReels2 Diffusion Forcing"
@@ -1612,19 +1617,19 @@ def get_model_name(model_filename, description_container = [""]):
         model_name += " 14B" if "14B" in model_filename else " 1.3B"
         description = "The Fantasy Speaking model corresponds to the original Wan image 2 video model combined with the Fantasy Speaking extension to process an audio Input."
     elif "ltxv_0.9.7_13B_dev" in model_filename:
-        model_name = "LTX Video 0.9.7"
+        model_name = "LTX Video 0.9.7 13B"
         description = "LTX Video is a fast model that can be used to generate long videos (up to 260 frames).It is recommended to keep the number of steps to 30 or you will need to update the file 'ltxv_video/configs/ltxv-13b-0.9.7-dev.yaml'.The LTX Video model expects very long prompts, so don't hesitate to use the Prompt Enhancer."
     elif "ltxv_0.9.7_13B_distilled" in model_filename:
-        model_name = "LTX Video 0.9.7 Distilled"
+        model_name = "LTX Video 0.9.7 Distilled 13B"
        description = "LTX Video is a fast model that can be used to generate long videos (up to 260 frames).This distilled version is a very fast version and retains a high level of quality. The LTX Video model expects very long prompts, so don't hesitate to use the Prompt Enhancer."
     elif "hunyuan_video_720" in model_filename:
-        model_name = "Hunyuan Video text2video 720p"
+        model_name = "Hunyuan Video text2video 720p 13B"
         description = "Probably the best text 2 video model available."
     elif "hunyuan_video_i2v" in model_filename:
-        model_name = "Hunyuan Video image2video 720p"
+        model_name = "Hunyuan Video image2video 720p 13B"
         description = "A good looking image 2 video model, but not so good in prompt adherence."
     elif "hunyuan_video_custom" in model_filename:
-        model_name = "Hunyuan Video Custom 720p"
+        model_name = "Hunyuan Video Custom 720p 13B"
         description = "The Hunyuan Video Custom model is proably the best model to transfer people (only people for the momment) as it is quite good to keep their identity. However it is slow as to get good results, you need to generate 720p videos with 30 steps."
     else:
         model_name = "Wan2.1 text2video"
@@ -2147,25 +2152,24 @@ def load_models(model_filename):
     else:
         raise Exception(f"Model '{new_transformer_filename}' not supported.")
     wan_model._model_file_name = new_transformer_filename
-    kwargs = { "extraModelsToQuantize": None}
-    if profile == 2 or profile == 4:
-        kwargs["budgets"] = { "transformer" : 100 if preload == 0 else preload, "text_encoder" : 100 if preload == 0 else preload, "*" : max(3000, preload) }
-        # if profile == 4:
-        #     kwargs["partialPinning"] = True
+    kwargs = { "extraModelsToQuantize": None }
+    if profile in (2, 4, 5):
+        kwargs["budgets"] = { "transformer" : 100 if preload == 0 else preload, "text_encoder" : 100 if preload == 0 else preload, "*" : max(1000 if profile==5 else 3000 , preload) }
     elif profile == 3:
         kwargs["budgets"] = { "*" : "70%" }
-
+
     global prompt_enhancer_image_caption_model, prompt_enhancer_image_caption_processor, prompt_enhancer_llm_model, prompt_enhancer_llm_tokenizer
     if server_config.get("enhancer_enabled", 0) == 1:
        from transformers import ( AutoModelForCausalLM, AutoProcessor, AutoTokenizer, LlamaForCausalLM )
         prompt_enhancer_image_caption_model = AutoModelForCausalLM.from_pretrained( "ckpts/Florence2", trust_remote_code=True)
         prompt_enhancer_image_caption_processor = AutoProcessor.from_pretrained( "ckpts/Florence2", trust_remote_code=True)
-        prompt_enhancer_llm_model = offload.fast_load_transformers_model("ckpts/Llama3_2/Llama3_2_quanto_bf16_int8.safetensors")
+        prompt_enhancer_llm_model = offload.fast_load_transformers_model("ckpts/Llama3_2/Llama3_2_quanto_bf16_int8.safetensors") #, configKwargs= {"_attn_implementation" :"XXXsdpa"}
         prompt_enhancer_llm_tokenizer = AutoTokenizer.from_pretrained("ckpts/Llama3_2")
         pipe["prompt_enhancer_image_caption_model"] = prompt_enhancer_image_caption_model
         pipe["prompt_enhancer_llm_model"] = prompt_enhancer_llm_model
         prompt_enhancer_image_caption_model._model_dtype = torch.float
-        kwargs["budgets"]["prompt_enhancer_llm_model"] = 5000
+        if "budgets" in kwargs:
+            kwargs["budgets"]["prompt_enhancer_llm_model"] = 5000
     else:
         prompt_enhancer_image_caption_model = None
         prompt_enhancer_image_caption_processor = None
@@ -2318,7 +2322,7 @@ def apply_changes( state,
         model_choice = generate_dropdown_model_list()
 
     header = generate_header(state["model_filename"], compile=compile, attention_mode= attention_mode)
-    return "<DIV ALIGN=CENTER>The new configuration has been succesfully applied</DIV>", header, model_choice, gr.update(visible= server_config["enhancer_enabled"] == 1)
+    return "<DIV ALIGN=CENTER>The new configuration has been succesfully applied</DIV>", header, model_choice, gr.Row(visible= server_config["enhancer_enabled"] == 1)
 
 
 
@@ -2365,23 +2369,23 @@ def build_callback(state, pipe, send_cmd, status, num_inference_steps):
             step_idx += 1
             if gen.get("abort", False):
                 # pipe._interrupt = True
-                phase = " - Aborting"
+                phase = "Aborting"
             elif step_idx == num_inference_steps:
-                phase = " - VAE Decoding"
+                phase = "VAE Decoding"
             else:
                 if pass_no <=0:
-                    phase = " - Denoising"
+                    phase = "Denoising"
                 elif pass_no == 1:
-                    phase = " - Denoising First Pass"
+                    phase = "Denoising First Pass"
                 elif pass_no == 2:
-                    phase = " - Denoising Second Pass"
+                    phase = "Denoising Second Pass"
                 elif pass_no == 3:
-                    phase = " - Denoising Third Pass"
+                    phase = "Denoising Third Pass"
                 else:
-                    phase = f" - Denoising {pass_no}th Pass"
+                    phase = f"Denoising {pass_no}th Pass"
 
             gen["progress_phase"] = (phase, step_idx)
-            status_msg = status + phase
+            status_msg = merge_status_context(status, phase)
             if step_idx >= 0:
                 progress_args = [(step_idx , num_inference_steps) , status_msg , num_inference_steps]
             else:
@@ -2444,7 +2448,7 @@ def refresh_gallery(state): #, msg
         window_no = len(prompts)
         window_no -= 1
         prompts[window_no]="<B>" + prompts[window_no] + "</B>"
-        prompt = "<BR>".join(prompts)
+        prompt = "<BR><DIV style='height:8px'></DIV>".join(prompts)
         if enhanced:
             prompt = "<U><B>Enhanced:</B></U><BR>" + prompt
 
@@ -2799,7 +2803,7 @@ def generate_video(
     hunyuan_t2v = "hunyuan_video_720" in model_filename
     hunyuan_i2v = "hunyuan_video_i2v" in model_filename
     hunyuan_custom = "hunyuan_video_custom" in model_filename
-    if diffusion_forcing or hunyuan_t2v or hunyuan_i2v:
+    if diffusion_forcing or hunyuan_t2v or hunyuan_i2v or hunyuan_custom:
         fps = 24
     elif audio_guide != None:
         fps = 23
@@ -2810,7 +2814,7 @@ def generate_video(
 
     original_image_refs = image_refs
     if image_refs != None and len(image_refs) > 0 and (hunyuan_custom or phantom or vace):
-        send_cmd("progress", [0, get_latest_status(state) + " - Removing Images References Background"])
+        send_cmd("progress", [0, get_latest_status(state, "Removing Images References Background")])
         os.environ["U2NET_HOME"] = os.path.join(os.getcwd(), "ckpts", "rembg")
         from wan.utils.utils import resize_and_remove_background
         image_refs = resize_and_remove_background(image_refs, width, height, remove_background_image_ref ==1, fit_into_canvas= not vace)
@@ -2840,7 +2844,7 @@ def generate_video(
     source_video = None
     target_camera = None
     if "recam" in model_filename:
-        source_video = preprocess_video("", width=width, height=height,video_in=video_source, max_frames= video_length, start_frame = 0, fit_canvas= fit_canvas)
+        source_video = preprocess_video("", width=width, height=height,video_in=video_source, max_frames= video_length, start_frame = 0, fit_canvas= fit_canvas == 1)
         target_camera = model_mode
 
     audio_proj_split = None
@@ -2909,10 +2913,12 @@ def generate_video(
         guide_start_frame = 0
         video_length = first_window_video_length
         gen["extra_windows"] = 0
+        gen["total_windows"] = 1
+        gen["window_no"] = 1
         start_time = time.time()
         if prompt_enhancer_image_caption_model != None and prompt_enhancer !=None and len(prompt_enhancer)>0:
             text_encoder_max_tokens = 256
-            send_cmd("progress", [0, get_latest_status(state) + " - Enhancing Prompt"])
+            send_cmd("progress", [0, get_latest_status(state, "Enhancing Prompt")])
             from ltx_video.utils.prompt_enhance_utils import generate_cinematic_prompt
             prompt_images = []
             if "I" in prompt_enhancer:
@@ -2976,17 +2982,15 @@ def generate_video(
     if any(process in video_prompt_type for process in ("P", "D", "G")) :
         prompts_max = gen["prompts_max"]
 
-        status = get_latest_status(state)
-
         preprocess_type = None
         if "P" in video_prompt_type :
-            progress_args = [0, status + " - Extracting Open Pose Information"]
+            progress_args = [0, get_latest_status(state,"Extracting Open Pose Information")]
             preprocess_type = "pose"
         elif "D" in video_prompt_type :
-            progress_args = [0, status + " - Extracting Depth Information"]
+            progress_args = [0, get_latest_status(state,"Extracting Depth Information")]
             preprocess_type = "depth"
        elif "G" in video_prompt_type :
-            progress_args = [0, status + " - Extracting Gray Level Information"]
+            progress_args = [0, get_latest_status(state,"Extracting Gray Level Information")]
             preprocess_type = "gray"
 
         if preprocess_type != None :
@@ -3015,9 +3019,9 @@ def generate_video(
 
 
     gen["progress_status"] = status
-    gen["progress_phase"] = (" - Encoding Prompt", -1 )
+    gen["progress_phase"] = ("Encoding Prompt", -1 )
     callback = build_callback(state, trans, send_cmd, status, num_inference_steps)
-    progress_args = [0, status + " - Encoding Prompt"]
+    progress_args = [0, merge_status_context(status, "Encoding Prompt")]
     send_cmd("progress", progress_args)
 
     # samples = torch.empty( (1,2)) #for testing
@@ -3043,7 +3047,7 @@ def generate_video(
                 frame_num=(video_length // 4)* 4 + 1,
                 height = height,
                 width = width,
-                fit_into_canvas = fit_canvas,
+                fit_into_canvas = fit_canvas == 1,
                 shift=flow_shift,
                 sampling_steps=num_inference_steps,
                 guide_scale=guidance_scale,
@@ -3151,14 +3155,8 @@ def generate_video(
                 sample = sample[: , reuse_frames:]
 
             guide_start_frame -= reuse_frames
-            time_flag = datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d-%Hh%Mm%Ss")
-            if os.name == 'nt':
-                file_name = f"{time_flag}_seed{seed}_{sanitize_file_name(prompt[:50]).strip()}.mp4"
-            else:
-                file_name = f"{time_flag}_seed{seed}_{sanitize_file_name(prompt[:100]).strip()}.mp4"
-            video_path = os.path.join(save_path, file_name)
-            exp = 0
 
+            exp = 0
             if len(temporal_upsampling) > 0 or len(spatial_upsampling) > 0:
                 progress_args = [(num_inference_steps , num_inference_steps) , status + " - Upsampling" , num_inference_steps]
                 send_cmd("progress", progress_args)
@@ -3208,6 +3206,14 @@ def generate_video(
                 sample = torch.cat([frames_already_processed, sample], dim=1)
             frames_already_processed = sample
 
+            time_flag = datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d-%Hh%Mm%Ss")
+            save_prompt = original_prompts[0]
+            if os.name == 'nt':
+                file_name = f"{time_flag}_seed{seed}_{sanitize_file_name(save_prompt[:50]).strip()}.mp4"
+            else:
+                file_name = f"{time_flag}_seed{seed}_{sanitize_file_name(save_prompt[:100]).strip()}.mp4"
+            video_path = os.path.join(save_path, file_name)
+
             if audio_guide == None:
                 cache_video( tensor=sample[None], save_file=video_path, fps=output_fps, nrow=1, normalize=True, value_range=(-1, 1))
             else:
@@ -3223,8 +3229,10 @@ def generate_video(
             inputs = get_function_arguments(generate_video, locals())
             inputs.pop("send_cmd")
             inputs.pop("task")
-            inputs["prompt"] = "\n".join(prompts)
             configs = prepare_inputs_dict("metadata", inputs)
+            configs["prompt"] = "\n".join(original_prompts)
+            if prompt_enhancer_image_caption_model != None:
+                configs["enhanced_prompt"] = "\n".join(prompts)
             configs["generation_time"] = round(end_time-start_time)
             metadata_choice = server_config.get("metadata_type","metadata")
             if metadata_choice == "json":
@@ -3474,9 +3482,6 @@ def generate_preview(latents):
     scale = 200 / h
     images= Image.fromarray(images)
     images = images.resize(( int(w*scale),int(h*scale)), resample=Image.Resampling.BILINEAR)
-    if images != None:
-        images.save("prepreview.png")
-
     return images
 
 
@@ -3591,7 +3596,7 @@ def process_tasks(state):
 def get_generation_status(prompt_no, prompts_max, repeat_no, repeat_max, window_no, total_windows):
     if prompts_max == 1:
         if repeat_max == 1:
-            status = "Video"
+            status = ""
         else:
             status = f"Sample {repeat_no}/{repeat_max}"
     else:
@@ -3600,7 +3605,9 @@ def get_generation_status(prompt_no, prompts_max, repeat_no, repeat_max, window_
         else:
             status = f"Prompt {prompt_no}/{prompts_max}, Sample {repeat_no}/{repeat_max}"
     if total_windows > 1:
-        status += f", Sliding Window {window_no}/{total_windows}"
+        if len(status) > 0:
+            status += ", "
+        status += f"Sliding Window {window_no}/{total_windows}"
 
     return status
 
@@ -3611,7 +3618,15 @@ def get_new_refresh_id():
     refresh_id += 1
     return refresh_id
 
-def get_latest_status(state):
+def merge_status_context(status="", context=""):
+    if len(status) == 0:
+        return context
+    elif len(context) == 0:
+        return status
+    else:
+        return status + " - " + context
+
+def get_latest_status(state, context=""):
     gen = get_gen_info(state)
     prompt_no = gen["prompt_no"]
     prompts_max = gen.get("prompts_max",0)
@@ -3622,7 +3637,7 @@ def get_latest_status(state):
     total_windows += gen.get("extra_windows", 0)
     window_no = gen.get("window_no", 0)
     status = get_generation_status(prompt_no, prompts_max, repeat_no, total_generation, window_no, total_windows)
-    return status
+    return merge_status_context(status, context)
 
 def update_status(state):
     gen = get_gen_info(state)
@@ -4354,8 +4369,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
     hunyuan_t2v = "hunyuan_video_720" in model_filename
     hunyuan_i2v = "hunyuan_video_i2v" in model_filename
     hunyuan_video_custom = "hunyuan_video_custom" in model_filename
-
-
+    sliding_window_enabled = vace or diffusion_forcing or ltxv
+    new_line_text = "each new line of prompt will be used for a window" if sliding_window_enabled else "each new line of prompt will generate a new video"
 
     with gr.Column(visible= test_class_i2v(model_filename) or diffusion_forcing or ltxv or recammaster) as image_prompt_column:
         if diffusion_forcing or ltxv:
@@ -4484,7 +4499,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
         default_wizard_prompt, variables, values, errors = extract_wizard_prompt(launch_prompt)
         advanced_prompt = len(errors) > 0
         with gr.Column(visible= advanced_prompt) as prompt_column_advanced:
-            prompt = gr.Textbox( visible= advanced_prompt, label="Prompts (each new line of prompt will generate a new video, # lines = comments, ! lines = macros)", value=launch_prompt, lines=3)
+            prompt = gr.Textbox( visible= advanced_prompt, label="Prompts (" + new_line_text + ", # lines = comments, ! lines = macros)", value=launch_prompt, lines=3)
 
         with gr.Column(visible=not advanced_prompt and len(variables) > 0) as prompt_column_wizard_vars:
             gr.Markdown("<B>Please fill the following input fields to adapt automatically the Prompt:</B>")
@@ -4500,9 +4515,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
         wizard_variables = "\n".join(variables)
         for _ in range( PROMPT_VARS_MAX - len(prompt_vars)):
             prompt_vars.append(gr.Textbox(visible= False, min_width=80, show_label= False))
-
         with gr.Column(not advanced_prompt) as prompt_column_wizard:
-            wizard_prompt = gr.Textbox(visible = not advanced_prompt, label="Prompts (each new line of prompt will generate a new video, # lines = comments)", value=default_wizard_prompt, lines=3)
+            wizard_prompt = gr.Textbox(visible = not advanced_prompt, label="Prompts (" + new_line_text + ", # lines = comments)", value=default_wizard_prompt, lines=3)
             wizard_prompt_activated_var = gr.Text(wizard_prompt_activated, visible= False)
             wizard_variables_var = gr.Text(wizard_variables, visible = False)
         with gr.Row(visible= server_config.get("enhancer_enabled", 0) == 1 ) as prompt_enhancer_row:
@@ -4688,8 +4702,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
                     )
                 with gr.Row():
                     cfg_zero_step = gr.Slider(-1, 39, value=ui_defaults.get("cfg_zero_step",-1), step=1, label="CFG Zero below this Layer (Extra Process)")
-
-            with gr.Tab("Sliding Window", visible= vace or diffusion_forcing or ltxv) as sliding_window_tab:
+            with gr.Tab("Sliding Window", visible= sliding_window_enabled) as sliding_window_tab:
 
                 with gr.Column():
                     gr.Markdown("<B>A Sliding Window allows you to generate video with a duration not limited by the Model</B>")
@@ -4701,7 +4714,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
                         sliding_window_discard_last_frames = gr.Slider(0, 20, value=ui_defaults.get("sliding_window_discard_last_frames", 0), step=4, visible = False)
                     elif ltxv:
                         sliding_window_size = gr.Slider(41, 257, value=ui_defaults.get("sliding_window_size", 129), step=8, label="Sliding Window Size")
-                        sliding_window_overlap = gr.Slider(9, 97, value=ui_defaults.get("sliding_window_overlap",17), step=8, label="Windows Frames Overlap (needed to maintain continuity between windows, a higher value will require more windows)")
+                        sliding_window_overlap = gr.Slider(9, 97, value=ui_defaults.get("sliding_window_overlap",9), step=8, label="Windows Frames Overlap (needed to maintain continuity between windows, a higher value will require more windows)")
                         sliding_window_overlap_noise = gr.Slider(0, 100, value=ui_defaults.get("sliding_window_overlap_noise",20), step=1, label="Noise to be added to overlapped frames to reduce blur effect")
                         sliding_window_discard_last_frames = gr.Slider(0, 20, value=ui_defaults.get("sliding_window_discard_last_frames", 0), step=4, visible = False)
                     else:
@@ -5159,7 +5172,7 @@ def generate_configuration_tab(state, blocks, header, model_choice, prompt_enhan
                 mixed_precision_choice = gr.Dropdown(
                     choices=[
                         ("16 bits only, requires less VRAM", "0"),
-                        ("Mixed 16 / 32 bits, slightly more VRAM needed but better Quality", "1"),
+                        ("Mixed 16 / 32 bits, slightly more VRAM needed but better Quality mainly for 1.3B models", "1"),
                     ],
                     value= server_config.get("mixed_precision", "0"),
                     label="Transformer Engine Calculation"
@@ -5637,7 +5650,7 @@ def create_demo():
     else:
         theme = gr.themes.Soft(font=["Verdana"], primary_hue="sky", neutral_hue="slate", text_size="md")
 
-    with gr.Blocks(css=css, theme=theme, title= "Wan2GP") as main:
+    with gr.Blocks(css=css, theme=theme, title= "WanGP") as main:
         gr.Markdown("<div align=center><H1>Wan<SUP>GP</SUP> v5.1 <FONT SIZE=4>by <I>DeepBeepMeep</I></FONT> <FONT SIZE=3>") # (<A HREF='https://github.com/deepbeepmeep/Wan2GP'>Updates</A>)</FONT SIZE=3></H1></div>")
         global model_list
 
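
To see how the status refactor in wgp.py fits together, here is a standalone copy of the merge_status_context helper added above, with two made-up example calls. With a single prompt and a single sample, get_generation_status now returns an empty string, so only the phase text is shown; otherwise the phase is appended after " - ".

# Standalone copy of the helper introduced in this commit; the example values are illustrative only.
def merge_status_context(status="", context=""):
    if len(status) == 0:
        return context
    elif len(context) == 0:
        return status
    else:
        return status + " - " + context

print(merge_status_context("", "Enhancing Prompt"))                 # -> Enhancing Prompt
print(merge_status_context("Prompt 2/3, Sample 1/1", "Denoising"))  # -> Prompt 2/3, Sample 1/1 - Denoising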