Fabrice-TIERCELIN committed
Commit 4dab183 · verified · 1 Parent(s): 9306ea0

This Pull Request also extends a video & optimizes time & VRAM


This PR extends an existing video, optimizes time & VRAM usage, displays the generation time, lets you choose the resolution, and adds examples. It removes the inpaint feature, which does not work.
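
To drive the extension, the prompt box now accepts several prompts separated by semicolons; the worker encodes them all up front and then consumes one per generated section, keeping the last one once the list runs out. Below is a minimal sketch of that consumption pattern, simplified from the new worker in the diff (the prompt text and section count here are illustrative only):

```python
# Sketch only: mirrors how the worker pops one encoded prompt per section.
prompts = "a man walks;he starts to run;he jumps".split(";")
sections = 5  # stands in for total_latent_sections in app.py

current = None
for section_index in range(sections):
    if prompts:                   # mirrors `if len(prompt_parameters) > 0:`
        current = prompts.pop(0)  # mirrors `prompt_parameters.pop(0)`
    print(f"section {section_index}: conditioning on {current!r}")
```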

Click on _Merge_ to add those features.
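
On the time side, the ZeroGPU duration requested via `@spaces.GPU(duration=...)` is now estimated from the requested video length, TeaCache setting and step count instead of a flat value. A standalone sketch of the heuristic used in the new `get_duration` / `get_duration_video` functions below (the helper name here is illustrative, the constants are copied from the diff):

```python
def estimated_gpu_seconds(total_second_length: float, steps: int,
                          use_teacache: bool, video_input: bool = False) -> float:
    # Roughly one GPU-minute per second of requested video, discounted when
    # TeaCache is enabled and scaled linearly with the step count around 25.
    per_second = 0.9 if use_teacache else (2.3 if video_input else 1.5)
    return total_second_length * 60 * per_second * (1 + (steps - 25) / 100)

# e.g. a 5-second image-to-video run at 25 steps without TeaCache reserves ~450 s
print(estimated_gpu_seconds(5, 25, use_teacache=False))  # 450.0
```

The generation time shown at the end of a run is simply the measured wall-clock time, formatted as hours, minutes and seconds.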

Files changed (1)
  1. app.py +1041 -257
app.py CHANGED
@@ -4,14 +4,29 @@ import os
4
 
5
  os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download')))
6
 
 
7
  import gradio as gr
8
  import torch
9
  import traceback
10
  import einops
11
  import safetensors.torch as sf
12
  import numpy as np
 
 
13
  import math
14
- import spaces
15
 
16
  from PIL import Image
17
  from diffusers import AutoencoderKLHunyuanVideo
@@ -20,128 +35,293 @@ from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode
20
  from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp
21
  from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
22
  from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
23
- from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
 
24
  from diffusers_helper.thread_utils import AsyncStream, async_run
25
  from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
26
  from transformers import SiglipImageProcessor, SiglipVisionModel
27
  from diffusers_helper.clip_vision import hf_clip_vision_encode
28
  from diffusers_helper.bucket_tools import find_nearest_bucket
 
 
29
 
 
30
 
31
- free_mem_gb = get_cuda_free_memory_gb(gpu)
32
- high_vram = free_mem_gb > 60
33
-
34
- print(f'Free VRAM {free_mem_gb} GB')
35
- print(f'High-VRAM Mode: {high_vram}')
36
-
37
- text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu()
38
- text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
39
- tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer')
40
- tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2')
41
- vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu()
42
-
43
- feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor')
44
- image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu()
45
-
46
- transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePack_F1_I2V_HY_20250503', torch_dtype=torch.bfloat16).cpu()
47
-
48
- vae.eval()
49
- text_encoder.eval()
50
- text_encoder_2.eval()
51
- image_encoder.eval()
52
- transformer.eval()
53
-
54
- if not high_vram:
55
- vae.enable_slicing()
56
- vae.enable_tiling()
57
-
58
- transformer.high_quality_fp32_output_for_inference = True
59
- print('transformer.high_quality_fp32_output_for_inference = True')
60
-
61
- transformer.to(dtype=torch.bfloat16)
62
- vae.to(dtype=torch.float16)
63
- image_encoder.to(dtype=torch.float16)
64
- text_encoder.to(dtype=torch.float16)
65
- text_encoder_2.to(dtype=torch.float16)
66
 
67
- vae.requires_grad_(False)
68
- text_encoder.requires_grad_(False)
69
- text_encoder_2.requires_grad_(False)
70
- image_encoder.requires_grad_(False)
71
- transformer.requires_grad_(False)
72
 
73
- if not high_vram:
74
- # DynamicSwapInstaller is same as huggingface's enable_sequential_offload but 3x faster
75
- DynamicSwapInstaller.install_model(transformer, device=gpu)
76
- DynamicSwapInstaller.install_model(text_encoder, device=gpu)
77
- else:
78
- text_encoder.to(gpu)
79
- text_encoder_2.to(gpu)
80
- image_encoder.to(gpu)
81
- vae.to(gpu)
82
- transformer.to(gpu)
83
 
84
  stream = AsyncStream()
85
 
86
  outputs_folder = './outputs/'
87
  os.makedirs(outputs_folder, exist_ok=True)
88
 
89
- examples = [
90
- ["img_examples/1.png", "The girl dances gracefully, with clear movements, full of charm.",],
91
- ["img_examples/2.jpg", "The man dances flamboyantly, swinging his hips and striking bold poses with dramatic flair."],
92
- ["img_examples/3.png", "The woman dances elegantly among the blossoms, spinning slowly with flowing sleeves and graceful hand movements."],
93
- ]
94
-
95
- def generate_examples(input_image, prompt):
96
-
97
- t2v=False
98
- n_prompt=""
99
- seed=31337
100
- total_second_length=5
101
- latent_window_size=9
102
- steps=25
103
- cfg=1.0
104
- gs=10.0
105
- rs=0.0
106
- gpu_memory_preservation=6
107
- use_teacache=True
108
- mp4_crf=16
109
-
110
- global stream
111
-
112
- # assert input_image is not None, 'No input image!'
113
- if t2v:
114
- default_height, default_width = 640, 640
115
- input_image = np.ones((default_height, default_width, 3), dtype=np.uint8) * 255
116
- print("No input image provided. Using a blank white image.")
117
 
118
- yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)
119
-
120
- stream = AsyncStream()
121
-
122
- async_run(worker, input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf)
123
-
124
- output_filename = None
125
 
126
- while True:
127
- flag, data = stream.output_queue.next()
128
 
129
- if flag == 'file':
130
- output_filename = data
131
- yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)
 
132
 
133
- if flag == 'progress':
134
- preview, desc, html = data
135
- yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
 
136
 
137
- if flag == 'end':
138
- yield output_filename, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False)
139
- break
140
 
141
 
142
-
143
- @torch.no_grad()
144
- def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf):
145
  total_latent_sections = (total_second_length * 30) / (latent_window_size * 4)
146
  total_latent_sections = int(max(round(total_latent_sections), 1))
147
 
@@ -164,22 +344,17 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
164
  fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
165
  load_model_as_complete(text_encoder_2, target_device=gpu)
166
 
167
- llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
168
-
169
- if cfg == 1:
170
- llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
171
- else:
172
- llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
173
 
174
- llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
175
- llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
176
 
177
  # Processing input image
178
 
179
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...'))))
180
 
181
  H, W, C = input_image.shape
182
- height, width = find_nearest_bucket(H, W, resolution=640)
183
  input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
184
 
185
  Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
@@ -208,10 +383,6 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
208
 
209
  # Dtype
210
 
211
- llama_vec = llama_vec.to(transformer.dtype)
212
- llama_vec_n = llama_vec_n.to(transformer.dtype)
213
- clip_l_pooler = clip_l_pooler.to(transformer.dtype)
214
- clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
215
  image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
216
 
217
  # Sampling
@@ -226,6 +397,63 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
226
  history_latents = torch.cat([history_latents, start_latent.to(history_latents)], dim=2)
227
  total_generated_latent_frames = 1
228
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  for section_index in range(total_latent_sections):
230
  if stream.input_queue.top() == 'end':
231
  stream.output_queue.push(('end', None))
@@ -233,6 +461,9 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
233
 
234
  print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')
235
 
 
 
 
236
  if not high_vram:
237
  unload_complete_models()
238
  move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
@@ -242,28 +473,6 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
242
  else:
243
  transformer.initialize_teacache(enable_teacache=False)
244
 
245
- def callback(d):
246
- preview = d['denoised']
247
- preview = vae_decode_fake(preview)
248
-
249
- preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
250
- preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
251
-
252
- if stream.input_queue.top() == 'end':
253
- stream.output_queue.push(('end', None))
254
- raise KeyboardInterrupt('User ends the task.')
255
-
256
- current_step = d['i'] + 1
257
- percentage = int(100.0 * current_step / steps)
258
- hint = f'Sampling {current_step}/{steps}'
259
- desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30). The video is being extended now ...'
260
- stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
261
- return
262
-
263
- indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
264
- clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
265
- clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
266
-
267
  clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -sum([16, 2, 1]):, :, :].split([16, 2, 1], dim=2)
268
  clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
269
 
@@ -298,34 +507,7 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
298
  callback=callback,
299
  )
300
 
301
- total_generated_latent_frames += int(generated_latents.shape[2])
302
- history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
303
-
304
- if not high_vram:
305
- offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
306
- load_model_as_complete(vae, target_device=gpu)
307
-
308
- real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
309
-
310
- if history_pixels is None:
311
- history_pixels = vae_decode(real_history_latents, vae).cpu()
312
- else:
313
- section_latent_frames = latent_window_size * 2
314
- overlapped_frames = latent_window_size * 4 - 3
315
-
316
- current_pixels = vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu()
317
- history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames)
318
-
319
- if not high_vram:
320
- unload_complete_models()
321
-
322
- output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
323
-
324
- save_bcthw_as_mp4(history_pixels, output_filename, fps=30, crf=mp4_crf)
325
-
326
- print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')
327
-
328
- stream.output_queue.push(('file', output_filename))
329
  except:
330
  traceback.print_exc()
331
 
@@ -337,62 +519,51 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
337
  stream.output_queue.push(('end', None))
338
  return
339
 
340
- def get_duration(input_image, prompt, t2v, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf):
341
- return total_second_length * 60
342
 
343
  @spaces.GPU(duration=get_duration)
344
- def process(input_image, prompt,
345
- t2v=False,
346
- n_prompt="",
347
- seed=31337,
348
- total_second_length=5,
349
- latent_window_size=9,
350
- steps=25,
351
- cfg=1.0,
352
- gs=10.0,
353
- rs=0.0,
354
- gpu_memory_preservation=6,
355
- use_teacache=True,
 
 
 
356
  mp4_crf=16
357
  ):
 
358
  global stream
359
-
360
  # assert input_image is not None, 'No input image!'
361
- if t2v:
362
  default_height, default_width = 640, 640
363
  input_image = np.ones((default_height, default_width, 3), dtype=np.uint8) * 255
364
  print("No input image provided. Using a blank white image.")
365
- else:
366
- composite_rgba_uint8 = input_image["composite"]
367
 
368
- # rgb_uint8 will be (H, W, 3), dtype uint8
369
- rgb_uint8 = composite_rgba_uint8[:, :, :3]
370
- # mask_uint8 will be (H, W), dtype uint8
371
- mask_uint8 = composite_rgba_uint8[:, :, 3]
372
-
373
- # Create background
374
- h, w = rgb_uint8.shape[:2]
375
- # White background, (H, W, 3), dtype uint8
376
- background_uint8 = np.full((h, w, 3), 255, dtype=np.uint8)
377
-
378
- # Normalize mask to range [0.0, 1.0].
379
- alpha_normalized_float32 = mask_uint8.astype(np.float32) / 255.0
380
-
381
- # Expand alpha to 3 channels to match RGB images for broadcasting.
382
- # alpha_mask_float32 will have shape (H, W, 3)
383
- alpha_mask_float32 = np.stack([alpha_normalized_float32] * 3, axis=2)
384
-
385
- # alpha blending
386
- blended_image_float32 = rgb_uint8.astype(np.float32) * alpha_mask_float32 + \
387
- background_uint8.astype(np.float32) * (1.0 - alpha_mask_float32)
388
-
389
- input_image = np.clip(blended_image_float32, 0, 255).astype(np.uint8)
390
-
391
  yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)
392
 
393
  stream = AsyncStream()
394
 
395
- async_run(worker, input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf)
396
 
397
  output_filename = None
398
 
@@ -408,61 +579,495 @@ def process(input_image, prompt,
408
  yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
409
 
410
  if flag == 'end':
411
- yield output_filename, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False)
412
  break
413
 
414
 
415
  def end_process():
416
  stream.input_queue.push('end')
417
 
 
 
418
 
419
- quick_prompts = [
420
- 'The girl dances gracefully, with clear movements, full of charm.',
421
- 'A character doing some simple body movements.',
422
- ]
423
- quick_prompts = [[x] for x in quick_prompts]
424
 
425
 
426
  css = make_progress_bar_css()
427
- block = gr.Blocks(css=css).queue()
428
  with block:
429
- gr.Markdown('# FramePack-F1')
430
- gr.Markdown(f"""### Video diffusion, but feels like image diffusion
431
- *FramePack F1 - a FramePack model that only predicts future frames from history frames*
432
- ### *beta* FramePack Fill 🖋️- draw a mask over the input image to inpaint the video output
433
- adapted from the officical code repo [FramePack](https://github.com/lllyasviel/FramePack) by [lllyasviel](lllyasviel/FramePack_F1_I2V_HY_20250503) and [FramePack Studio](https://github.com/colinurbs/FramePack-Studio) 🙌🏻
 
 
434
  """)
 
 
435
  with gr.Row():
436
  with gr.Column():
437
- input_image = gr.ImageEditor(type="numpy", label="Image", height=320)
438
- prompt = gr.Textbox(label="Prompt", value='')
439
- t2v = gr.Checkbox(label="do text-to-video", value=False)
440
- example_quick_prompts = gr.Dataset(samples=quick_prompts, label='Quick List', samples_per_page=1000, components=[prompt])
441
- example_quick_prompts.click(lambda x: x[0], inputs=[example_quick_prompts], outputs=prompt, show_progress=False, queue=False)
442
 
443
  with gr.Row():
444
- start_button = gr.Button(value="Start Generation")
445
- end_button = gr.Button(value="End Generation", interactive=False)
446
 
447
- total_second_length = gr.Slider(label="Total Video Length (Seconds)", minimum=1, maximum=5, value=2, step=0.1)
448
- with gr.Group():
449
- with gr.Accordion("Advanced settings", open=False):
450
- use_teacache = gr.Checkbox(label='Use TeaCache', value=True, info='Faster speed, but often makes hands and fingers slightly worse.')
451
-
452
- n_prompt = gr.Textbox(label="Negative Prompt", value="", visible=False) # Not used
453
- seed = gr.Number(label="Seed", value=31337, precision=0)
454
-
455
-
456
- latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, visible=False) # Should not change
457
- steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Changing this value is not recommended.')
458
-
459
- cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, visible=False) # Should not change
460
- gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=10.0, step=0.01, info='Changing this value is not recommended.')
461
- rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01, visible=False) # Should not change
462
-
463
- gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.")
464
-
465
- mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs. ")
466
 
467
  with gr.Column():
468
  preview_image = gr.Image(label="Next Latents", height=200, visible=False)
@@ -470,19 +1075,198 @@ adapted from the officical code repo [FramePack](https://github.com/lllyasviel/F
470
  progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
471
  progress_bar = gr.HTML('', elem_classes='no-generating-animation')
472
 
473
- gr.HTML('<div style="text-align:center; margin-top:20px;">Share your results and find ideas at the <a href="https://x.com/search?q=framepack&f=live" target="_blank">FramePack Twitter (X) thread</a></div>')
474
 
475
- ips = [input_image, prompt, t2v, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf]
476
- start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
477
  end_button.click(fn=end_process)
478
 
479
- # gr.Examples(
480
- # examples,
481
- # inputs=[input_image, prompt],
482
- # outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button],
483
- # fn=generate_examples,
484
- # cache_examples=True
485
- # )
486
-
487
-
488
- block.launch(share=True)
 
4
 
5
  os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download')))
6
 
7
+ import spaces
8
  import gradio as gr
9
  import torch
10
  import traceback
11
  import einops
12
  import safetensors.torch as sf
13
  import numpy as np
14
+ import random
15
+ import time
16
  import math
17
+ # 20250506 pftq: Added for video input loading
18
+ import decord
19
+ # 20250506 pftq: Added for progress bars in video_encode
20
+ from tqdm import tqdm
21
+ # 20250506 pftq: Normalize file paths for Windows compatibility
22
+ import pathlib
23
+ # 20250506 pftq: for easier to read timestamp
24
+ from datetime import datetime
25
+ # 20250508 pftq: for saving prompt to mp4 comments metadata
26
+ import imageio_ffmpeg
27
+ import tempfile
28
+ import shutil
29
+ import subprocess
30
 
31
  from PIL import Image
32
  from diffusers import AutoencoderKLHunyuanVideo
 
35
  from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp
36
  from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
37
  from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
38
+ if torch.cuda.device_count() > 0:
39
+ from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
40
  from diffusers_helper.thread_utils import AsyncStream, async_run
41
  from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
42
  from transformers import SiglipImageProcessor, SiglipVisionModel
43
  from diffusers_helper.clip_vision import hf_clip_vision_encode
44
  from diffusers_helper.bucket_tools import find_nearest_bucket
45
+ from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, HunyuanVideoTransformer3DModel, HunyuanVideoPipeline
46
+ import pillow_heif
47
 
48
+ pillow_heif.register_heif_opener()
49
 
50
+ high_vram = False
51
+ free_mem_gb = 0
52
 
53
+ if torch.cuda.device_count() > 0:
54
+ free_mem_gb = get_cuda_free_memory_gb(gpu)
55
+ high_vram = free_mem_gb > 60
 
 
56
 
57
+ print(f'Free VRAM {free_mem_gb} GB')
58
+ print(f'High-VRAM Mode: {high_vram}')
59
+
60
+ text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu()
61
+ text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
62
+ tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer')
63
+ tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2')
64
+ vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu()
65
+
66
+ feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor')
67
+ image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu()
68
+
69
+ transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePack_F1_I2V_HY_20250503', torch_dtype=torch.bfloat16).cpu()
70
+
71
+ vae.eval()
72
+ text_encoder.eval()
73
+ text_encoder_2.eval()
74
+ image_encoder.eval()
75
+ transformer.eval()
76
+
77
+ if not high_vram:
78
+ vae.enable_slicing()
79
+ vae.enable_tiling()
80
+
81
+ transformer.high_quality_fp32_output_for_inference = True
82
+ print('transformer.high_quality_fp32_output_for_inference = True')
83
+
84
+ transformer.to(dtype=torch.bfloat16)
85
+ vae.to(dtype=torch.float16)
86
+ image_encoder.to(dtype=torch.float16)
87
+ text_encoder.to(dtype=torch.float16)
88
+ text_encoder_2.to(dtype=torch.float16)
89
+
90
+ vae.requires_grad_(False)
91
+ text_encoder.requires_grad_(False)
92
+ text_encoder_2.requires_grad_(False)
93
+ image_encoder.requires_grad_(False)
94
+ transformer.requires_grad_(False)
95
+
96
+ if not high_vram:
97
+ # DynamicSwapInstaller is same as huggingface's enable_sequential_offload but 3x faster
98
+ DynamicSwapInstaller.install_model(transformer, device=gpu)
99
+ DynamicSwapInstaller.install_model(text_encoder, device=gpu)
100
+ else:
101
+ text_encoder.to(gpu)
102
+ text_encoder_2.to(gpu)
103
+ image_encoder.to(gpu)
104
+ vae.to(gpu)
105
+ transformer.to(gpu)
106
 
107
  stream = AsyncStream()
108
 
109
  outputs_folder = './outputs/'
110
  os.makedirs(outputs_folder, exist_ok=True)
111
 
112
+ default_local_storage = {
113
+ "generation-mode": "image",
114
+ }
115
 
116
+ @spaces.GPU()
117
+ @torch.no_grad()
118
+ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, device="cuda", width=None, height=None):
119
+ """
120
+ Encode a video into latent representations using the VAE.
121
+
122
+ Args:
123
+ video_path: Path to the input video file.
124
+ vae: AutoencoderKLHunyuanVideo model.
125
+ height, width: Target resolution for resizing frames.
126
+ vae_batch_size: Number of frames to process per batch.
127
+ device: Device for computation (e.g., "cuda").
128
+
129
+ Returns:
130
+ start_latent: Latent of the first frame (for compatibility with original code).
131
+ input_image_np: First frame as numpy array (for CLIP vision encoding).
132
+ history_latents: Latents of all frames (shape: [1, channels, frames, height//8, width//8]).
133
+ fps: Frames per second of the input video.
134
+ """
135
+ # 20250506 pftq: Normalize video path for Windows compatibility
136
+ video_path = str(pathlib.Path(video_path).resolve())
137
+ print(f"Processing video: {video_path}")
138
+
139
+ # 20250506 pftq: Check CUDA availability and fallback to CPU if needed
140
+ if device == "cuda" and not torch.cuda.is_available():
141
+ print("CUDA is not available, falling back to CPU")
142
+ device = "cpu"
143
 
144
+ try:
145
+ # 20250506 pftq: Load video and get FPS
146
+ print("Initializing VideoReader...")
147
+ vr = decord.VideoReader(video_path)
148
+ fps = vr.get_avg_fps() # Get input video FPS
149
+ num_real_frames = len(vr)
150
+ print(f"Video loaded: {num_real_frames} frames, FPS: {fps}")
151
+
152
+ # Truncate to nearest latent size (multiple of 4)
153
+ latent_size_factor = 4
154
+ num_frames = (num_real_frames // latent_size_factor) * latent_size_factor
155
+ if num_frames != num_real_frames:
156
+ print(f"Truncating video from {num_real_frames} to {num_frames} frames for latent size compatibility")
157
+ num_real_frames = num_frames
158
+
159
+ # 20250506 pftq: Read frames
160
+ print("Reading video frames...")
161
+ frames = vr.get_batch(range(num_real_frames)).asnumpy() # Shape: (num_real_frames, height, width, channels)
162
+ print(f"Frames read: {frames.shape}")
163
+
164
+ # 20250506 pftq: Get native video resolution
165
+ native_height, native_width = frames.shape[1], frames.shape[2]
166
+ print(f"Native video resolution: {native_width}x{native_height}")
167
+
168
+ # 20250506 pftq: Use native resolution if height/width not specified, otherwise use provided values
169
+ target_height = native_height if height is None else height
170
+ target_width = native_width if width is None else width
171
+
172
+ # 20250506 pftq: Adjust to nearest bucket for model compatibility
173
+ if not no_resize:
174
+ target_height, target_width = find_nearest_bucket(target_height, target_width, resolution=resolution)
175
+ print(f"Adjusted resolution: {target_width}x{target_height}")
176
+ else:
177
+ print(f"Using native resolution without resizing: {target_width}x{target_height}")
178
+
179
+ # 20250506 pftq: Preprocess frames to match original image processing
180
+ processed_frames = []
181
+ for i, frame in enumerate(frames):
182
+ #print(f"Preprocessing frame {i+1}/{num_frames}")
183
+ frame_np = resize_and_center_crop(frame, target_width=target_width, target_height=target_height)
184
+ processed_frames.append(frame_np)
185
+ processed_frames = np.stack(processed_frames) # Shape: (num_real_frames, height, width, channels)
186
+ print(f"Frames preprocessed: {processed_frames.shape}")
187
+
188
+ # 20250506 pftq: Save first frame for CLIP vision encoding
189
+ input_image_np = processed_frames[0]
190
+
191
+ # 20250506 pftq: Convert to tensor and normalize to [-1, 1]
192
+ print("Converting frames to tensor...")
193
+ frames_pt = torch.from_numpy(processed_frames).float() / 127.5 - 1
194
+ frames_pt = frames_pt.permute(0, 3, 1, 2) # Shape: (num_real_frames, channels, height, width)
195
+ frames_pt = frames_pt.unsqueeze(0) # Shape: (1, num_real_frames, channels, height, width)
196
+ frames_pt = frames_pt.permute(0, 2, 1, 3, 4) # Shape: (1, channels, num_real_frames, height, width)
197
+ print(f"Tensor shape: {frames_pt.shape}")
198
+
199
+ # 20250507 pftq: Save pixel frames for use in worker
200
+ input_video_pixels = frames_pt.cpu()
201
+
202
+ # 20250506 pftq: Move to device
203
+ print(f"Moving tensor to device: {device}")
204
+ frames_pt = frames_pt.to(device)
205
+ print("Tensor moved to device")
206
+
207
+ # 20250506 pftq: Move VAE to device
208
+ print(f"Moving VAE to device: {device}")
209
+ vae.to(device)
210
+ print("VAE moved to device")
211
+
212
+ # 20250506 pftq: Encode frames in batches
213
+ print(f"Encoding input video frames in VAE batch size {vae_batch_size} (reduce if memory issues here or if forcing video resolution)")
214
+ latents = []
215
+ vae.eval()
216
+ with torch.no_grad():
217
+ for i in tqdm(range(0, frames_pt.shape[2], vae_batch_size), desc="Encoding video frames", mininterval=0.1):
218
+ #print(f"Encoding batch {i//vae_batch_size + 1}: frames {i} to {min(i + vae_batch_size, frames_pt.shape[2])}")
219
+ batch = frames_pt[:, :, i:i + vae_batch_size] # Shape: (1, channels, batch_size, height, width)
220
+ try:
221
+ # 20250506 pftq: Log GPU memory before encoding
222
+ if device == "cuda":
223
+ free_mem = torch.cuda.memory_allocated() / 1024**3
224
+ #print(f"GPU memory before encoding: {free_mem:.2f} GB")
225
+ batch_latent = vae_encode(batch, vae)
226
+ # 20250506 pftq: Synchronize CUDA to catch issues
227
+ if device == "cuda":
228
+ torch.cuda.synchronize()
229
+ #print(f"GPU memory after encoding: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
230
+ latents.append(batch_latent)
231
+ #print(f"Batch encoded, latent shape: {batch_latent.shape}")
232
+ except RuntimeError as e:
233
+ print(f"Error during VAE encoding: {str(e)}")
234
+ if device == "cuda" and "out of memory" in str(e).lower():
235
+ print("CUDA out of memory, try reducing vae_batch_size or using CPU")
236
+ raise
237
+
238
+ # 20250506 pftq: Concatenate latents
239
+ print("Concatenating latents...")
240
+ history_latents = torch.cat(latents, dim=2) # Shape: (1, channels, frames, height//8, width//8)
241
+ print(f"History latents shape: {history_latents.shape}")
242
+
243
+ # 20250506 pftq: Get first frame's latent
244
+ start_latent = history_latents[:, :, :1] # Shape: (1, channels, 1, height//8, width//8)
245
+ print(f"Start latent shape: {start_latent.shape}")
246
+
247
+ # 20250506 pftq: Move VAE back to CPU to free GPU memory
248
+ if device == "cuda":
249
+ vae.to(cpu)
250
+ torch.cuda.empty_cache()
251
+ print("VAE moved back to CPU, CUDA cache cleared")
252
+
253
+ return start_latent, input_image_np, history_latents, fps, target_height, target_width, input_video_pixels
254
+
255
+ except Exception as e:
256
+ print(f"Error in video_encode: {str(e)}")
257
+ raise
258
+
259
+ # 20250508 pftq: for saving prompt to mp4 metadata comments
260
+ def set_mp4_comments_imageio_ffmpeg(input_file, comments):
261
+ try:
262
+ # Get the path to the bundled FFmpeg binary from imageio-ffmpeg
263
+ ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()
264
+
265
+ # Check if input file exists
266
+ if not os.path.exists(input_file):
267
+ print(f"Error: Input file {input_file} does not exist")
268
+ return False
269
+
270
+ # Create a temporary file path
271
+ temp_file = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False).name
272
+
273
+ # FFmpeg command using the bundled binary
274
+ command = [
275
+ ffmpeg_path, # Use imageio-ffmpeg's FFmpeg
276
+ '-i', input_file, # input file
277
+ '-metadata', f'comment={comments}', # set comment metadata
278
+ '-c:v', 'copy', # copy video stream without re-encoding
279
+ '-c:a', 'copy', # copy audio stream without re-encoding
280
+ '-y', # overwrite output file if it exists
281
+ temp_file # temporary output file
282
+ ]
283
+
284
+ # Run the FFmpeg command
285
+ result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
286
+
287
+ if result.returncode == 0:
288
+ # Replace the original file with the modified one
289
+ shutil.move(temp_file, input_file)
290
+ print(f"Successfully added comments to {input_file}")
291
+ return True
292
+ else:
293
+ # Clean up temp file if FFmpeg fails
294
+ if os.path.exists(temp_file):
295
+ os.remove(temp_file)
296
+ print(f"Error: FFmpeg failed with message:\n{result.stderr}")
297
+ return False
298
+
299
+ except Exception as e:
300
+ # Clean up temp file in case of other errors
301
+ if 'temp_file' in locals() and os.path.exists(temp_file):
302
+ os.remove(temp_file)
303
+ print(f"Error saving prompt to video metadata, ffmpeg may be required: "+str(e))
304
+ return False
305
 
306
+ @torch.no_grad()
307
+ def worker(input_image, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf):
308
+ def encode_prompt(prompt, n_prompt):
309
+ llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
310
 
311
+ if cfg == 1:
312
+ llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
313
+ else:
314
+ llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
315
 
316
+ llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
317
+ llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
 
318
 
319
+ llama_vec = llama_vec.to(transformer.dtype)
320
+ llama_vec_n = llama_vec_n.to(transformer.dtype)
321
+ clip_l_pooler = clip_l_pooler.to(transformer.dtype)
322
+ clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
323
+ return [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n]
324
 
 
 
 
325
  total_latent_sections = (total_second_length * 30) / (latent_window_size * 4)
326
  total_latent_sections = int(max(round(total_latent_sections), 1))
327
 
 
344
  fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
345
  load_model_as_complete(text_encoder_2, target_device=gpu)
346
 
347
+ prompt_parameters = []
348
 
349
+ for prompt_part in prompts:
350
+ prompt_parameters.append(encode_prompt(prompt_part, n_prompt))
351
 
352
  # Processing input image
353
 
354
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...'))))
355
 
356
  H, W, C = input_image.shape
357
+ height, width = find_nearest_bucket(H, W, resolution=resolution)
358
  input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
359
 
360
  Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
 
383
 
384
  # Dtype
385
 
 
 
 
 
386
  image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
387
 
388
  # Sampling
 
397
  history_latents = torch.cat([history_latents, start_latent.to(history_latents)], dim=2)
398
  total_generated_latent_frames = 1
399
 
400
+ if enable_preview:
401
+ def callback(d):
402
+ preview = d['denoised']
403
+ preview = vae_decode_fake(preview)
404
+
405
+ preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
406
+ preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
407
+
408
+ if stream.input_queue.top() == 'end':
409
+ stream.output_queue.push(('end', None))
410
+ raise KeyboardInterrupt('User ends the task.')
411
+
412
+ current_step = d['i'] + 1
413
+ percentage = int(100.0 * current_step / steps)
414
+ hint = f'Sampling {current_step}/{steps}'
415
+ desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30), Resolution: {height}px * {width}px. The video is being extended now ...'
416
+ stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
417
+ return
418
+ else:
419
+ def callback(d):
420
+ return
421
+
422
+ indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
423
+ clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
424
+ clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
425
+
426
+ def post_process(generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream):
427
+ total_generated_latent_frames += int(generated_latents.shape[2])
428
+ history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
429
+
430
+ if not high_vram:
431
+ offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
432
+ load_model_as_complete(vae, target_device=gpu)
433
+
434
+ if history_pixels is None:
435
+ real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
436
+ history_pixels = vae_decode(real_history_latents, vae).cpu()
437
+ else:
438
+ section_latent_frames = latent_window_size * 2
439
+ overlapped_frames = latent_window_size * 4 - 3
440
+
441
+ real_history_latents = history_latents[:, :, max(-section_latent_frames, -total_generated_latent_frames):, :, :]
442
+ history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents, vae).cpu(), overlapped_frames)
443
+
444
+ if not high_vram:
445
+ unload_complete_models()
446
+
447
+ if enable_preview or section_index == total_latent_sections - 1:
448
+ output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
449
+
450
+ save_bcthw_as_mp4(history_pixels, output_filename, fps=30, crf=mp4_crf)
451
+
452
+ print(f'Decoded. Current latent shape pixel shape {history_pixels.shape}')
453
+
454
+ stream.output_queue.push(('file', output_filename))
455
+ return [total_generated_latent_frames, history_latents, history_pixels]
456
+
457
  for section_index in range(total_latent_sections):
458
  if stream.input_queue.top() == 'end':
459
  stream.output_queue.push(('end', None))
 
461
 
462
  print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')
463
 
464
+ if len(prompt_parameters) > 0:
465
+ [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters.pop(0)
466
+
467
  if not high_vram:
468
  unload_complete_models()
469
  move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
 
473
  else:
474
  transformer.initialize_teacache(enable_teacache=False)
475
 
 
476
  clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -sum([16, 2, 1]):, :, :].split([16, 2, 1], dim=2)
477
  clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
478
 
 
507
  callback=callback,
508
  )
509
 
510
+ [total_generated_latent_frames, history_latents, history_pixels] = post_process(generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream)
511
  except:
512
  traceback.print_exc()
513
 
 
519
  stream.output_queue.push(('end', None))
520
  return
521
 
522
+ def get_duration(input_image, prompt, generation_mode, n_prompt, randomize_seed, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf):
523
+ return total_second_length * 60 * (0.9 if use_teacache else 1.5) * (1 + ((steps - 25) / 100))
524
 
525
  @spaces.GPU(duration=get_duration)
526
+ def process(input_image, prompt,
527
+ generation_mode="image",
528
+ n_prompt="",
529
+ randomize_seed=True,
530
+ seed=31337,
531
+ resolution=640,
532
+ total_second_length=5,
533
+ latent_window_size=9,
534
+ steps=25,
535
+ cfg=1.0,
536
+ gs=10.0,
537
+ rs=0.0,
538
+ gpu_memory_preservation=6,
539
+ enable_preview=True,
540
+ use_teacache=False,
541
  mp4_crf=16
542
  ):
543
+ start = time.time()
544
  global stream
545
+
546
+ if torch.cuda.device_count() == 0:
547
+ gr.Warning('Set this space to GPU config to make it work.')
548
+ yield gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
549
+ return
550
+
551
+ if randomize_seed:
552
+ seed = random.randint(0, np.iinfo(np.int32).max)
553
+
554
+ prompts = prompt.split(";")
555
+
556
  # assert input_image is not None, 'No input image!'
557
+ if generation_mode == "text":
558
  default_height, default_width = 640, 640
559
  input_image = np.ones((default_height, default_width, 3), dtype=np.uint8) * 255
560
  print("No input image provided. Using a blank white image.")
 
 
561
 
562
  yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)
563
 
564
  stream = AsyncStream()
565
 
566
+ async_run(worker, input_image, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf)
567
 
568
  output_filename = None
569
 
 
579
  yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
580
 
581
  if flag == 'end':
582
+ end = time.time()
583
+ secondes = int(end - start)
584
+ minutes = math.floor(secondes / 60)
585
+ secondes = secondes - (minutes * 60)
586
+ hours = math.floor(minutes / 60)
587
+ minutes = minutes - (hours * 60)
588
+ yield output_filename, gr.update(visible=False), gr.update(), "The video has been generated in " + \
589
+ ((str(hours) + " h, ") if hours != 0 else "") + \
590
+ ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
591
+ str(secondes) + " sec. " + \
592
+ "You can upscale the result with RIFE. To make all your generated scenes consistent, you can then apply a face swap on the main character.", gr.update(interactive=True), gr.update(interactive=False)
593
  break
594
 
595
+ # 20250506 pftq: Modified worker to accept video input and clean frame count
596
+ @spaces.GPU()
597
+ @torch.no_grad()
598
+ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
599
+ def encode_prompt(prompt, n_prompt):
600
+ llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
601
+
602
+ if cfg == 1:
603
+ llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
604
+ else:
605
+ llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
606
+
607
+ llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
608
+ llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
609
+
610
+ llama_vec = llama_vec.to(transformer.dtype)
611
+ llama_vec_n = llama_vec_n.to(transformer.dtype)
612
+ clip_l_pooler = clip_l_pooler.to(transformer.dtype)
613
+ clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
614
+ return [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n]
615
+
616
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
617
+
618
+ try:
619
+ # Clean GPU
620
+ if not high_vram:
621
+ unload_complete_models(
622
+ text_encoder, text_encoder_2, image_encoder, vae, transformer
623
+ )
624
+
625
+ # Text encoding
626
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))
627
+
628
+ if not high_vram:
629
+ fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
630
+ load_model_as_complete(text_encoder_2, target_device=gpu)
631
+
632
+ prompt_parameters = []
633
+
634
+ for prompt_part in prompts:
635
+ prompt_parameters.append(encode_prompt(prompt_part, n_prompt))
636
+
637
+ # 20250506 pftq: Processing input video instead of image
638
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Video processing ...'))))
639
+
640
+ # 20250506 pftq: Encode video
641
+ start_latent, input_image_np, video_latents, fps, height, width, input_video_pixels = video_encode(input_video, resolution, no_resize, vae, vae_batch_size=vae_batch, device=gpu)
642
+
643
+ # CLIP Vision
644
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
645
+
646
+ if not high_vram:
647
+ load_model_as_complete(image_encoder, target_device=gpu)
648
+
649
+ image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
650
+ image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
651
+
652
+ # Dtype
653
+ image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
654
+
655
+ total_latent_sections = (total_second_length * fps) / (latent_window_size * 4)
656
+ total_latent_sections = int(max(round(total_latent_sections), 1))
657
+
658
+ if enable_preview:
659
+ def callback(d):
660
+ preview = d['denoised']
661
+ preview = vae_decode_fake(preview)
662
+
663
+ preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
664
+ preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
665
+
666
+ if stream.input_queue.top() == 'end':
667
+ stream.output_queue.push(('end', None))
668
+ raise KeyboardInterrupt('User ends the task.')
669
+
670
+ current_step = d['i'] + 1
671
+ percentage = int(100.0 * current_step / steps)
672
+ hint = f'Sampling {current_step}/{steps}'
673
+ desc = f'Total frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps) :.2f} seconds (FPS-{fps}), Resolution: {height}px * {width}px, Seed: {seed}, Video {idx+1} of {batch}. The video is generating part {section_index+1} of {total_latent_sections}...'
674
+ stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
675
+ return
676
+ else:
677
+ def callback(d):
678
+ return
679
+
680
+ def compute_latent(history_latents, latent_window_size, num_clean_frames, start_latent):
681
+ # 20250506 pftq: Use user-specified number of context frames, matching original allocation for num_clean_frames=2
682
+ available_frames = history_latents.shape[2] # Number of latent frames
683
+ max_pixel_frames = min(latent_window_size * 4 - 3, available_frames * 4) # Cap at available pixel frames
684
+ adjusted_latent_frames = max(1, (max_pixel_frames + 3) // 4) # Convert back to latent frames
685
+ # Adjust num_clean_frames to match original behavior: num_clean_frames=2 means 1 frame for clean_latents_1x
686
+ effective_clean_frames = max(0, num_clean_frames - 1)
687
+ effective_clean_frames = min(effective_clean_frames, available_frames - 2) if available_frames > 2 else 0 # 20250507 pftq: changed 1 to 2 for edge case for <=1 sec videos
688
+ num_2x_frames = min(2, max(1, available_frames - effective_clean_frames - 1)) if available_frames > effective_clean_frames + 1 else 0 # 20250507 pftq: subtracted 1 for edge case for <=1 sec videos
689
+ num_4x_frames = min(16, max(1, available_frames - effective_clean_frames - num_2x_frames)) if available_frames > effective_clean_frames + num_2x_frames else 0 # 20250507 pftq: Edge case for <=1 sec
690
+
691
+ total_context_frames = num_4x_frames + num_2x_frames + effective_clean_frames
692
+ total_context_frames = min(total_context_frames, available_frames) # 20250507 pftq: Edge case for <=1 sec videos
693
+
694
+ indices = torch.arange(0, sum([1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames])).unsqueeze(0) # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
695
+ clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split(
696
+ [1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames], dim=1 # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
697
+ )
698
+ clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
699
+
700
+ # 20250506 pftq: Split history_latents dynamically based on available frames
701
+ fallback_frame_count = 2 # 20250507 pftq: Changed 0 to 2 Edge case for <=1 sec videos
702
+ context_frames = clean_latents_4x = clean_latents_2x = clean_latents_1x = history_latents[:, :, :fallback_frame_count, :, :]
703
+
704
+ if total_context_frames > 0:
705
+ context_frames = history_latents[:, :, -total_context_frames:, :, :]
706
+ split_sizes = [num_4x_frames, num_2x_frames, effective_clean_frames]
707
+ split_sizes = [s for s in split_sizes if s > 0] # Remove zero sizes
708
+ if split_sizes:
709
+ splits = context_frames.split(split_sizes, dim=2)
710
+ split_idx = 0
711
+
712
+ if num_4x_frames > 0:
713
+ clean_latents_4x = splits[split_idx]
714
+ split_idx = 1
715
+ if clean_latents_4x.shape[2] < 2: # 20250507 pftq: edge case for <=1 sec videos
716
+ print("Edge case for <=1 sec videos 4x")
717
+ clean_latents_4x = clean_latents_4x.expand(-1, -1, 2, -1, -1)
718
+
719
+ if num_2x_frames > 0 and split_idx < len(splits):
720
+ clean_latents_2x = splits[split_idx]
721
+ if clean_latents_2x.shape[2] < 2: # 20250507 pftq: edge case for <=1 sec videos
722
+ print("Edge case for <=1 sec videos 2x")
723
+ clean_latents_2x = clean_latents_2x.expand(-1, -1, 2, -1, -1)
724
+ split_idx += 1
725
+ elif clean_latents_2x.shape[2] < 2: # 20250507 pftq: edge case for <=1 sec videos
726
+ clean_latents_2x = clean_latents_4x
727
+
728
+ if effective_clean_frames > 0 and split_idx < len(splits):
729
+ clean_latents_1x = splits[split_idx]
730
+
731
+ clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
732
+
733
+ # 20250507 pftq: Fix for <=1 sec videos.
734
+ max_frames = min(latent_window_size * 4 - 3, history_latents.shape[2] * 4)
735
+ return [max_frames, clean_latents, clean_latents_2x, clean_latents_4x, latent_indices, clean_latents, clean_latent_indices, clean_latent_2x_indices, clean_latent_4x_indices]
736
+
737
+ for idx in range(batch):
738
+ if batch > 1:
739
+ print(f"Beginning video {idx+1} of {batch} with seed {seed} ")
740
+
741
+ #job_id = generate_timestamp()
742
+ job_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+f"_framepackf1-videoinput_{width}-{total_second_length}sec_seed-{seed}_steps-{steps}_distilled-{gs}_cfg-{cfg}" # 20250506 pftq: easier to read timestamp and filename
743
+
744
+ # Sampling
745
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
746
+
747
+ rnd = torch.Generator("cpu").manual_seed(seed)
748
+
749
+ # 20250506 pftq: Initialize history_latents with video latents
750
+ history_latents = video_latents.cpu()
751
+ total_generated_latent_frames = history_latents.shape[2]
752
+ # 20250506 pftq: Initialize history_pixels to fix UnboundLocalError
753
+ history_pixels = None
754
+ previous_video = None
755
+
756
+ for section_index in range(total_latent_sections):
757
+ if stream.input_queue.top() == 'end':
758
+ stream.output_queue.push(('end', None))
759
+ return
760
+
761
+ print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')
762
+
763
+ if len(prompt_parameters) > 0:
764
+ [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters.pop(0)
765
+
766
+ if not high_vram:
767
+ unload_complete_models()
768
+ move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
769
+
770
+ if use_teacache:
771
+ transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
772
+ else:
773
+ transformer.initialize_teacache(enable_teacache=False)
774
+
775
+ [max_frames, clean_latents, clean_latents_2x, clean_latents_4x, latent_indices, clean_latents, clean_latent_indices, clean_latent_2x_indices, clean_latent_4x_indices] = compute_latent(history_latents, latent_window_size, num_clean_frames, start_latent)
776
+
777
+ generated_latents = sample_hunyuan(
778
+ transformer=transformer,
779
+ sampler='unipc',
780
+ width=width,
781
+ height=height,
782
+ frames=max_frames,
783
+ real_guidance_scale=cfg,
784
+ distilled_guidance_scale=gs,
785
+ guidance_rescale=rs,
786
+ num_inference_steps=steps,
787
+ generator=rnd,
788
+ prompt_embeds=llama_vec,
789
+ prompt_embeds_mask=llama_attention_mask,
790
+ prompt_poolers=clip_l_pooler,
791
+ negative_prompt_embeds=llama_vec_n,
792
+ negative_prompt_embeds_mask=llama_attention_mask_n,
793
+ negative_prompt_poolers=clip_l_pooler_n,
794
+ device=gpu,
795
+ dtype=torch.bfloat16,
796
+ image_embeddings=image_encoder_last_hidden_state,
797
+ latent_indices=latent_indices,
798
+ clean_latents=clean_latents,
799
+ clean_latent_indices=clean_latent_indices,
800
+ clean_latents_2x=clean_latents_2x,
801
+ clean_latent_2x_indices=clean_latent_2x_indices,
802
+ clean_latents_4x=clean_latents_4x,
803
+ clean_latent_4x_indices=clean_latent_4x_indices,
804
+ callback=callback,
805
+ )
806
+
807
+ total_generated_latent_frames += int(generated_latents.shape[2])
808
+ history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
809
+
810
+ if not high_vram:
811
+ offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
812
+ load_model_as_complete(vae, target_device=gpu)
813
+
814
+ real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
815
+
816
+ if history_pixels is None:
817
+ history_pixels = vae_decode(real_history_latents, vae).cpu()
818
+ else:
819
+ section_latent_frames = latent_window_size * 2
820
+ overlapped_frames = min(latent_window_size * 4 - 3, history_pixels.shape[2])
821
+
822
+ history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu(), overlapped_frames)
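+ # The newest section is decoded with section_latent_frames of trailing context and blended
+ # into the existing pixels over `overlapped_frames` frames so the seam between sections stays invisible.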
823
+
824
+ if not high_vram:
825
+ unload_complete_models()
826
+
827
+ if enable_preview or section_index == total_latent_sections - 1:
828
+ output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
829
+
830
+ # 20250506 pftq: Use input video FPS for output
831
+ save_bcthw_as_mp4(history_pixels, output_filename, fps=fps, crf=mp4_crf)
832
+ print(f"Latest video saved: {output_filename}")
833
+ # 20250508 pftq: Save prompt to mp4 metadata comments
834
+ set_mp4_comments_imageio_ffmpeg(output_filename, f"Prompt: {prompts} | Negative Prompt: {n_prompt}")
835
+ print(f"Prompt saved to mp4 metadata comments: {output_filename}")
836
+
837
+ # 20250506 pftq: Clean up previous partial files
838
+ if previous_video is not None and os.path.exists(previous_video):
839
+ try:
840
+ os.remove(previous_video)
841
+ print(f"Previous partial video deleted: {previous_video}")
842
+ except Exception as e:
843
+ print(f"Error deleting previous partial video {previous_video}: {e}")
844
+ previous_video = output_filename
845
+
846
+ print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')
847
+
848
+ stream.output_queue.push(('file', output_filename))
849
+
850
+ seed = (seed + 1) % np.iinfo(np.int32).max
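+ # Each additional video in the batch starts from seed + 1, so results stay reproducible.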
851
+
852
+ except:
853
+ traceback.print_exc()
854
+
855
+ if not high_vram:
856
+ unload_complete_models(
857
+ text_encoder, text_encoder_2, image_encoder, vae, transformer
858
+ )
859
+
860
+ stream.output_queue.push(('end', None))
861
+ return
862
+
863
+ def get_duration_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
864
+ return total_second_length * 60 * (0.9 if use_teacache else 2.3) * (1 + ((steps - 25) / 100))
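+ # Rough ZeroGPU time budget passed to @spaces.GPU below; for example, a 2-second video with
+ # TeaCache at 25 steps reserves about 2 * 60 * 0.9 * (1 + (25 - 25) / 100) = 108 seconds.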
865
+
866
+ # 20250506 pftq: Modified process to pass clean frame count, etc from video_encode
867
+ @spaces.GPU(duration=get_duration_video)
868
+ def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
869
+ start = time.time()
870
+ global stream, high_vram
871
+
872
+ if torch.cuda.device_count() == 0:
873
+ gr.Warning('Set this space to a GPU configuration to make it work.')
874
+ yield gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
875
+ return
876
+
877
+ if randomize_seed:
878
+ seed = random.randint(0, np.iinfo(np.int32).max)
879
+
880
+ prompts = prompt.split(";")
881
+
882
+ # 20250506 pftq: Updated assertion for video input
883
+ assert input_video is not None, 'No input video!'
884
+
885
+ yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)
886
+
887
+ # 20250507 pftq: Even the H100 needs offloading if the video dimensions are 720p or higher
888
+ if high_vram and (no_resize or resolution>640):
889
+ print("Disabling high vram mode due to no resize and/or potentially higher resolution...")
890
+ high_vram = False
891
+ vae.enable_slicing()
892
+ vae.enable_tiling()
893
+ DynamicSwapInstaller.install_model(transformer, device=gpu)
894
+ DynamicSwapInstaller.install_model(text_encoder, device=gpu)
895
+
896
+ # 20250508 pftq: automatically set distilled cfg to 1 if cfg is used
897
+ if cfg > 1:
898
+ gs = 1
899
+
900
+ stream = AsyncStream()
901
+
902
+ # 20250506 pftq: Pass num_clean_frames, vae_batch, etc
903
+ async_run(worker_video, input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)
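+ # worker_video runs in a background thread; it pushes ('progress', ...), ('file', path) and
+ # ('end', None) events onto stream.output_queue, which the loop below consumes and forwards to the UI.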
904
+
905
+ output_filename = None
906
+ desc = ''  # ensure desc is defined even if 'end' arrives before any 'progress' event
907
+ while True:
908
+ flag, data = stream.output_queue.next()
909
+
910
+ if flag == 'file':
911
+ output_filename = data
912
+ yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)
913
+
914
+ if flag == 'progress':
915
+ preview, desc, html = data
916
+ #yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
917
+ yield output_filename, gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True) # 20250506 pftq: Keep refreshing the video in case it got hidden when the tab was in the background
918
+
919
+ if flag == 'end':
920
+ end = time.time()
921
+ secondes = int(end - start)
922
+ minutes = math.floor(secondes / 60)
923
+ secondes = secondes - (minutes * 60)
924
+ hours = math.floor(minutes / 60)
925
+ minutes = minutes - (hours * 60)
926
+ yield output_filename, gr.update(visible=False), desc + \
927
+ " The video has been generated in " + \
928
+ ((str(hours) + " h, ") if hours != 0 else "") + \
929
+ ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
930
+ str(secondes) + " sec. " + \
931
+ " Video complete. You can upscale the result with RIFE. To make all your generated scenes consistent, you can then apply a face swap on the main character.", '', gr.update(interactive=True), gr.update(interactive=False)
932
+ break
933
 
934
  def end_process():
935
  stream.input_queue.push('end')
936
 
937
+ timeless_prompt_value = [""]
938
+ timed_prompts = {}
939
 
940
+ def handle_prompt_number_change():
941
+ timed_prompts.clear()
942
+ return []
943
 
944
+ def handle_timeless_prompt_change(timeless_prompt):
945
+ timeless_prompt_value[0] = timeless_prompt
946
+ return refresh_prompt()
947
+
948
+ def handle_timed_prompt_change(timed_prompt_id, timed_prompt):
949
+ timed_prompts[timed_prompt_id] = timed_prompt
950
+ return refresh_prompt()
951
+
952
+ def refresh_prompt():
953
+ dict_values = {k: v for k, v in timed_prompts.items()}
954
+ sorted_dict_values = sorted(dict_values.items(), key=lambda x: x[0])
955
+ array = []
956
+ for sorted_dict_value in sorted_dict_values:
957
+ if timeless_prompt_value[0] is not None and len(timeless_prompt_value[0]) and sorted_dict_value[1] is not None and len(sorted_dict_value[1]):
958
+ array.append(timeless_prompt_value[0] + ". " + sorted_dict_value[1])
959
+ else:
960
+ array.append(timeless_prompt_value[0] + sorted_dict_value[1])
961
+ print(str(array))
962
+ return ";".join(array)
963
+
964
+ title_html = """
965
+ <h1><center>FramePack</center></h1>
966
+ <big><center>Generate videos from text, image, or video freely, without an account or watermark, and download the result</center></big>
967
+ <br/>
968
+
969
+ <p>This space works on both ZeroGPU and dedicated GPUs and has been tested successfully on ZeroGPU. Please leave a <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/FramePack/discussions/new">message in the discussions</a> if you encounter issues.</p>
970
+ """
971
+
972
+ js = """
973
+ function createGradioAnimation() {
974
+ window.addEventListener("beforeunload", function (e) {
975
+ if (document.getElementById('end-button') && !document.getElementById('end-button').disabled) {
976
+ var confirmationMessage = 'A generation is still running. '
978
+ + 'If you leave now, the video being generated will be lost.';
979
+
980
+ (e || window.event).returnValue = confirmationMessage;
981
+ return confirmationMessage;
982
+ }
982
+ });
983
+ return 'Animation created';
984
+ }
985
+ """
986
 
987
  css = make_progress_bar_css()
988
+ block = gr.Blocks(css=css, js=js).queue()
989
  with block:
990
+ if torch.cuda.device_count() == 0:
991
+ with gr.Row():
992
+ gr.HTML("""
993
+ <p style="background-color: red;"><big><big><big><b>⚠️To use FramePack, <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/FramePack?duplicate=true">duplicate this space</a> and set a GPU with 30 GB VRAM.</b>
994
+
995
+ You can't use FramePack directly here because this space runs on a CPU, which is not enough for FramePack. Please provide <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/FramePack/discussions/new">feedback</a> if you have issues.
996
+ </big></big></big></p>
997
  """)
998
+ gr.HTML(title_html)
999
+ local_storage = gr.BrowserState(default_local_storage)
1000
  with gr.Row():
1001
  with gr.Column():
1002
+ generation_mode = gr.Radio([["Text-to-Video", "text"], ["Image-to-Video", "image"], ["Video Extension", "video"]], elem_id="generation-mode", label="Generation mode", value = "image")
1003
+ text_to_video_hint = gr.HTML("Using the Text-to-Video feature is discouraged: you will save time by first generating an image with Flux and then using Image-to-Video.")
1004
+ input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
1005
+ input_video = gr.Video(sources='upload', label="Input Video", height=320)
1006
+ timeless_prompt = gr.Textbox(label="Timeless prompt", info='Used for the whole duration of the generation', value='', placeholder="The creature starts to move, fast motion, fixed camera, focus motion, consistent arm, consistent position, mute colors, insanely detailed")
1007
+ prompt_number = gr.Slider(label="Timed prompt number", minimum=0, maximum=1000, value=0, step=1, info='Prompts will automatically appear')
1008
+
1009
+ @gr.render(inputs=prompt_number)
1010
+ def show_split(prompt_number):
1011
+ for digit in range(prompt_number):
1012
+ timed_prompt_id = gr.Textbox(value="timed_prompt_" + str(digit), visible=False)
1013
+ timed_prompt = gr.Textbox(label="Timed prompt #" + str(digit + 1), elem_id="timed_prompt_" + str(digit), value="")
1014
+ timed_prompt.change(fn=handle_timed_prompt_change, inputs=[timed_prompt_id, timed_prompt], outputs=[final_prompt])
1015
+
1016
+ final_prompt = gr.Textbox(label="Final prompt", value='', info='Use ; to separate in time')
1017
+ prompt_hint = gr.HTML("Video extension barely follows the prompt; to force it to follow the prompt, set the Distilled CFG Scale to 3.0 and the Context Frames to 2, but the video quality will be poor.")
1018
+ total_second_length = gr.Slider(label="Video Length to Generate (seconds)", minimum=1, maximum=120, value=2, step=0.1)
1019
 
1020
  with gr.Row():
1021
+ start_button = gr.Button(value="🎥 Generate", variant="primary")
1022
+ start_button_video = gr.Button(value="🎥 Generate", variant="primary")
1023
+ end_button = gr.Button(elem_id="end-button", value="End Generation", variant="stop", interactive=False)
1024
+
1025
+ with gr.Accordion("Advanced settings", open=False):
1026
+ enable_preview = gr.Checkbox(label='Enable preview', value=True, info='Display a preview after roughly each generated second, at a cost of about 2 extra seconds per generated second.')
1027
+ use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed and no break in brightness, but often makes hands and fingers slightly worse.')
1028
+
1029
+ n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
1030
+
1031
+ latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, info='Generate more frames at a time (larger chunks). Less degradation and better blending but higher VRAM cost. Should not change.')
1032
+ steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Increase for more quality, especially if using high non-distilled CFG. If your animation has very little motion, you may get abrupt brightness changes; increasing the steps can fix this.')
1033
+
1034
+ with gr.Row():
1035
+ no_resize = gr.Checkbox(label='Force Original Video Resolution (no Resizing)', value=False, info='Might run out of VRAM (720p requires > 24GB VRAM).')
1036
+ resolution = gr.Dropdown([
1037
+ ["409,600 px (working)", 640],
1038
+ ["451,584 px (working)", 672],
1039
+ ["495,616 px (VRAM pb on HF)", 704],
1040
+ ["589,824 px (not tested)", 768],
1041
+ ["692,224 px (not tested)", 832],
1042
+ ["746,496 px (not tested)", 864],
1043
+ ["921,600 px (not tested)", 960]
1044
+ ], value=672, label="Resolution (width x height)", info="Do not affect the generation time")
1045
+
1046
+ # 20250506 pftq: Reduced default distilled guidance scale to improve adherence to input video
1047
+ cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, info='Use this instead of Distilled for more detail/control + Negative Prompt (make sure Distilled set to 1). Doubles render time. Should not change.')
1048
+ gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=10.0, step=0.01, info='Prompt adherence at the cost of fewer details from the input video, but to a lesser extent than Context Frames; 3 = follows the prompt but with blurred, unsharp motion, 10 = focuses on motion; changing this value is not recommended')
1049
+ rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01, info='Should not change')
1050
+
1051
+
1052
+ # 20250506 pftq: Renamed slider to Number of Context Frames and updated description
1053
+ num_clean_frames = gr.Slider(label="Number of Context Frames", minimum=2, maximum=10, value=5, step=1, info="Retain more video details but increase memory use. Reduce to 2 to avoid memory issues or to give more weight to the prompt.")
1054
+
1055
+ default_vae = 32
1056
+ if high_vram:
1057
+ default_vae = 128
1058
+ elif free_mem_gb>=20:
1059
+ default_vae = 64
1060
+
1061
+ vae_batch = gr.Slider(label="VAE Batch Size for Input Video", minimum=4, maximum=256, value=default_vae, step=4, info="Reduce if running out of memory. Increase for better quality frames during fast motion.")
1062
 
1063
+
1064
+ gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.")
1065
+
1066
+ mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs. ")
1067
+ batch = gr.Slider(label="Batch Size (Number of Videos)", minimum=1, maximum=1000, value=1, step=1, info='Generate multiple videos each with a different seed.')
1068
+ with gr.Row():
1069
+ randomize_seed = gr.Checkbox(label='Randomize seed', value=True, info='If checked, the seed is always different')
1070
+ seed = gr.Slider(label="Seed", minimum=0, maximum=np.iinfo(np.int32).max, step=1, randomize=True)
1071
 
1072
  with gr.Column():
1073
  preview_image = gr.Image(label="Next Latents", height=200, visible=False)
 
1075
  progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
1076
  progress_bar = gr.HTML('', elem_classes='no-generating-animation')
1077
 
1078
+ # 20250506 pftq: Updated inputs to include num_clean_frames
1079
+ ips = [input_image, final_prompt, generation_mode, n_prompt, randomize_seed, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf]
1080
+ ips_video = [input_video, final_prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
1081
+
1082
+ gr.Examples(
1083
+ label = "Examples from image",
1084
+ examples = [
1085
+ [
1086
+ "./img_examples/Example1.png", # input_image
1087
+ "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1088
+ "image", # generation_mode
1089
+ "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1090
+ True, # randomize_seed
1091
+ 42, # seed
1092
+ 672, # resolution
1093
+ 1, # total_second_length
1094
+ 9, # latent_window_size
1095
+ 25, # steps
1096
+ 1.0, # cfg
1097
+ 10.0, # gs
1098
+ 0.0, # rs
1099
+ 6, # gpu_memory_preservation
1100
+ False, # enable_preview
1101
+ True, # use_teacache
1102
+ 16 # mp4_crf
1103
+ ],
1104
+ [
1105
+ "./img_examples/Example2.webp", # input_image
1106
+ "A black man on the left and an Asian woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A black man on the left and an Asian woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
1107
+ "image", # generation_mode
1108
+ "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1109
+ True, # randomize_seed
1110
+ 42, # seed
1111
+ 672, # resolution
1112
+ 2, # total_second_length
1113
+ 9, # latent_window_size
1114
+ 25, # steps
1115
+ 1.0, # cfg
1116
+ 10.0, # gs
1117
+ 0.0, # rs
1118
+ 6, # gpu_memory_preservation
1119
+ False, # enable_preview
1120
+ True, # use_teacache
1121
+ 16 # mp4_crf
1122
+ ],
1123
+ [
1124
+ "./img_examples/Example2.webp", # input_image
1125
+ "A black man on the left and an Asian woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens; A black man on the left and an Asian woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens",
1126
+ "image", # generation_mode
1127
+ "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1128
+ True, # randomize_seed
1129
+ 42, # seed
1130
+ 672, # resolution
1131
+ 2, # total_second_length
1132
+ 9, # latent_window_size
1133
+ 25, # steps
1134
+ 1.0, # cfg
1135
+ 10.0, # gs
1136
+ 0.0, # rs
1137
+ 6, # gpu_memory_preservation
1138
+ False, # enable_preview
1139
+ True, # use_teacache
1140
+ 16 # mp4_crf
1141
+ ],
1142
+ [
1143
+ "./img_examples/Example3.jpg", # input_image
1144
+ "A boy is walking to the right, full view, full-length view, cartoon",
1145
+ "image", # generation_mode
1146
+ "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1147
+ True, # randomize_seed
1148
+ 42, # seed
1149
+ 672, # resolution
1150
+ 1, # total_second_length
1151
+ 9, # latent_window_size
1152
+ 25, # steps
1153
+ 1.0, # cfg
1154
+ 10.0, # gs
1155
+ 0.0, # rs
1156
+ 6, # gpu_memory_preservation
1157
+ False, # enable_preview
1158
+ True, # use_teacache
1159
+ 16 # mp4_crf
1160
+ ]
1161
+ ],
1162
+ run_on_click = True,
1163
+ fn = process,
1164
+ inputs = ips,
1165
+ outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button],
1166
+ cache_examples = False,
1167
+ )
1168
+
1169
+ gr.Examples(
1170
+ label = "Examples from video",
1171
+ examples = [
1172
+ [
1173
+ "./img_examples/Example1.mp4", # input_video
1174
+ "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1175
+ "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1176
+ True, # randomize_seed
1177
+ 42, # seed
1178
+ 1, # batch
1179
+ 672, # resolution
1180
+ 1, # total_second_length
1181
+ 9, # latent_window_size
1182
+ 25, # steps
1183
+ 1.0, # cfg
1184
+ 10.0, # gs
1185
+ 0.0, # rs
1186
+ 6, # gpu_memory_preservation
1187
+ False, # enable_preview
1188
+ True, # use_teacache
1189
+ False, # no_resize
1190
+ 16, # mp4_crf
1191
+ 5, # num_clean_frames
1192
+ default_vae
1193
+ ]
1194
+ ],
1195
+ run_on_click = True,
1196
+ fn = process_video,
1197
+ inputs = ips_video,
1198
+ outputs = [result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button],
1199
+ cache_examples = False,
1200
+ )
1201
+
1202
+ def save_preferences(preferences, value):
1203
+ preferences["generation-mode"] = value
1204
+ return preferences
1205
+
1206
+ def load_preferences(saved_prefs):
1207
+ saved_prefs = init_preferences(saved_prefs)
1208
+ return saved_prefs["generation-mode"]
1209
+
1210
+ def init_preferences(saved_prefs):
1211
+ if saved_prefs is None:
1212
+ saved_prefs = default_local_storage
1213
+ return saved_prefs
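+ # Preferences are stored client-side through gr.BrowserState (local_storage above); for now
+ # only the selected generation mode is persisted and restored on page load.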
1214
+
1215
+ def check_parameters(generation_mode, input_image, input_video):
1216
+ if generation_mode == "image" and input_image is None:
1217
+ raise gr.Error("Please provide an image to extend.")
1218
+ if generation_mode == "video" and input_video is None:
1219
+ raise gr.Error("Please provide a video to extend.")
1220
+ return gr.update(interactive=True)
1221
+
1222
+ def handle_generation_mode_change(generation_mode_data):
1223
+ if generation_mode_data == "text":
1224
+ return [gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False)]
1225
+ elif generation_mode_data == "image":
1226
+ return [gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False)]
1227
+ elif generation_mode_data == "video":
1228
+ return [gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True)]
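+ # The ten updates above map, in order, to: text_to_video_hint, input_image, input_video,
+ # start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch and
+ # prompt_hint (the outputs of generation_mode.change and block.load below).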
1229
 
1230
+
1231
+ prompt_number.change(fn=handle_prompt_number_change, inputs=[], outputs=[])
1232
+ timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
1233
+ start_button.click(fn = check_parameters, inputs = [
1234
+ generation_mode, input_image, input_video
1235
+ ], outputs = [end_button], queue = False, show_progress = False).success(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
1236
+ start_button_video.click(fn = check_parameters, inputs = [
1237
+ generation_mode, input_image, input_video
1238
+ ], outputs = [end_button], queue = False, show_progress = False).success(fn=process_video, inputs=ips_video, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button])
1239
  end_button.click(fn=end_process)
1240
 
1241
+ generation_mode.change(fn = save_preferences, inputs = [
1242
+ local_storage,
1243
+ generation_mode,
1244
+ ], outputs = [
1245
+ local_storage
1246
+ ])
1247
+
1248
+ generation_mode.change(
1249
+ fn=handle_generation_mode_change,
1250
+ inputs=[generation_mode],
1251
+ outputs=[text_to_video_hint, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint]
1252
+ )
1253
+
1254
+ # Update display when the page loads
1255
+ block.load(
1256
+ fn=handle_generation_mode_change, inputs = [
1257
+ generation_mode
1258
+ ], outputs = [
1259
+ text_to_video_hint, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint
1260
+ ]
1261
+ )
1262
+
1263
+ # Load saved preferences when the page loads
1264
+ block.load(
1265
+ fn=load_preferences, inputs = [
1266
+ local_storage
1267
+ ], outputs = [
1268
+ generation_mode
1269
+ ]
1270
+ )
1271
+
1272
+ block.launch(mcp_server=True, ssr_mode=False)