DeepBeepMeep committed
Commit · 6d57bc7 · 1 Parent(s): 26a6613
fixed vace bugs
Browse files
- wan/diffusion_forcing.py  +6 -3
- wan/image2video.py  +7 -13
- wan/modules/model.py  +1 -1
- wan/text2video.py  +6 -4
- wan/utils/utils.py  +10 -0
- wan/utils/vace_preprocessor.py  +7 -37
- wgp.py  +225 -198
wan/diffusion_forcing.py
CHANGED
@@ -15,6 +15,7 @@ from .modules.model import WanModel
 from .modules.t5 import T5EncoderModel
 from .modules.vae import WanVAE
 from wan.modules.posemb_layers import get_rotary_pos_embed
+from wan.utils.utils import calculate_new_dimensions
 from .utils.fm_solvers import (FlowDPMSolverMultistepScheduler,
                                get_sampling_sigmas, retrieve_timesteps)
 from .utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
@@ -190,6 +191,7 @@ class DTT2V:
         input_video = None,
         height: int = 480,
         width: int = 832,
+        fit_into_canvas = True,
         num_frames: int = 97,
         num_inference_steps: int = 50,
         shift: float = 1.0,
@@ -221,15 +223,16 @@ class DTT2V:
         i2v_extra_kwrags = {}
         prefix_video = None
         predix_video_latent_length = 0
+
         if input_video != None:
             _ , _ , height, width = input_video.shape
         elif image != None:
             image = image[0]
             frame_width, frame_height = image.size
-
-            height = (int(frame_height * scale) // 16) * 16
-            width = (int(frame_width * scale) // 16) * 16
+            height, width = calculate_new_dimensions(height, width, frame_height, frame_width, fit_into_canvas)
             image = np.array(image.resize((width, height))).transpose(2, 0, 1)
+
+
         latent_length = (num_frames - 1) // 4 + 1
         latent_height = height // 8
         latent_width = width // 8
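Note on the diffusion_forcing.py hunk above: the removed lines scaled the start image with a `scale` value that is not defined anywhere in this hunk, and sizing is now delegated to the shared `calculate_new_dimensions` helper added in wan/utils/utils.py further down in this commit. The argument order is easy to misread, so here is the new call annotated (a sketch for readability, not extra code in the commit):

    # height/width are what the caller asked for (the target canvas or pixel budget);
    # frame_height/frame_width are the dimensions of the start image itself.
    # The helper returns multiples of 16: fitted inside the canvas when
    # fit_into_canvas is truthy, or matched to its pixel budget when falsy.
    height, width = calculate_new_dimensions(height, width, frame_height, frame_width, fit_into_canvas)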
wan/image2video.py
CHANGED
@@ -25,7 +25,7 @@ from .utils.fm_solvers import (FlowDPMSolverMultistepScheduler,
                                get_sampling_sigmas, retrieve_timesteps)
 from .utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
 from wan.modules.posemb_layers import get_rotary_pos_embed
-from wan.utils.utils import resize_lanczos
+from wan.utils.utils import resize_lanczos, calculate_new_dimensions
 
 def optimized_scale(positive_flat, negative_flat):
 
@@ -120,7 +120,7 @@ class WanI2V:
         img2 = None,
         height =720,
         width = 1280,
-
+        fit_into_canvas = True,
         frame_num=81,
         shift=5.0,
         sample_solver='unipc',
@@ -188,22 +188,16 @@ class WanI2V:
         if add_frames_for_end_image:
             frame_num +=1
         lat_frames = int((frame_num - 2) // self.vae_stride[0] + 2)
-
-
+
         h, w = img.shape[1:]
-        # aspect_ratio = h / w
-
-        scale1 = min(height / h, width / w)
-        scale2 = min(height / h, width / w)
-        scale = max(scale1, scale2)
-        new_height = int(h * scale)
-        new_width = int(w * scale)
 
+        h, w = calculate_new_dimensions(height, width, h, w, fit_into_canvas)
+
         lat_h = round(
-            new_height // self.vae_stride[1] //
+            h // self.vae_stride[1] //
             self.patch_size[1] * self.patch_size[1])
         lat_w = round(
-            new_width // self.vae_stride[2] //
+            w // self.vae_stride[2] //
             self.patch_size[2] * self.patch_size[2])
         h = lat_h * self.vae_stride[1]
         w = lat_w * self.vae_stride[2]
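Note on the image2video.py hunk above: the duplicated `scale1`/`scale2` block (both lines computed the same value, so the `max` was a no-op) is replaced by the shared helper, and the resized dimensions are then snapped onto the VAE/patch grid. A self-contained sketch of that sizing path, assuming the usual Wan2.1 values vae_stride = (4, 8, 8) and patch_size = (1, 2, 2), neither of which appears in this hunk:

    from wan.utils.utils import calculate_new_dimensions

    height, width = 480, 832          # requested output size
    h, w = 720, 1280                  # source image size (illustrative)
    h, w = calculate_new_dimensions(height, width, h, w, True)
    lat_h = h // 8 // 2 * 2           # h // vae_stride[1] // patch_size[1] * patch_size[1]
    lat_w = w // 8 // 2 * 2           # w // vae_stride[2] // patch_size[2] * patch_size[2]
    h, w = lat_h * 8, lat_w * 8       # snap the pixel size back onto the latent grid
    print(h, w, lat_h, lat_w)         # 464 832 58 104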
wan/modules/model.py
CHANGED
@@ -963,7 +963,7 @@ class WanModel(ModelMixin, ConfigMixin):
             hints_list = [None ] *len(x_list)
         else:
             # Vace embeddings
-            c = [self.vace_patch_embedding(u.unsqueeze(0)) for u in vace_context]
+            c = [self.vace_patch_embedding(u.to(self.vace_patch_embedding.weight.dtype).unsqueeze(0)) for u in vace_context]
             c = [u.flatten(2).transpose(1, 2) for u in c]
             c = c[0]
 
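Note on the model.py hunk above: this one-line change is likely part of the "fixed vace bugs" in the commit title. The VACE context latents can reach `vace_patch_embedding` in a different floating-point dtype than the module's weights (for example float32 context against bf16 or quantized weights), and PyTorch convolutions reject mixed input/weight dtypes; casting each tensor to the weight dtype first avoids the crash. The same defensive pattern in isolation, with illustrative shapes rather than the real WanModel configuration:

    import torch
    import torch.nn as nn

    patch_embedding = nn.Conv3d(96, 1536, kernel_size=(1, 2, 2), stride=(1, 2, 2))  # illustrative sizes
    u = torch.randn(96, 21, 60, 104, dtype=torch.bfloat16)   # context tensor in a mismatched dtype
    c = patch_embedding(u.to(patch_embedding.weight.dtype).unsqueeze(0))  # cast first, then embed
    print(c.shape)   # torch.Size([1, 1536, 21, 30, 52])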
wan/text2video.py
CHANGED
@@ -177,15 +177,16 @@ class WanT2V:
     def vace_latent(self, z, m):
         return [torch.cat([zz, mm], dim=0) for zz, mm in zip(z, m)]
 
-    def prepare_source(self, src_video, src_mask, src_ref_images, total_frames, image_size, device, original_video = False, keep_frames= [], start_frame = 0, pre_src_video = None):
+    def prepare_source(self, src_video, src_mask, src_ref_images, total_frames, image_size, device, original_video = False, keep_frames= [], start_frame = 0, fit_into_canvas = True, pre_src_video = None):
         image_sizes = []
         trim_video = len(keep_frames)
+        canvas_height, canvas_width = image_size
 
         for i, (sub_src_video, sub_src_mask, sub_pre_src_video) in enumerate(zip(src_video, src_mask,pre_src_video)):
             prepend_count = 0 if sub_pre_src_video == None else sub_pre_src_video.shape[1]
             num_frames = total_frames - prepend_count
             if sub_src_mask is not None and sub_src_video is not None:
-                src_video[i], src_mask[i], _, _, _ = self.vid_proc.load_video_pair(sub_src_video, sub_src_mask, max_frames= num_frames, trim_video = trim_video - prepend_count, start_frame = start_frame)
+                src_video[i], src_mask[i], _, _, _ = self.vid_proc.load_video_pair(sub_src_video, sub_src_mask, max_frames= num_frames, trim_video = trim_video - prepend_count, start_frame = start_frame, canvas_height = canvas_height, canvas_width = canvas_width, fit_into_canvas = fit_into_canvas)
                 # src_video is [-1, 1], 0 = inpainting area (in fact 127 in [0, 255])
                 # src_mask is [-1, 1], 0 = preserve original video (in fact 127 in [0, 255]) and 1 = Inpainting (in fact 255 in [0, 255])
                 src_video[i] = src_video[i].to(device)
@@ -208,7 +209,7 @@ class WanT2V:
                 src_mask[i] = torch.ones_like(src_video[i], device=device)
                 image_sizes.append(image_size)
             else:
-                src_video[i], _, _, _ = self.vid_proc.load_video(sub_src_video, max_frames= num_frames, trim_video = trim_video - prepend_count, start_frame = start_frame)
+                src_video[i], _, _, _ = self.vid_proc.load_video(sub_src_video, max_frames= num_frames, trim_video = trim_video - prepend_count, start_frame = start_frame, canvas_height = canvas_height, canvas_width = canvas_width, fit_into_canvas = fit_into_canvas)
                 src_video[i] = src_video[i].to(device)
                 src_mask[i] = torch.zeros_like(src_video[i], device=device) if original_video else torch.ones_like(src_video[i], device=device)
                 if prepend_count > 0:
@@ -277,6 +278,7 @@ class WanT2V:
         target_camera=None,
         context_scale=1.0,
         size=(1280, 720),
+        fit_into_canvas = True,
         frame_num=81,
         shift=5.0,
         sample_solver='unipc',
@@ -430,7 +432,7 @@ class WanT2V:
             kwargs.update({'cam_emb': cam_emb})
 
         if vace:
-            ref_images_count = len(input_ref_images[0]) if input_ref_images != None else 0
+            ref_images_count = len(input_ref_images[0]) if input_ref_images != None and input_ref_images[0] != None else 0
             kwargs.update({'vace_context' : z, 'vace_context_scale' : context_scale})
             if overlapped_latents > 0:
                 z_reactive = [ zz[0:16, ref_images_count:overlapped_latents + ref_images_count].clone() for zz in z]
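Note on the text2video.py hunks above: `prepare_source` now splits `image_size` into a canvas and forwards it, together with `fit_into_canvas`, to the VACE video loaders, and the `ref_images_count` guard also checks the first entry of `input_ref_images`. The old expression called `len(None)` when the list existed but carried no reference images. The stricter guard in isolation (self-contained, illustrative inputs):

    def count_ref_images(input_ref_images):
        return len(input_ref_images[0]) if input_ref_images != None and input_ref_images[0] != None else 0

    print(count_ref_images(None))                        # 0
    print(count_ref_images([None]))                      # 0 (raised TypeError before this fix)
    print(count_ref_images([["ref1.png", "ref2.png"]]))  # 2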
wan/utils/utils.py
CHANGED
@@ -67,7 +67,17 @@ def remove_background(img, session=None):
     return torch.from_numpy(np.array(img).astype(np.float32) / 255.0).movedim(-1, 0)
 
 
+def calculate_new_dimensions(canvas_height, canvas_width, height, width, fit_into_canvas):
+    if fit_into_canvas:
+        scale1 = min(canvas_height / height, canvas_width / width)
+        scale2 = min(canvas_width / height, canvas_height / width)
+        scale = max(scale1, scale2)
+    else:
+        scale = (canvas_height * canvas_width / (height * width))**(1/2)
 
+    new_height = round( height * scale / 16) * 16
+    new_width = round( width * scale / 16) * 16
+    return new_height, new_width
 
 def resize_and_remove_background(img_list, budget_width, budget_height, rm_background, fit_into_canvas = False ):
     if rm_background:
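The new helper is the single place where both resize policies live: with `fit_into_canvas` set it fits the source inside the canvas (also considering the rotated canvas via `scale2`), otherwise it matches the canvas's pixel budget while keeping the source aspect ratio; either way the result is rounded to multiples of 16. A quick worked example with made-up numbers (the results follow directly from the code above):

    from wan.utils.utils import calculate_new_dimensions

    # 1280x720 source frame, 832x480 requested resolution:
    print(calculate_new_dimensions(480, 832, 720, 1280, fit_into_canvas=True))    # (464, 832): fits inside 480x832
    print(calculate_new_dimensions(480, 832, 720, 1280, fit_into_canvas=False))   # (480, 848): ~480*832 pixels, ratio kept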
wan/utils/vace_preprocessor.py
CHANGED
@@ -5,6 +5,7 @@ from PIL import Image
 import torch
 import torch.nn.functional as F
 import torchvision.transforms.functional as TF
+from .utils import calculate_new_dimensions
 
 
 class VaceImageProcessor(object):
@@ -182,53 +183,22 @@ class VaceVideoProcessor(object):
 
 
 
-    def _get_frameid_bbox_adjust_last(self, fps, video_frames_count, h, w, crop_box, rng, max_frames= 0, start_frame =0):
+    def _get_frameid_bbox_adjust_last(self, fps, video_frames_count, canvas_height, canvas_width, h, w, fit_into_canvas, crop_box, rng, max_frames= 0, start_frame =0):
         from wan.utils.utils import resample
 
         target_fps = self.max_fps
 
-        # video_frames_count = len(frame_timestamps)
-
         frame_ids= resample(fps, video_frames_count, max_frames, target_fps, start_frame )
 
         x1, x2, y1, y2 = [0, w, 0, h] if crop_box is None else crop_box
         h, w = y2 - y1, x2 - x1
-
-        df, dh, dw = self.downsample
-        seq_len = self.seq_len
-        # min/max area of the [latent video]
-        min_area_z = self.min_area / (dh * dw)
-        # max_area_z = min(seq_len, self.max_area / (dh * dw), (h // dh) * (w // dw))
-        max_area_z = min_area_z # workaround bug
-        # sample a frame number of the [latent video]
-        rand_area_z = np.square(np.power(2, rng.uniform(
-            np.log2(np.sqrt(min_area_z)),
-            np.log2(np.sqrt(max_area_z))
-        )))
-
-        seq_len = max_area_z * ((max_frames- start_frame - 1) // df +1)
-
-        # of = min(
-        #     (len(frame_ids) - 1) // df + 1,
-        #     int(seq_len / rand_area_z)
-        # )
-        of = (len(frame_ids) - 1) // df + 1
-
-
-        # deduce target shape of the [latent video]
-        # target_area_z = min(max_area_z, int(seq_len / of))
-        target_area_z = max_area_z
-        oh = round(np.sqrt(target_area_z * ratio))
-        ow = int(target_area_z / oh)
-        of = (of - 1) * df + 1
-        oh *= dh
-        ow *= dw
+        oh, ow = calculate_new_dimensions(canvas_height, canvas_width, h, w, fit_into_canvas)
 
         return frame_ids, (x1, x2, y1, y2), (oh, ow), target_fps
 
-    def _get_frameid_bbox(self, fps, video_frames_count, h, w, crop_box, rng, max_frames= 0, start_frame= 0):
+    def _get_frameid_bbox(self, fps, video_frames_count, h, w, crop_box, rng, max_frames= 0, start_frame= 0, canvas_height = 0, canvas_width = 0, fit_into_canvas= True):
         if self.keep_last:
-            return self._get_frameid_bbox_adjust_last(fps, video_frames_count, h, w, crop_box, rng, max_frames= max_frames, start_frame= start_frame)
+            return self._get_frameid_bbox_adjust_last(fps, video_frames_count, canvas_height, canvas_width, h, w, fit_into_canvas, crop_box, rng, max_frames= max_frames, start_frame= start_frame)
         else:
             return self._get_frameid_bbox_default(fps, video_frames_count, h, w, crop_box, rng, max_frames= max_frames)
 
@@ -238,7 +208,7 @@ class VaceVideoProcessor(object):
     def load_video_pair(self, data_key, data_key2, crop_box=None, seed=2024, **kwargs):
         return self.load_video_batch(data_key, data_key2, crop_box=crop_box, seed=seed, **kwargs)
 
-    def load_video_batch(self, *data_key_batch, crop_box=None, seed=2024, max_frames= 0, trim_video =0, start_frame = 0, **kwargs):
+    def load_video_batch(self, *data_key_batch, crop_box=None, seed=2024, max_frames= 0, trim_video =0, start_frame = 0, canvas_height = 0, canvas_width = 0, fit_into_canvas = False, **kwargs):
         rng = np.random.default_rng(seed + hash(data_key_batch[0]) % 10000)
         # read video
         import decord
@@ -269,7 +239,7 @@ class VaceVideoProcessor(object):
             h, w = src_video.shape[1:3]
         else:
             h, w = readers[0].next().shape[:2]
-        frame_ids, (x1, x2, y1, y2), (oh, ow), fps = self._get_frameid_bbox(fps, length, h, w, crop_box, rng, max_frames=max_frames, start_frame = start_frame )
+        frame_ids, (x1, x2, y1, y2), (oh, ow), fps = self._get_frameid_bbox(fps, length, h, w, crop_box, rng, canvas_height = canvas_height, canvas_width = canvas_width, fit_into_canvas = fit_into_canvas, max_frames=max_frames, start_frame = start_frame )
 
         # preprocess video
         videos = [reader.get_batch(frame_ids)[:, y1:y2, x1:x2, :] for reader in readers]
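Note on the vace_preprocessor.py hunks above: the old area-sampling machinery in `_get_frameid_bbox_adjust_last` (the `min_area_z`/`max_area_z`/`rng.uniform` block, including the `# workaround bug` branch) is dropped entirely; the output size `(oh, ow)` now comes from `calculate_new_dimensions`, driven by the canvas forwarded from `WanT2V.prepare_source`. A hedged usage sketch of the loader with the new keywords (the `vid_proc` instance and the file paths are hypothetical; only the keyword names come from the diff):

    src_video, src_mask, *_ = vid_proc.load_video_pair(
        "guide.mp4", "mask.mp4",              # hypothetical inputs
        max_frames=81,
        start_frame=0,
        canvas_height=480, canvas_width=832,
        fit_into_canvas=True,                 # False: treat 480x832 as a pixel budget instead
    )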
wgp.py
CHANGED
@@ -84,7 +84,6 @@ def format_time(seconds):
     hours = int(seconds // 3600)
     minutes = int((seconds % 3600) // 60)
     return f"{hours}h {minutes}m"
-
 def pil_to_base64_uri(pil_image, format="png", quality=75):
     if pil_image is None:
         return None
@@ -275,12 +274,12 @@ def process_prompt_and_add_tasks(state, model_choice):
     video_guide = inputs["video_guide"]
     video_mask = inputs["video_mask"]
 
-    if "1.3B" in model_filename :
-        resolution_reformated = str(height) + "*" + str(width)
-        if not resolution_reformated in VACE_SIZE_CONFIGS:
-            res = (" and ").join(VACE_SIZE_CONFIGS.keys())
-            gr.Info(f"Video Resolution for Vace model is not supported. Only {res} resolutions are allowed.")
-            return
+    # if "1.3B" in model_filename :
+    #     resolution_reformated = str(height) + "*" + str(width)
+    #     if not resolution_reformated in VACE_SIZE_CONFIGS:
+    #         res = (" and ").join(VACE_SIZE_CONFIGS.keys())
+    #         gr.Info(f"Video Resolution for Vace model is not supported. Only {res} resolutions are allowed.")
+    #     return
     if "I" in video_prompt_type:
         if image_refs == None:
             gr.Info("You must provide at least one Refererence Image")
@@ -1995,7 +1994,8 @@ def apply_changes( state,
                     boost_choice = 1,
                     clear_file_list = 0,
                     preload_model_policy_choice = 1,
-                    UI_theme_choice = "default"
+                    UI_theme_choice = "default",
+                    fit_canvas_choice = 0
 ):
     if args.lock_config:
         return
@@ -2016,7 +2016,8 @@ def apply_changes( state,
         "boost" : boost_choice,
         "clear_file_list" : clear_file_list,
         "preload_model_policy" : preload_model_policy_choice,
-        "UI_theme" : UI_theme_choice
+        "UI_theme" : UI_theme_choice,
+        "fit_canvas": fit_canvas_choice,
     }
 
     if Path(server_config_filename).is_file():
@@ -2050,7 +2051,7 @@ def apply_changes( state,
     transformer_quantization = server_config["transformer_quantization"]
     transformer_types = server_config["transformer_types"]
 
-    if all(change in ["attention_mode", "vae_config", "boost", "save_path", "metadata_type", "clear_file_list"] for change in changes ):
+    if all(change in ["attention_mode", "vae_config", "boost", "save_path", "metadata_type", "clear_file_list", "fit_canvas"] for change in changes ):
        model_choice = gr.Dropdown()
    else:
        reload_needed = True
@@ -2413,7 +2414,7 @@ def generate_video(
    file_list = gen["file_list"]
    prompt_no = gen["prompt_no"]
 
-
+    fit_canvas = server_config.get("fit_canvas", 0)
    # if wan_model == None:
    #     gr.Info("Unable to generate a Video while a new configuration is being applied.")
    #     return
@@ -2555,7 +2556,7 @@ def generate_video(
    source_video = None
    target_camera = None
    if "recam" in model_filename:
-        source_video = preprocess_video("", width=width, height=height,video_in=video_source, max_frames= video_length, start_frame = 0, fit_canvas=
+        source_video = preprocess_video("", width=width, height=height,video_in=video_source, max_frames= video_length, start_frame = 0, fit_canvas= fit_canvas)
        target_camera = model_mode
 
    audio_proj_split = None
@@ -2646,7 +2647,7 @@ def generate_video(
        elif diffusion_forcing:
            if video_source != None and len(video_source) > 0 and window_no == 1:
                keep_frames_video_source= 1000 if len(keep_frames_video_source) ==0 else int(keep_frames_video_source)
-                prefix_video = preprocess_video(None, width=width, height=height,video_in=video_source, max_frames= keep_frames_video_source , start_frame = 0, fit_canvas=
+                prefix_video = preprocess_video(None, width=width, height=height,video_in=video_source, max_frames= keep_frames_video_source , start_frame = 0, fit_canvas= fit_canvas, target_fps = fps)
                prefix_video = prefix_video .permute(3, 0, 1, 2)
                prefix_video = prefix_video .float().div_(127.5).sub_(1.) # c, f, h, w
                prefix_video_frames_count = prefix_video.shape[1]
@@ -2675,13 +2676,13 @@ def generate_video(
 
            if preprocess_type != None :
                send_cmd("progress", progress_args)
-                video_guide_copy = preprocess_video(preprocess_type, width=width, height=height,video_in=video_guide, max_frames= video_length if window_no == 1 else video_length - reuse_frames, start_frame = guide_start_frame, fit_canvas =
+                video_guide_copy = preprocess_video(preprocess_type, width=width, height=height,video_in=video_guide, max_frames= video_length if window_no == 1 else video_length - reuse_frames, start_frame = guide_start_frame, fit_canvas = fit_canvas, target_fps = fps)
            keep_frames_parsed, error = parse_keep_frames_video_guide(keep_frames_video_guide, max_frames_to_generate)
            if len(error) > 0:
                raise gr.Error(f"invalid keep frames {keep_frames_video_guide}")
            keep_frames_parsed = keep_frames_parsed[guide_start_frame: guide_start_frame + video_length]
            if window_no == 1:
-                image_size = VACE_SIZE_CONFIGS[resolution_reformated] # default frame dimensions until it is set by video_src (if there is any)
+                image_size = (height, width) # VACE_SIZE_CONFIGS[resolution_reformated] # default frame dimensions until it is set by video_src (if there is any)
            src_video, src_mask, src_ref_images = wan_model.prepare_source([video_guide_copy],
                                                            [video_mask_copy ],
                                                            [image_refs_copy],
@@ -2689,10 +2690,11 @@ def generate_video(
                                                            original_video= "O" in video_prompt_type,
                                                            keep_frames=keep_frames_parsed,
                                                            start_frame = guide_start_frame,
-                                                            pre_src_video = [pre_video_guide]
+                                                            pre_src_video = [pre_video_guide],
+                                                            fit_into_canvas = fit_canvas
                                                            )
-            if window_no == 1 and src_video != None and len(src_video) > 0:
-                image_size = src_video[0].shape[-2:]
+            # if window_no == 1 and src_video != None and len(src_video) > 0:
+            #     image_size = src_video[0].shape[-2:]
            prompts_max = gen["prompts_max"]
            status = get_latest_status(state)
 
@@ -2722,6 +2724,7 @@ def generate_video(
                # max_area=MAX_AREA_CONFIGS[resolution_reformated],
                height = height,
                width = width,
+                fit_into_canvas = fit_canvas,
                shift=flow_shift,
                sampling_steps=num_inference_steps,
                guide_scale=guidance_scale,
@@ -2750,6 +2753,7 @@ def generate_video(
                input_video= pre_video_guide,
                height = height,
                width = width,
+                fit_into_canvas = fit_canvas,
                seed = seed,
                num_frames = (video_length // 4)* 4 + 1, #377
                num_inference_steps = num_inference_steps,
@@ -2777,6 +2781,7 @@ def generate_video(
                target_camera= target_camera,
                frame_num=(video_length // 4)* 4 + 1,
                size=(width, height),
+                fit_into_canvas = fit_canvas,
                shift=flow_shift,
                sampling_steps=num_inference_steps,
                guide_scale=guidance_scale,
@@ -4042,39 +4047,35 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = None
        wizard_prompt_activated_var = gr.Text(wizard_prompt_activated, visible= False)
        wizard_variables_var = gr.Text(wizard_variables, visible = False)
        with gr.Row():
-            if test_class_i2v(model_filename)
-                    ("480p (same amount of pixels)", "832x480"),
-                ],
-                value=ui_defaults.get("resolution","480p"),
-                label="Resolution (video will have the same height / width ratio than the original image)"
-            )
+            if test_class_i2v(model_filename):
+                if server_config.get("fit_canvas", 0) == 1:
+                    label = "Max Resolution (as it maybe less depending on video width / height ratio)"
+                else:
+                    label = "Max Resolution (as it maybe less depending on video width / height ratio)"
            else:
+                label = "Max Resolution (as it maybe less depending on video width / height ratio)"
+            resolution = gr.Dropdown(
+                choices=[
+                    # 720p
+                    ("1280x720 (16:9, 720p)", "1280x720"),
+                    ("720x1280 (9:16, 720p)", "720x1280"),
+                    ("1024x1024 (4:3, 720p)", "1024x024"),
+                    ("832x1104 (3:4, 720p)", "832x1104"),
+                    ("1104x832 (3:4, 720p)", "1104x832"),
+                    ("960x960 (1:1, 720p)", "960x960"),
+                    # 480p
+                    ("960x544 (16:9, 540p)", "960x544"),
+                    ("544x960 (16:9, 540p)", "544x960"),
+                    ("832x480 (16:9, 480p)", "832x480"),
+                    ("480x832 (9:16, 480p)", "480x832"),
+                    ("832x624 (4:3, 480p)", "832x624"),
+                    ("624x832 (3:4, 480p)", "624x832"),
+                    ("720x720 (1:1, 480p)", "720x720"),
+                    ("512x512 (1:1, 480p)", "512x512"),
+                ],
+                value=ui_defaults.get("resolution","832x480"),
+                label= label
+            )
        with gr.Row():
            if recammaster:
                video_length = gr.Slider(5, 193, value=ui_defaults.get("video_length", 81), step=4, label="Number of frames (16 = 1s), locked", interactive= False)
@@ -4556,156 +4557,181 @@ def generate_configuration_tab(state, blocks, header, model_choice):
        with gr.Column():
            model_list = []
 
-            for model_type in model_types:
-                choice = get_model_filename(model_type, transformer_quantization)
-                model_list.append(choice)
-            dropdown_choices = [ ( get_model_name(choice), get_model_type(choice) ) for choice in model_list]
-            transformer_types_choices = gr.Dropdown(
-                choices= dropdown_choices,
-                value= transformer_types,
-                label= "Selectable Wan Transformer Models (keep empty to get All of them)",
-                scale= 2,
-                multiselect= True
-            )
-
-            clear_file_list_choice = gr.Dropdown(
-                choices=[
-                    ("None", 0),
-                    ("Keep the last video", 1),
-                    ("Keep the last 5 videos", 5),
-                    ("Keep the last 10 videos", 10),
-                    ("Keep the last 20 videos", 20),
-                    ("Keep the last 30 videos", 30),
-                ],
-                value=server_config.get("clear_file_list", 5),
-                label="Keep Previously Generated Videos when starting a Generation Batch"
-            )
-
-            UI_theme_choice = gr.Dropdown(
-                choices=[
-                    ("Blue Sky", "default"),
-                    ("Classic Gradio", "gradio"),
-                ],
-                value=server_config.get("UI_theme_choice", "default"),
-                label="User Interface Theme. You will need to restart the App the see new Theme."
-            )
+            with gr.Tabs():
+                # with gr.Row(visible=advanced_ui) as advanced_row:
+                with gr.Tab("General"):
+                    for model_type in model_types:
+                        choice = get_model_filename(model_type, transformer_quantization)
+                        model_list.append(choice)
+                    dropdown_choices = [ ( get_model_name(choice), get_model_type(choice) ) for choice in model_list]
+                    transformer_types_choices = gr.Dropdown(
+                        choices= dropdown_choices,
+                        value= transformer_types,
+                        label= "Selectable Wan Transformer Models (keep empty to get All of them)",
+                        scale= 2,
+                        multiselect= True
+                    )
+
+                    fit_canvas_choice = gr.Dropdown(
+                        choices=[
+                            ("Dimensions correspond to the Pixels Budget (as the Prompt Image/Video will be resized to match this pixels budget, output video height or width may exceed the requested dimensions )", 0),
+                            ("Dimensions correspond to the Maximum Width and Height (as the Prompt Image/Video will be resized to fit into these dimensions, the output video may be smaller)", 1),
+                        ],
+                        value= server_config.get("fit_canvas", 0),
+                        label="Generated Video Dimensions when Prompt contains an Image or a Video",
+                        interactive= not lock_ui_attention
+                    )
+
+
+                    def check(mode):
+                        if not mode in attention_modes_installed:
+                            return " (NOT INSTALLED)"
+                        elif not mode in attention_modes_supported:
+                            return " (NOT SUPPORTED)"
+                        else:
+                            return ""
+                    attention_choice = gr.Dropdown(
+                        choices=[
+                            ("Auto : pick sage2 > sage > sdpa depending on what is installed", "auto"),
+                            ("Scale Dot Product Attention: default, always available", "sdpa"),
+                            ("Flash" + check("flash")+ ": good quality - requires additional install (usually complex to set up on Windows without WSL)", "flash"),
+                            ("Xformers" + check("xformers")+ ": good quality - requires additional install (usually complex, may consume less VRAM to set up on Windows without WSL)", "xformers"),
+                            ("Sage" + check("sage")+ ": 30% faster but slightly worse quality - requires additional install (usually complex to set up on Windows without WSL)", "sage"),
+                            ("Sage2" + check("sage2")+ ": 40% faster but slightly worse quality - requires additional install (usually complex to set up on Windows without WSL)", "sage2"),
+                        ],
+                        value= attention_mode,
+                        label="Attention Type",
+                        interactive= not lock_ui_attention
+                    )
+
+
+                    metadata_choice = gr.Dropdown(
+                        choices=[
+                            ("Export JSON files", "json"),
+                            ("Add metadata to video", "metadata"),
+                            ("Neither", "none")
+                        ],
+                        value=server_config.get("metadata_type", "metadata"),
+                        label="Metadata Handling"
+                    )
+                    preload_model_policy_choice = gr.CheckboxGroup([("Preload Model while Launching the App","P"), ("Preload Model while Switching Model", "S"), ("Unload Model when Queue is Done", "U")],
+                        value=server_config.get("preload_model_policy",[]),
+                        label="RAM Loading / Unloading Model Policy (in any case VRAM will be freed once the queue has been processed)"
+                    )
+
+                    clear_file_list_choice = gr.Dropdown(
+                        choices=[
+                            ("None", 0),
+                            ("Keep the last video", 1),
+                            ("Keep the last 5 videos", 5),
+                            ("Keep the last 10 videos", 10),
+                            ("Keep the last 20 videos", 20),
+                            ("Keep the last 30 videos", 30),
+                        ],
+                        value=server_config.get("clear_file_list", 5),
+                        label="Keep Previously Generated Videos when starting a new Generation Batch"
+                    )
+
+                    UI_theme_choice = gr.Dropdown(
+                        choices=[
+                            ("Blue Sky", "default"),
+                            ("Classic Gradio", "gradio"),
+                        ],
+                        value=server_config.get("UI_theme_choice", "default"),
+                        label="User Interface Theme. You will need to restart the App the see new Theme."
+                    )
+
+                    save_path_choice = gr.Textbox(
+                        label="Output Folder for Generated Videos",
+                        value=server_config.get("save_path", save_path)
+                    )
+
+                with gr.Tab("Performance"):
+
+                    quantization_choice = gr.Dropdown(
+                        choices=[
+                            ("Scaled Int8 Quantization (recommended)", "int8"),
+                            ("16 bits (no quantization)", "bf16"),
+                        ],
+                        value= transformer_quantization,
+                        label="Wan Transformer Model Quantization Type (if available)",
+                    )
+
+                    mixed_precision_choice = gr.Dropdown(
+                        choices=[
+                            ("16 bits only, requires less VRAM", "0"),
+                            ("Mixed 16 / 32 bits, slightly more VRAM needed but better Quality", "1"),
+                        ],
+                        value= server_config.get("mixed_precision", "0"),
+                        label="Transformer Engine Calculation"
+                    )
+
+                    index = text_encoder_choices.index(text_encoder_filename)
+                    index = 0 if index ==0 else index
+                    text_encoder_choice = gr.Dropdown(
+                        choices=[
+                            ("UMT5 XXL 16 bits - unquantized text encoder, better quality uses more RAM", 0),
+                            ("UMT5 XXL quantized to 8 bits - quantized text encoder, slightly worse quality but uses less RAM", 1),
+                        ],
+                        value= index,
+                        label="Text Encoder model"
+                    )
+
+                    VAE_precision_choice = gr.Dropdown(
+                        choices=[
+                            ("16 bits, requires less VRAM and faster", "16"),
+                            ("32 bits, requires twice more VRAM and slower but recommended with Window Sliding", "32"),
+                        ],
+                        value= server_config.get("vae_precision", "16"),
+                        label="VAE Encoding / Decoding precision"
+                    )
+
+                    gr.Text("Beware: when restarting the server or changing a resolution or video duration, the first step of generation for a duration / resolution may last a few minutes due to recompilation", interactive= False, show_label= False )
+                    compile_choice = gr.Dropdown(
+                        choices=[
+                            ("ON: works only on Linux / WSL", "transformer"),
+                            ("OFF: no other choice if you have Windows without using WSL", "" ),
+                        ],
+                        value= compile,
+                        label="Compile Transformer (up to 50% faster and 30% more frames but requires Linux / WSL and Flash or Sage attention)",
+                        interactive= not lock_ui_compile
+                    )
+
+                    vae_config_choice = gr.Dropdown(
+                        choices=[
+                            ("Auto", 0),
+                            ("Disabled (faster but may require up to 22 GB of VRAM)", 1),
+                            ("256 x 256 : If at least 8 GB of VRAM", 2),
+                            ("128 x 128 : If at least 6 GB of VRAM", 3),
+                        ],
+                        value= vae_config,
+                        label="VAE Tiling - reduce the high VRAM requirements for VAE decoding and VAE encoding (if enabled it will be slower)"
+                    )
+
+                    boost_choice = gr.Dropdown(
+                        choices=[
+                            # ("Auto (ON if Video longer than 5s)", 0),
+                            ("ON", 1),
+                            ("OFF", 2),
+                        ],
+                        value=boost,
+                        label="Boost: Give a 10% speedup without losing quality at the cost of a litle VRAM (up to 1GB at max frames and resolution)"
+                    )
+
+                    profile_choice = gr.Dropdown(
+                        choices=[
+                            ("HighRAM_HighVRAM, profile 1: at least 48 GB of RAM and 24 GB of VRAM, the fastest for short videos a RTX 3090 / RTX 4090", 1),
+                            ("HighRAM_LowVRAM, profile 2 (Recommended): at least 48 GB of RAM and 12 GB of VRAM, the most versatile profile with high RAM, better suited for RTX 3070/3080/4070/4080 or for RTX 3090 / RTX 4090 with large pictures batches or long videos", 2),
+                            ("LowRAM_HighVRAM, profile 3: at least 32 GB of RAM and 24 GB of VRAM, adapted for RTX 3090 / RTX 4090 with limited RAM for good speed short video",3),
+                            ("LowRAM_LowVRAM, profile 4 (Default): at least 32 GB of RAM and 12 GB of VRAM, if you have little VRAM or want to generate longer videos",4),
+                            ("VerylowRAM_LowVRAM, profile 5: (Fail safe): at least 16 GB of RAM and 10 GB of VRAM, if you don't have much it won't be fast but maybe it will work",5)
+                        ],
+                        value= profile,
+                        label="Profile (for power users only, not needed to change it)"
+                    )
 
 
        msg = gr.Markdown()
@@ -4728,7 +4754,8 @@ def generate_configuration_tab(state, blocks, header, model_choice):
            boost_choice,
            clear_file_list_choice,
            preload_model_policy_choice,
-            UI_theme_choice
+            UI_theme_choice,
+            fit_canvas_choice
        ],
        outputs= [msg , header, model_choice]
    )
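Note on the wgp.py changes above: the new "fit_canvas" entry in server_config (exposed in the General tab of the configuration UI, saved by apply_changes and read once per generation) selects between a pixel budget (0) and a maximum width/height (1), and is threaded into every preprocess_video and model call as fit_canvas / fit_into_canvas; the fixed VACE_SIZE_CONFIGS resolution check is commented out and image_size now simply follows the selected height/width. One practical consequence the relabelled resolution dropdown hints at: the chosen resolution is no longer forced onto the output. A self-contained illustration using only the helper added in wan/utils/utils.py (values are made up):

    from wan.utils.utils import calculate_new_dimensions

    # 1080x1920 portrait clip with "1280x720" selected in the UI and fit_canvas == 1:
    print(calculate_new_dimensions(720, 1280, 1920, 1080, True))   # (1280, 720): the output stays portrait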
|