diff --git a/.gitignore b/.gitignore
index 1f975d806c46fb441688b684dcea085ffdda9bf8..d95eb333ea755acab125e3af6d4d130020ae5f54 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,6 +21,7 @@
*.html
*.pdf
*.whl
+*.exe
cache
__pycache__/
storage/
@@ -34,3 +35,8 @@ Wan2.1-T2V-14B/
Wan2.1-T2V-1.3B/
Wan2.1-I2V-14B-480P/
Wan2.1-I2V-14B-720P/
+outputs/
+gradio_outputs/
+ckpts/
+loras/
+loras_i2v/
diff --git a/README.md b/README.md
index 2c49a1545ccaf69166c3deb4fee1ad503ac8fb97..82156c4f3fb369981a38f0481a9a3033717f4747 100644
--- a/README.md
+++ b/README.md
@@ -2,14 +2,35 @@
-----
-Wan2.1 GP by DeepBeepMeep based on Wan2.1's Alibaba: Open and Advanced Large-Scale Video Generative Models for the GPU Poor
+WanGP by DeepBeepMeep : The best Open Source Video Generative Models Accessible to the GPU Poor
-**NEW Discord Server to get Help from Other Users and show your Best Videos:** https://discord.gg/g7efUW9jGV
+WanGP supports the Wan (and derived models), Hunyuan Video and LTX Video models with:
+- Low VRAM requirements (as low as 6 GB of VRAM is sufficient for certain models)
+- Support for old GPUs (RTX 10XX, 20xx, ...)
+- Very Fast on the latest GPUs
+- Easy to use Full Web based interface
+- Auto download of the required model adapted to your specific architecture
+- Tools integrated to facilitate Video Generation : Mask Editor, Prompt Enhancer, Temporal and Spatial Generation
+- Loras Support to customize each model
+- Queuing system : make your shopping list of videos to generate and come back later
+
+
+**Discord Server to get Help from Other Users and show your Best Videos:** https://discord.gg/g7efUW9jGV
## 🔥 Latest News!!
+* May 17 2025: 👋 Wan 2.1GP v5.0 : One App to Rule Them All !\
+ Added support for the other great open source architectures:
+    - Hunyuan Video : text 2 video (one of the best, if not the best t2v), image 2 video and the recently released Hunyuan Custom (very good identity preservation when injecting a person into a video)
+    - LTX Video 13B (released last week): very long video support and fast 720p generation. The WanGP version has been greatly optimized and its VRAM requirements have been reduced by a factor of 4!
+
+ Also:
+    - Added support for the best Control Video Model, released 2 days ago : Vace 14B
+ - New Integrated prompt enhancer to increase the quality of the generated videos
+ You will need one more *pip install -r requirements.txt*
+
* May 5 2025: 👋 Wan 2.1GP v4.5: FantasySpeaking model, you can animate a talking head using a voice track. This works not only on people but also on objects. Also better seamless transitions between Vace sliding windows for very long videos (see recommended settings). New high quality processing features (mixed 16/32 bits calculation and 32 bitsVAE)
* April 27 2025: 👋 Wan 2.1GP v4.4: Phantom model support, very good model to transfer people or objects into video, works quite well at 720p and with the number of steps > 30
* April 25 2025: 👋 Wan 2.1GP v4.3: Added preview mode and support for Sky Reels v2 Diffusion Forcing for high quality "infinite length videos" (see Window Sliding section below).Note that Skyreel uses causal attention that is only supported by Sdpa attention so even if chose an other type of attention, some of the processes will use Sdpa attention.
@@ -71,30 +92,6 @@ If you upgrade you will need to do a 'pip install -r requirements.txt' again.
* Feb 27, 2025: 👋 Wan2.1 has been integrated into [ComfyUI](https://comfyanonymous.github.io/ComfyUI_examples/wan/). Enjoy!
-## Features
-*GPU Poor version by **DeepBeepMeep**. This great video generator can now run smoothly on any GPU.*
-
-This version has the following improvements over the original Alibaba model:
-- Reduce greatly the RAM requirements and VRAM requirements
-- Much faster thanks to compilation and fast loading / unloading
-- Multiple profiles in order to able to run the model at a decent speed on a low end consumer config (32 GB of RAM and 12 VRAM) and to run it at a very good speed on a high end consumer config (48 GB of RAM and 24 GB of VRAM)
-- Autodownloading of the needed model files
-- Improved gradio interface with progression bar and more options
-- Multiples prompts / multiple generations per prompt
-- Support multiple pretrained Loras with 32 GB of RAM or less
-- Much simpler installation
-
-
-This fork by DeepBeepMeep is an integration of the mmpg module on the original model
-
-It is an illustration on how one can set up on an existing model some fast and properly working CPU offloading with changing only a few lines of code in the core model.
-
-For more information on how to use the mmpg module, please go to: https://github.com/deepbeepmeep/mmgp
-
-You will find the original Wan2.1 Video repository here: https://github.com/Wan-Video/Wan2.1
-
-
-
## Installation Guide for Linux and Windows for GPUs up to RTX40xx
@@ -182,11 +179,11 @@ To run the text to video generator (in Low VRAM mode):
```bash
python wgp.py
#or
-python wgp.py --t2v #launch the default text 2 video model
+python wgp.py --t2v #launch the default Wan text 2 video model
#or
-python wgp.py --t2v-14B #for the 14B model
+python wgp.py --t2v-14B #for the Wan 14B model
#or
-python wgp.py --t2v-1-3B #for the 1.3B model
+python wgp.py --t2v-1-3B #for the Wan 1.3B model
```
@@ -227,17 +224,23 @@ python wgp.py --attention sdpa
### Loras support
-Every lora stored in the subfoler 'loras' for t2v and 'loras_i2v' will be automatically loaded. You will be then able to activate / desactive any of them when running the application by selecting them in the area below "Activated Loras" .
+Loras for the Wan models are stored in the subfolder 'loras' for t2v and 'loras_i2v' for i2v. You will then be able to activate / deactivate any of them when running the application by selecting them in the Advanced Tab "Loras".
-If you want to manage in different areas Loras for the 1.3B model and the 14B as they are not compatible, just create the following subfolders:
+If you want to manage in different areas Loras for the 1.3B model and the 14B of Wan t2v models (as they are not compatible), just create the following subfolders:
- loras/1.3B
- loras/14B
-You can also put all the loras in the same place by launching the app with following command line (*path* is a path to shared loras directory):
+You can also put all the loras in the same place by launching the app with the following command line (*path* is a path to shared loras directory):
```
python wgp.exe --lora-dir path --lora-dir-i2v path
```
+Hunyuan Video and LTX Video models also have their own loras subfolders:
+- loras_hunyuan
+- loras_hunyuan_i2v
+- loras_ltxv
+
+
For each activated Lora, you may specify a *multiplier* that is one float number that corresponds to its weight (default is 1.0) .The multipliers for each Lora should be separated by a space character or a carriage return. For instance:\
*1.2 0.8* means that the first lora will have a 1.2 multiplier and the second one will have 0.8.
@@ -342,7 +345,11 @@ Experimental: if your prompt is broken into multiple lines (each line separated
--i2v-1-3B : launch the Fun InP 1.3B model image to video generator\
--vace : launch the Vace ControlNet 1.3B model image to video generator\
--quantize-transformer bool: (default True) : enable / disable on the fly transformer quantization\
---lora-dir path : Path of directory that contains Loras in diffusers / safetensor format\
+--lora-dir path : Path of directory that contains Wan t2v Loras\
+--lora-dir-i2v path : Path of directory that contains Wan i2v Loras\
+--lora-dir-hunyuan path : Path of directory that contains Hunyuan t2v Loras\
+--lora-dir-hunyuan-i2v path : Path of directory that contains Hunyuan i2v Loras\
+--lora-dir-ltxv path : Path of directory that contains LTX Video Loras\
--lora-preset preset : name of preset gile (without the extension) to preload
--verbose level : default (1) : level of information between 0 and 2\
--server-port portno : default (7860) : Gradio port no\
diff --git a/hyvideo/__init__.py b/hyvideo/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/hyvideo/config.py b/hyvideo/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..192bfa41ecdf77bee334f3e7cc44161bc4f69c1f
--- /dev/null
+++ b/hyvideo/config.py
@@ -0,0 +1,534 @@
+import argparse
+from .constants import *
+import re
+from .modules.models import HUNYUAN_VIDEO_CONFIG
+
+
+def parse_args(namespace=None):
+    """Build the HunyuanVideo CLI parser, parse the command line and return the checked args."""
+    cli = argparse.ArgumentParser(description="HunyuanVideo inference script")
+    registrars = (
+        add_network_args,
+        add_extra_models_args,
+        add_denoise_schedule_args,
+        add_inference_args,
+        add_parallel_args,
+    )
+    for register in registrars:
+        cli = register(cli)  # every registrar hands the parser back
+    return sanity_check_args(cli.parse_args(namespace=namespace))
+
+
+def add_network_args(parser: argparse.ArgumentParser):
+    """Register the app/server, Lora and main-model options on *parser* and return it."""
+    group = parser.add_argument_group(title="HunyuanVideo network args")
+
+    group.add_argument(
+        "--quantize-transformer",
+        action="store_true",
+        help="On the fly 'transformer' quantization"
+    )
+
+
+    group.add_argument(
+        "--lora-dir-i2v",
+        type=str,
+        default="loras_i2v",
+        help="Path to a directory that contains Loras for i2v"
+    )
+
+
+    group.add_argument(
+        "--lora-dir",
+        type=str,
+        default="",
+        help="Path to a directory that contains Loras"
+    )
+
+
+    group.add_argument(
+        "--lora-preset",
+        type=str,
+        default="",
+        help="Lora preset to preload"
+    )
+
+    # group.add_argument(
+    #     "--lora-preset-i2v",
+    #     type=str,
+    #     default="",
+    #     help="Lora preset to preload for i2v"
+    # )
+
+    group.add_argument(
+        "--profile",
+        type=str,  # NOTE(review): default is the int -1, CLI values arrive as str
+        default=-1,
+        help="Profile No"
+    )
+
+    group.add_argument(
+        "--verbose",
+        type=str,  # NOTE(review): default is the int 1, CLI values arrive as str
+        default=1,
+        help="Verbose level"
+    )
+
+    group.add_argument(
+        "--server-port",
+        type=str,  # NOTE(review): default is the int 0, CLI values arrive as str
+        default=0,
+        help="Server port"
+    )
+
+    group.add_argument(
+        "--server-name",
+        type=str,
+        default="",
+        help="Server name"
+    )
+
+    group.add_argument(
+        "--open-browser",
+        action="store_true",
+        help="open browser"
+    )
+
+    group.add_argument(
+        "--t2v",
+        action="store_true",
+        help="text to video mode"
+    )
+
+    group.add_argument(
+        "--i2v",
+        action="store_true",
+        help="image to video mode"
+    )
+
+    group.add_argument(
+        "--compile",
+        action="store_true",
+        help="Enable pytorch compilation"
+    )
+
+    group.add_argument(
+        "--fast",
+        action="store_true",
+        help="use Fast HunyuanVideo model"
+    )
+
+    group.add_argument(
+        "--fastest",
+        action="store_true",
+        help="activate the best config"
+    )
+
+    group.add_argument(
+        "--attention",
+        type=str,
+        default="",
+        help="attention mode"
+    )
+
+    group.add_argument(
+        "--vae-config",
+        type=str,
+        default="",
+        help="vae config mode"
+    )
+
+    parser.add_argument(
+        "--share",
+        action="store_true",
+        help="Create a shared URL to access webserver remotely"
+    )
+
+    parser.add_argument(
+        "--lock-config",
+        action="store_true",
+        help="Prevent modifying the configuration from the web interface"
+    )
+
+    parser.add_argument(
+        "--preload",
+        type=str,
+        default="0",
+        help="Megabytes of the diffusion model to preload in VRAM"
+    )
+
+    parser.add_argument(
+        "--multiple-images",
+        action="store_true",
+        help="Allow inputting multiple images with image to video"
+    )
+
+
+    # Main model
+    group.add_argument(
+        "--model",
+        type=str,
+        choices=list(HUNYUAN_VIDEO_CONFIG.keys()),
+        default="HYVideo-T/2-cfgdistill",
+    )
+    group.add_argument(
+        "--latent-channels",
+        type=int,  # must be int: the value is compared against the int VAE channel count
+        default=16,
+        help="Number of latent channels of DiT. If None, it will be determined by `vae`. If provided, "
+        "it still needs to match the latent channels of the VAE model.",
+    )
+    group.add_argument(
+        "--precision",
+        type=str,
+        default="bf16",
+        choices=PRECISIONS,
+        help="Precision mode. Options: fp32, fp16, bf16. Applied to the backbone model and optimizer.",
+    )
+
+    # RoPE
+    group.add_argument(
+        "--rope-theta", type=int, default=256, help="Theta used in RoPE."
+    )
+    return parser
+
+
+def add_extra_models_args(parser: argparse.ArgumentParser):
+    """Register the VAE, text-encoder and tokenizer options on *parser* and return it."""
+    group = parser.add_argument_group(
+        title="Extra models args (including vae, text encoders and tokenizers)"
+    )
+    # - VAE
+    group.add_argument(
+        "--vae",
+        type=str,
+        default="884-16c-hy",
+        choices=list(VAE_PATH),
+        help="Name of the VAE model.",
+    )
+    group.add_argument(
+        "--vae-precision",
+        type=str,
+        default="fp16",
+        choices=PRECISIONS,
+        help="Precision mode for the VAE model.",
+    )
+    group.add_argument(
+        "--vae-tiling",
+        action="store_true",
+        help="Enable tiling for the VAE model to save GPU memory.",
+    )
+    group.set_defaults(vae_tiling=True)  # NOTE: default True + store_true => the flag cannot turn tiling off
+
+    group.add_argument(
+        "--text-encoder",
+        type=str,
+        default="llm",
+        choices=list(TEXT_ENCODER_PATH),
+        help="Name of the text encoder model.",
+    )
+    group.add_argument(
+        "--text-encoder-precision",
+        type=str,
+        default="fp16",
+        choices=PRECISIONS,
+        help="Precision mode for the text encoder model.",
+    )
+    group.add_argument(
+        "--text-states-dim",
+        type=int,
+        default=4096,
+        help="Dimension of the text encoder hidden states.",
+    )
+    group.add_argument(
+        "--text-len", type=int, default=256, help="Maximum length of the text input."
+    )
+    group.add_argument(
+        "--tokenizer",
+        type=str,
+        default="llm",
+        choices=list(TOKENIZER_PATH),
+        help="Name of the tokenizer model.",
+    )
+    group.add_argument(
+        "--prompt-template",
+        type=str,
+        default="dit-llm-encode",
+        choices=PROMPT_TEMPLATE,
+        help="Image prompt template for the decoder-only text encoder model.",
+    )
+    group.add_argument(
+        "--prompt-template-video",
+        type=str,
+        default="dit-llm-encode-video",
+        choices=PROMPT_TEMPLATE,
+        help="Video prompt template for the decoder-only text encoder model.",
+    )
+    group.add_argument(
+        "--hidden-state-skip-layer",
+        type=int,
+        default=2,
+        help="Skip layer for hidden states.",
+    )
+    group.add_argument(
+        "--apply-final-norm",
+        action="store_true",
+        help="Apply final normalization to the used text encoder hidden states.",
+    )
+
+    # - CLIP
+    group.add_argument(
+        "--text-encoder-2",
+        type=str,
+        default="clipL",
+        choices=list(TEXT_ENCODER_PATH),
+        help="Name of the second text encoder model.",
+    )
+    group.add_argument(
+        "--text-encoder-precision-2",
+        type=str,
+        default="fp16",
+        choices=PRECISIONS,
+        help="Precision mode for the second text encoder model.",
+    )
+    group.add_argument(
+        "--text-states-dim-2",
+        type=int,
+        default=768,
+        help="Dimension of the second text encoder hidden states.",
+    )
+    group.add_argument(
+        "--tokenizer-2",
+        type=str,
+        default="clipL",
+        choices=list(TOKENIZER_PATH),
+        help="Name of the second tokenizer model.",
+    )
+    group.add_argument(
+        "--text-len-2",
+        type=int,
+        default=77,
+        help="Maximum length of the second text input.",
+    )
+
+    return parser
+
+
+def add_denoise_schedule_args(parser: argparse.ArgumentParser):
+    """Register the denoise-type and flow-matching schedule options on *parser* and return it."""
+    group = parser.add_argument_group(title="Denoise schedule args")
+    group.add_argument(
+        "--denoise-type",
+        type=str,
+        default="flow",
+        help="Denoise type for noised inputs.",
+    )
+
+    # Flow Matching
+    group.add_argument(
+        "--flow-shift",
+        type=float,
+        default=7.0,
+        help="Shift factor for flow matching schedulers.",
+    )
+    group.add_argument(
+        "--flow-reverse",
+        action="store_true",
+        help="If reverse, learning/sampling from t=1 -> t=0.",
+    )
+    group.add_argument(
+        "--flow-solver",
+        type=str,
+        default="euler",
+        help="Solver for flow matching.",
+    )
+    group.add_argument(
+        "--use-linear-quadratic-schedule",
+        action="store_true",
+        help="Use linear quadratic schedule for flow matching. "
+        "Following MovieGen (https://ai.meta.com/static-resource/movie-gen-research-paper)",
+    )
+    group.add_argument(
+        "--linear-schedule-end",
+        type=int,
+        default=25,
+        help="End step for linear quadratic schedule for flow matching.",
+    )
+
+    return parser
+
+
+def add_inference_args(parser: argparse.ArgumentParser):
+    """Register the model-loading, sampling, prompt and guidance options on *parser* and return it."""
+    group = parser.add_argument_group(title="Inference args")
+    # ======================== Model loads ========================
+    group.add_argument(
+        "--model-base",
+        type=str,
+        default="ckpts",
+        help="Root path of all the models, including t2v models and extra models.",
+    )
+    group.add_argument(
+        "--dit-weight",
+        type=str,
+        default="ckpts/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states.pt",
+        help="Path to the HunyuanVideo model. If None, search the model in the args.model_root. "
+        "1. If it is a file, load the model directly. "
+        "2. If it is a directory, search the model in the directory. Support two types of models: "
+        "1) named `pytorch_model_*.pt` "
+        "2) named `*_model_states.pt`, where * can be `mp_rank_00`.",
+    )
+    group.add_argument(
+        "--model-resolution",
+        type=str,
+        default="540p",
+        choices=["540p", "720p"],
+        help="Model resolution. Options: 540p, 720p.",
+    )
+    group.add_argument(
+        "--load-key",
+        type=str,
+        default="module",
+        help="Key to load the model states. 'module' for the main model, 'ema' for the EMA model.",
+    )
+    group.add_argument(
+        "--use-cpu-offload",
+        action="store_true",
+        help="Use CPU offload for the model load.",
+    )
+
+    # ======================== Inference general setting ========================
+    group.add_argument(
+        "--batch-size",
+        type=int,
+        default=1,
+        help="Batch size for inference and evaluation.",
+    )
+    group.add_argument(
+        "--infer-steps",
+        type=int,
+        default=50,
+        help="Number of denoising steps for inference.",
+    )
+    group.add_argument(
+        "--disable-autocast",
+        action="store_true",
+        help="Disable autocast for denoising loop and vae decoding in pipeline sampling.",
+    )
+    group.add_argument(
+        "--save-path",
+        type=str,
+        default="./results",
+        help="Path to save the generated samples.",
+    )
+    group.add_argument(
+        "--save-path-suffix",
+        type=str,
+        default="",
+        help="Suffix for the directory of saved samples.",
+    )
+    group.add_argument(
+        "--name-suffix",
+        type=str,
+        default="",
+        help="Suffix for the names of saved samples.",
+    )
+    group.add_argument(
+        "--num-videos",
+        type=int,
+        default=1,
+        help="Number of videos to generate for each prompt.",
+    )
+    # ---sample size---
+    group.add_argument(
+        "--video-size",
+        type=int,
+        nargs="+",
+        default=(720, 1280),
+        help="Video size for training. If a single value is provided, it will be used for both height "
+        "and width. If two values are provided, they will be used for height and width "
+        "respectively.",
+    )
+    group.add_argument(
+        "--video-length",
+        type=int,
+        default=129,
+        help="How many frames to sample from a video. if using 3d vae, the number should be 4n+1",
+    )
+    # --- prompt ---
+    group.add_argument(
+        "--prompt",
+        type=str,
+        default=None,
+        help="Prompt for sampling during evaluation.",
+    )
+    group.add_argument(
+        "--seed-type",
+        type=str,
+        default="auto",
+        choices=["file", "random", "fixed", "auto"],
+        help="Seed type for evaluation. If file, use the seed from the CSV file. If random, generate a "
+        "random seed. If fixed, use the fixed seed given by `--seed`. If auto, `csv` will use the "
+        "seed column if available, otherwise use the fixed `seed` value. `prompt` will use the "
+        "fixed `seed` value.",
+    )
+    group.add_argument("--seed", type=int, default=None, help="Seed for evaluation.")
+
+    # Classifier-Free Guidance
+    group.add_argument(
+        "--neg-prompt", type=str, default=None, help="Negative prompt for sampling."
+    )
+    group.add_argument(
+        "--cfg-scale", type=float, default=1.0, help="Classifier free guidance scale."
+    )
+    group.add_argument(
+        "--embedded-cfg-scale",
+        type=float,
+        default=6.0,
+        help="Embedded classifier free guidance scale.",
+    )
+
+    group.add_argument(
+        "--reproduce",
+        action="store_true",
+        help="Enable reproducibility by setting random seeds and deterministic algorithms.",
+    )
+
+    return parser
+
+
+def add_parallel_args(parser: argparse.ArgumentParser):
+    """Register the `--ulysses-degree` and `--ring-degree` options on *parser* and return it."""
+    group = parser.add_argument_group(title="Parallel args")
+    # ======================== Parallel degrees ========================
+    group.add_argument(
+        "--ulysses-degree",
+        type=int,
+        default=1,
+        help="Ulysses degree.",
+    )
+    group.add_argument(
+        "--ring-degree",
+        type=int,
+        default=1,
+        help="Ring degree.",
+    )
+
+    return parser
+
+
+def sanity_check_args(args):
+    """Validate `args.vae` and make `args.latent_channels` agree with it; raise ValueError otherwise."""
+    vae_pattern = r"\d{2,3}-\d{1,2}c-\w+"
+    if not re.match(vae_pattern, args.vae):
+        raise ValueError(
+            f"Invalid VAE model: {args.vae}. Must be in the format of '{vae_pattern}'."
+        )
+    vae_channels = int(args.vae.split("-")[1][:-1])  # e.g. "884-16c-hy" -> 16
+    # Normalise to int: a latent-channels value parsed as text ("16") must still match 16.
+    args.latent_channels = vae_channels if args.latent_channels is None else int(args.latent_channels)
+    if vae_channels != args.latent_channels:
+        raise ValueError(
+            f"Latent channels ({args.latent_channels}) must match the VAE channels ({vae_channels})."
+        )
+    return args
diff --git a/hyvideo/constants.py b/hyvideo/constants.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ad805937f8f5e14ee4b19b40ad482498e7ded6a
--- /dev/null
+++ b/hyvideo/constants.py
@@ -0,0 +1,164 @@
+import os
+import torch
+
+__all__ = [
+ "C_SCALE",
+ "PROMPT_TEMPLATE",
+ "MODEL_BASE",
+ "PRECISIONS",
+ "NORMALIZATION_TYPE",
+ "ACTIVATION_TYPE",
+ "VAE_PATH",
+ "TEXT_ENCODER_PATH",
+ "TOKENIZER_PATH",
+ "TEXT_PROJECTION",
+ "DATA_TYPE",
+ "NEGATIVE_PROMPT",
+ "NEGATIVE_PROMPT_I2V",
+ "FLOW_PATH_TYPE",
+ "FLOW_PREDICT_TYPE",
+ "FLOW_LOSS_WEIGHT",
+ "FLOW_SNR_TYPE",
+ "FLOW_SOLVER",
+]
+
+PRECISION_TO_TYPE = {  # precision name -> torch dtype; absent from __all__, so import it by name
+ 'fp32': torch.float32,
+ 'fp16': torch.float16,
+ 'bf16': torch.bfloat16,
+}
+
+# =================== Constant Values =====================
+# Computation scale factor, 1P = 1_000_000_000_000_000. Tensorboard will display the value in PetaFLOPS to avoid
+# overflow error when tensorboard logging values.
+C_SCALE = 1_000_000_000_000_000
+
+# When using decoder-only models, we must provide a prompt template to instruct the text encoder
+# on how to generate the text.
+# --------------------------------------------------------------------
+PROMPT_TEMPLATE_ENCODE = (
+ "<|start_header_id|>system<|end_header_id|>\n\nDescribe the image by detailing the color, shape, size, texture, "
+ "quantity, text, spatial relationships of the objects and background:<|eot_id|>"
+ "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
+)
+PROMPT_TEMPLATE_ENCODE_VIDEO = (
+ "<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: "
+ "1. The main content and theme of the video."
+ "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
+ "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
+ "4. background environment, light, style and atmosphere."
+ "5. camera angles, movements, and transitions used in the video:<|eot_id|>"
+ "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
+)
+
+PROMPT_TEMPLATE_ENCODE_I2V = (
+ "<|start_header_id|>system<|end_header_id|>\n\n\nDescribe the image by detailing the color, shape, size, texture, "
+ "quantity, text, spatial relationships of the objects and background:<|eot_id|>"
+ "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
+ "<|start_header_id|>assistant<|end_header_id|>\n\n"
+)
+
+PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = (
+ "<|start_header_id|>system<|end_header_id|>\n\n\nDescribe the video by detailing the following aspects according to the reference image: "
+ "1. The main content and theme of the video."
+ "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
+ "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
+ "4. background environment, light, style and atmosphere."
+ "5. camera angles, movements, and transitions used in the video:<|eot_id|>\n\n"
+ "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
+ "<|start_header_id|>assistant<|end_header_id|>\n\n"
+)
+
+NEGATIVE_PROMPT = "Aerial view, aerial view, overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion"
+NEGATIVE_PROMPT_I2V = "deformation, a poor composition and deformed video, bad teeth, bad eyes, bad limbs"
+
+PROMPT_TEMPLATE = {
+ "dit-llm-encode": {
+ "template": PROMPT_TEMPLATE_ENCODE,
+ "crop_start": 36,
+ },
+ "dit-llm-encode-video": {
+ "template": PROMPT_TEMPLATE_ENCODE_VIDEO,
+ "crop_start": 95,
+ },
+ "dit-llm-encode-i2v": {
+ "template": PROMPT_TEMPLATE_ENCODE_I2V,
+ "crop_start": 36,
+ "image_emb_start": 5,
+ "image_emb_end": 581,
+ "image_emb_len": 576,
+ "double_return_token_id": 271
+ },
+ "dit-llm-encode-video-i2v": {
+ "template": PROMPT_TEMPLATE_ENCODE_VIDEO_I2V,
+ "crop_start": 103,
+ "image_emb_start": 5,
+ "image_emb_end": 581,
+ "image_emb_len": 576,
+ "double_return_token_id": 271
+ },
+}
+
+# ======================= Model ======================
+PRECISIONS = {"fp32", "fp16", "bf16"}
+NORMALIZATION_TYPE = {"layer", "rms"}
+ACTIVATION_TYPE = {"relu", "silu", "gelu", "gelu_tanh"}
+
+# =================== Model Path =====================
+MODEL_BASE = os.getenv("MODEL_BASE", "./ckpts")  # env override, read once when the module is imported
+
+# =================== Data =======================
+DATA_TYPE = {"image", "video", "image_video"}
+
+# 3D VAE: VAE name -> checkpoint directory under MODEL_BASE
+VAE_PATH = {"884-16c-hy": f"{MODEL_BASE}/hunyuan-video-t2v-720p/vae"}
+
+# Text Encoder: encoder name -> checkpoint directory under MODEL_BASE
+TEXT_ENCODER_PATH = {
+ "clipL": f"{MODEL_BASE}/clip_vit_large_patch14",
+ "llm": f"{MODEL_BASE}/llava-llama-3-8b",
+ "llm-i2v": f"{MODEL_BASE}/llava-llama-3-8b",
+}
+
+# Tokenizer: tokenizer name -> directory (same directories as the text encoders above)
+TOKENIZER_PATH = {
+ "clipL": f"{MODEL_BASE}/clip_vit_large_patch14",
+ "llm": f"{MODEL_BASE}/llava-llama-3-8b",
+ "llm-i2v": f"{MODEL_BASE}/llava-llama-3-8b",
+}
+
+TEXT_PROJECTION = {
+ "linear", # Default, an nn.Linear() layer
+ "single_refiner", # Single TokenRefiner. Refer to LI-DiT
+}
+
+# Flow Matching path type
+FLOW_PATH_TYPE = {
+ "linear", # Linear trajectory between noise and data
+ "gvp", # Generalized variance-preserving SDE
+ "vp", # Variance-preserving SDE
+}
+
+# Flow Matching predict type
+FLOW_PREDICT_TYPE = {
+ "velocity", # Predict velocity
+ "score", # Predict score
+ "noise", # Predict noise
+}
+
+# Flow Matching loss weight
+FLOW_LOSS_WEIGHT = {
+ "velocity", # Weight loss by velocity
+ "likelihood", # Weight loss by likelihood
+}
+
+# Flow Matching SNR type
+FLOW_SNR_TYPE = {
+ "lognorm", # Log-normal SNR
+ "uniform", # Uniform SNR
+}
+
+# Flow Matching solvers
+FLOW_SOLVER = {
+ "euler", # Euler solver
+}
\ No newline at end of file
diff --git a/hyvideo/diffusion/__init__.py b/hyvideo/diffusion/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2141aa3dccb5a6b231bf2f3ae6ab864152ffc3ec
--- /dev/null
+++ b/hyvideo/diffusion/__init__.py
@@ -0,0 +1,2 @@
+from .pipelines import HunyuanVideoPipeline
+from .schedulers import FlowMatchDiscreteScheduler
diff --git a/hyvideo/diffusion/pipelines/__init__.py b/hyvideo/diffusion/pipelines/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e44cb6196fe7f6a7fa821b2bebddf8d1117521fc
--- /dev/null
+++ b/hyvideo/diffusion/pipelines/__init__.py
@@ -0,0 +1 @@
+from .pipeline_hunyuan_video import HunyuanVideoPipeline
diff --git a/hyvideo/diffusion/pipelines/pipeline_hunyuan_video.py b/hyvideo/diffusion/pipelines/pipeline_hunyuan_video.py
new file mode 100644
index 0000000000000000000000000000000000000000..9261cf1b60e2404546516e77bd1fa0de0d839f97
--- /dev/null
+++ b/hyvideo/diffusion/pipelines/pipeline_hunyuan_video.py
@@ -0,0 +1,1419 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Modified from diffusers==0.29.2
+#
+# ==============================================================================
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union, Tuple
+import torch
+import torch.distributed as dist
+import numpy as np
+from dataclasses import dataclass
+from packaging import version
+
+from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
+from diffusers.configuration_utils import FrozenDict
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.utils import BaseOutput
+from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.models import AutoencoderKL
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+ USE_PEFT_BACKEND,
+ deprecate,
+ logging,
+ replace_example_docstring,
+ scale_lora_layers,
+ unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import randn_tensor
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.utils import BaseOutput
+
+from ...constants import PRECISION_TO_TYPE
+from ...vae.autoencoder_kl_causal_3d import AutoencoderKLCausal3D
+from ...text_encoder import TextEncoder
+from ...modules import HYVideoDiffusionTransformer
+from mmgp import offload
+from ...utils.data_utils import black_image
+from einops import rearrange
+
+EXAMPLE_DOC_STRING = """"""
+
+
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Blend the guided prediction with a copy of itself whose spread matches the text prediction.
+
+    Based on findings of [Common Diffusion Noise Schedules and Sample Steps are
+    Flawed](https://arxiv.org/pdf/2305.08891.pdf), Section 3.4.
+
+    Args:
+        noise_cfg: Prediction obtained after applying guidance.
+        noise_pred_text: Text-conditioned prediction used as the reference for the standard deviation.
+        guidance_rescale: Mixing factor. `0.0` returns `noise_cfg` unchanged, `1.0` returns the fully
+            rescaled prediction.
+
+    Returns:
+        A tensor with the same shape as `noise_cfg`.
+    """
+
+    def _std_over_trailing_dims(tensor):
+        # Standard deviation over every dimension except the first, kept broadcastable.
+        return tensor.std(dim=list(range(1, tensor.ndim)), keepdim=True)
+
+    std_ratio = _std_over_trailing_dims(noise_pred_text) / _std_over_trailing_dims(noise_cfg)
+    # Matching the reference spread counters the overexposure introduced by guidance.
+    rescaled = noise_cfg * std_ratio
+    # Mixing with the untouched guidance result avoids "plain looking" outputs.
+    return guidance_rescale * rescaled + (1 - guidance_rescale) * noise_cfg
+
+
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """
+    Configure `scheduler` through its `set_timesteps` method and return the resulting schedule.
+
+    Exactly one of three modes is used: custom `timesteps`, custom `sigmas`, or a plain step count.
+    Extra keyword arguments are forwarded to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to configure and read the timesteps from.
+        num_inference_steps (`int`):
+            Number of steps, used only when neither `timesteps` nor `sigmas` is given.
+        device (`str` or `torch.device`, *optional*):
+            Forwarded to `set_timesteps` as `device`.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps. Requires `set_timesteps` to accept a `timesteps` parameter.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas. Requires `set_timesteps` to accept a `sigmas` parameter.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: `scheduler.timesteps` after configuration and the number of inference
+        steps (its length when a custom schedule was supplied, otherwise `num_inference_steps` as passed).
+
+    Raises:
+        ValueError: If both `timesteps` and `sigmas` are passed, or if the scheduler's `set_timesteps`
+            does not accept the custom schedule that was passed.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError(
+            "Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values"
+        )
+
+    if timesteps is None and sigmas is None:
+        # Default spacing: the scheduler derives the schedule from the step count.
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        return scheduler.timesteps, num_inference_steps
+
+    # A custom schedule was requested; the signature is only inspected on this path.
+    supported_params = inspect.signature(scheduler.set_timesteps).parameters
+    if timesteps is not None:
+        if "timesteps" not in supported_params:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+    else:
+        if "sigmas" not in supported_params:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+
+    schedule = scheduler.timesteps
+    return schedule, len(schedule)
+
+
+@dataclass
+class HunyuanVideoPipelineOutput(BaseOutput):
+    """
+    Output container for the HunyuanVideo pipeline.
+
+    Attributes:
+        videos (`torch.Tensor` or `np.ndarray`):
+            Presumably the generated videos. NOTE(review): the layout/shape is not fixed by this class -
+            confirm against the pipeline's `__call__`.
+    """
+
+    videos: Union[torch.Tensor, np.ndarray]
+
+
+class HunyuanVideoPipeline(DiffusionPipeline):
+ r"""
+ Pipeline for text-to-video generation using HunyuanVideo.
+
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+ Args:
+ vae ([`AutoencoderKL`]):
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
+ text_encoder ([`TextEncoder`]):
+ Frozen text-encoder.
+ text_encoder_2 ([`TextEncoder`]):
+ Frozen text-encoder_2.
+ transformer ([`HYVideoDiffusionTransformer`]):
+ A `HYVideoDiffusionTransformer` to denoise the encoded video latents.
+ scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
+ """
+
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
+ _optional_components = ["text_encoder_2"]
+ _exclude_from_cpu_offload = ["transformer"]
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: TextEncoder,
+        transformer: HYVideoDiffusionTransformer,
+        scheduler: KarrasDiffusionSchedulers,
+        text_encoder_2: Optional[TextEncoder] = None,
+        progress_bar_config: Dict[str, Any] = None,
+        args=None,
+    ):
+        """
+        Register the pipeline components and normalise legacy scheduler settings.
+
+        Args:
+            vae: Autoencoder registered as `self.vae`; its `config.block_out_channels` determines
+                `self.vae_scale_factor`.
+            text_encoder: Primary text encoder.
+            transformer: Denoising transformer.
+            scheduler: Scheduler. Its config is rewritten in place when `steps_offset != 1` or
+                `clip_sample is True` (a deprecation notice is emitted in both cases).
+            text_encoder_2: Optional second text encoder.
+            progress_bar_config: Optional options merged into `self._progress_bar_config`.
+            args: Opaque object stored on `self.args`.
+        """
+        super().__init__()
+
+        # Merge caller-supplied progress-bar options into the existing configuration, creating it if needed.
+        extra_progress_options = {} if progress_bar_config is None else progress_bar_config
+        if not hasattr(self, "_progress_bar_config"):
+            self._progress_bar_config = {}
+        self._progress_bar_config.update(extra_progress_options)
+
+        self.args = args
+
+        has_legacy_steps_offset = (
+            hasattr(scheduler.config, "steps_offset")
+            and scheduler.config.steps_offset != 1
+        )
+        if has_legacy_steps_offset:
+            steps_offset_notice = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate(
+                "steps_offset!=1", "1.0.0", steps_offset_notice, standard_warn=False
+            )
+            # Replace the frozen config with a copy carrying the corrected value.
+            scheduler._internal_dict = FrozenDict({**scheduler.config, "steps_offset": 1})
+
+        has_legacy_clip_sample = (
+            hasattr(scheduler.config, "clip_sample")
+            and scheduler.config.clip_sample is True
+        )
+        if has_legacy_clip_sample:
+            clip_sample_notice = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate(
+                "clip_sample not set", "1.0.0", clip_sample_notice, standard_warn=False
+            )
+            scheduler._internal_dict = FrozenDict({**scheduler.config, "clip_sample": False})
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            transformer=transformer,
+            scheduler=scheduler,
+            text_encoder_2=text_encoder_2,
+        )
+        # Spatial reduction factor: one halving per VAE block after the first.
+        downsampling_stages = len(self.vae.config.block_out_channels) - 1
+        self.vae_scale_factor = 2 ** downsampling_stages
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.noise_pertub = 0
+
+    def encode_prompt(
+        self,
+        prompt,
+        name,
+        device,
+        num_videos_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        pixel_value_llava: Optional[torch.Tensor] = None,
+        uncond_pixel_value_llava: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_attention_mask: Optional[torch.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
+        text_encoder: Optional[TextEncoder] = None,
+        data_type: Optional[str] = "image",
+        semantic_images=None
+    ):
+        r"""
+        Encodes the prompt (and, for classifier-free guidance, the negative prompt) into text encoder
+        hidden states, repeated `num_videos_per_prompt` times.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            name:
+                Forwarded unchanged to `text_encoder.text2tokens(..., name=name)` for both the positive and the
+                negative prompt. NOTE(review): its meaning is defined by the text encoder - confirm there.
+            device: (`torch.device`):
+                torch device
+            num_videos_per_prompt (`int`):
+                number of videos that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the video generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            pixel_value_llava (`torch.Tensor`, *optional*):
+                The image tensor for llava.
+            uncond_pixel_value_llava (`torch.Tensor`, *optional*):
+                The image tensor for llava. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            attention_mask (`torch.Tensor`, *optional*):
+                Returned as-is when `prompt_embeds` is supplied; otherwise it is replaced by the mask produced by
+                the text encoder.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            negative_attention_mask (`torch.Tensor`, *optional*):
+                Returned as-is unless the negative embeddings are computed here, in which case it is replaced by
+                the mask produced by the text encoder.
+            lora_scale (`float`, *optional*):
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            text_encoder (TextEncoder, *optional*):
+                Encoder to use; defaults to `self.text_encoder`.
+            data_type (`str`, *optional*):
+                Forwarded to `text_encoder.text2tokens` and `text_encoder.encode`.
+            semantic_images (*optional*):
+                Forwarded to `text_encoder.encode` for the positive prompt. For the negative prompt each image is
+                replaced by a `black_image` built from its `size`.
+
+        Returns:
+            `tuple`: `(prompt_embeds, negative_prompt_embeds, attention_mask, negative_attention_mask)`.
+
+        Raises:
+            TypeError: If `negative_prompt` and `prompt` are of different types.
+            ValueError: If a list `negative_prompt` does not have the same length as the prompt batch.
+        """
+        if text_encoder is None:
+            text_encoder = self.text_encoder
+
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        # (this branch only runs when the pipeline class also inherits LoraLoaderMixin)
+        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+            self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            if not USE_PEFT_BACKEND:
+                adjust_lora_scale_text_encoder(text_encoder.model, lora_scale)
+            else:
+                scale_lora_layers(text_encoder.model, lora_scale)
+
+        # Batch size comes from the prompt when given, otherwise from the precomputed embeddings.
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, text_encoder.tokenizer)
+
+            text_inputs = text_encoder.text2tokens(prompt, data_type=data_type, name = name)
+
+            if pixel_value_llava is not None:
+                text_inputs['pixel_value_llava'] = pixel_value_llava
+                # Extend the mask with ones for the image positions. The leading `1` hard-codes a single row.
+                # NOTE(review): 575 is presumably the number of positions one llava image adds - TODO confirm.
+                text_inputs['attention_mask'] = torch.cat([text_inputs['attention_mask'], torch.ones((1, 575 * len(pixel_value_llava))).to(text_inputs['attention_mask'])], dim=1)
+
+            if clip_skip is None:
+                prompt_outputs = text_encoder.encode(
+                    text_inputs, data_type=data_type, semantic_images=semantic_images, device=device
+                )
+                prompt_embeds = prompt_outputs.hidden_state
+            else:
+                prompt_outputs = text_encoder.encode(
+                    text_inputs,
+                    output_hidden_states=True,
+                    data_type=data_type,
+                    semantic_images=semantic_images,
+                    device=device,
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_outputs.hidden_states_list[-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = text_encoder.model.text_model.final_layer_norm(
+                    prompt_embeds
+                )
+
+            attention_mask = prompt_outputs.attention_mask
+            if attention_mask is not None:
+                attention_mask = attention_mask.to(device)
+                bs_embed, seq_len = attention_mask.shape
+                # Same repeat-then-view duplication as used for the embeddings below.
+                attention_mask = attention_mask.repeat(1, num_videos_per_prompt)
+                attention_mask = attention_mask.view(
+                    bs_embed * num_videos_per_prompt, seq_len
+                )
+
+        # `text_encoder` is always set at this point, so the fallbacks below are defensive only.
+        if text_encoder is not None:
+            prompt_embeds_dtype = text_encoder.dtype
+        elif self.transformer is not None:
+            prompt_embeds_dtype = self.transformer.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+
+        # 2-D embeddings have no sequence axis; otherwise a 3-D tensor is expected.
+        if prompt_embeds.ndim == 2:
+            bs_embed, _ = prompt_embeds.shape
+            # duplicate text embeddings for each generation per prompt, using mps friendly method
+            prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt)
+            prompt_embeds = prompt_embeds.view(bs_embed * num_videos_per_prompt, -1)
+        else:
+            bs_embed, seq_len, _ = prompt_embeds.shape
+            # duplicate text embeddings for each generation per prompt, using mps friendly method
+            prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+            prompt_embeds = prompt_embeds.view(
+                bs_embed * num_videos_per_prompt, seq_len, -1
+            )
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(
+                    uncond_tokens, text_encoder.tokenizer
+                )
+
+            # max_length = prompt_embeds.shape[1]
+            uncond_input = text_encoder.text2tokens(uncond_tokens, data_type=data_type, name = name)
+
+            # The unconditional pass sees black images of the same size instead of the semantic images.
+            if semantic_images is not None:
+                uncond_image = [black_image(img.size[0], img.size[1]) for img in semantic_images]
+            else:
+                uncond_image = None
+
+            if uncond_pixel_value_llava is not None:
+                uncond_input['pixel_value_llava'] = uncond_pixel_value_llava
+                # Mirrors the mask extension applied to the positive prompt above.
+                uncond_input['attention_mask'] = torch.cat([uncond_input['attention_mask'], torch.ones((1, 575 * len(uncond_pixel_value_llava))).to(uncond_input['attention_mask'])], dim=1)
+
+            # NOTE: `clip_skip` is not applied on the negative path; the encoder's `hidden_state` is used directly.
+            negative_prompt_outputs = text_encoder.encode(
+                uncond_input, data_type=data_type, semantic_images=uncond_image, device=device
+            )
+            negative_prompt_embeds = negative_prompt_outputs.hidden_state
+
+            negative_attention_mask = negative_prompt_outputs.attention_mask
+            if negative_attention_mask is not None:
+                negative_attention_mask = negative_attention_mask.to(device)
+                _, seq_len = negative_attention_mask.shape
+                negative_attention_mask = negative_attention_mask.repeat(
+                    1, num_videos_per_prompt
+                )
+                negative_attention_mask = negative_attention_mask.view(
+                    batch_size * num_videos_per_prompt, seq_len
+                )
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(
+                dtype=prompt_embeds_dtype, device=device
+            )
+
+            if negative_prompt_embeds.ndim == 2:
+                negative_prompt_embeds = negative_prompt_embeds.repeat(
+                    1, num_videos_per_prompt
+                )
+                negative_prompt_embeds = negative_prompt_embeds.view(
+                    batch_size * num_videos_per_prompt, -1
+                )
+            else:
+                negative_prompt_embeds = negative_prompt_embeds.repeat(
+                    1, num_videos_per_prompt, 1
+                )
+                negative_prompt_embeds = negative_prompt_embeds.view(
+                    batch_size * num_videos_per_prompt, seq_len, -1
+                )
+
+        if text_encoder is not None:
+            if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
+                # Retrieve the original scale by scaling back the LoRA layers
+                unscale_lora_layers(text_encoder.model, lora_scale)
+
+        return (
+            prompt_embeds,
+            negative_prompt_embeds,
+            attention_mask,
+            negative_attention_mask,
+        )
+
+    def decode_latents(self, latents, enable_tiling=True):
+        """
+        Decode latents with the VAE into a float32 CPU tensor with values in [0, 1].
+
+        Deprecated: a deprecation notice is emitted on every call.
+
+        Args:
+            latents: Latents to decode; they are divided by `self.vae.config.scaling_factor` first.
+            enable_tiling: When true, `self.vae.enable_tiling()` is called before decoding.
+
+        Returns:
+            The decoded tensor on CPU as float32. A 4-D result is permuted with `(0, 2, 3, 1)`;
+            any other rank is returned in the order the VAE produced it.
+        """
+        deprecate(
+            "decode_latents",
+            "1.0.0",
+            "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead",
+            standard_warn=False,
+        )
+
+        unscaled = 1 / self.vae.config.scaling_factor * latents
+        if enable_tiling:
+            self.vae.enable_tiling()
+        decoded = self.vae.decode(unscaled, return_dict=False)[0]
+
+        # Map from [-1, 1] to [0, 1].
+        frames = (decoded / 2 + 0.5).clamp(0, 1).cpu()
+        if frames.ndim == 4:
+            frames = frames.permute(0, 2, 3, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        return frames.float()
+
+    def prepare_extra_func_kwargs(self, func, kwargs):
+        """
+        Keep only the entries of `kwargs` whose key is a named parameter of `func`.
+
+        Useful because callables such as scheduler `step` methods do not all share the same signature.
+        A catch-all `**kwargs` parameter on `func` does not make arbitrary keys acceptable: only keys that
+        literally match a parameter name are kept.
+
+        Args:
+            func: The callable whose signature is inspected.
+            kwargs (`dict`): Candidate keyword arguments.
+
+        Returns:
+            `dict`: The subset of `kwargs` accepted by `func`, in the original order.
+        """
+        if not kwargs:
+            # Nothing to filter; skip signature inspection entirely.
+            return {}
+
+        # Inspect the signature once rather than once per candidate key.
+        accepted_names = inspect.signature(func).parameters
+        return {key: value for key, value in kwargs.items() if key in accepted_names}
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        video_length,
+        callback_steps,
+        pixel_value_llava=None,
+        uncond_pixel_value_llava=None,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        callback_on_step_end_tensor_inputs=None,
+        vae_ver="88-4c-sd",
+    ):
+        """
+        Validate the user-facing arguments of the pipeline call.
+
+        Args:
+            prompt: `str`, `list` or `None`; exactly one of `prompt` / `prompt_embeds` must be given.
+            height, width: Must both be divisible by 8.
+            video_length: When not `None` and `vae_ver` contains "884" (resp. "888"), must be of the form
+                `4n + 1` (resp. `8n + 1`).
+            callback_steps: `None` or a positive `int`.
+            pixel_value_llava, uncond_pixel_value_llava: When both are given they must have the same length.
+            negative_prompt, negative_prompt_embeds: Mutually exclusive.
+            prompt_embeds: When given together with `negative_prompt_embeds`, both must share the same shape.
+            callback_on_step_end_tensor_inputs: Every entry must be listed in `self._callback_tensor_inputs`.
+            vae_ver: VAE version string used to choose the frame-count rule.
+                NOTE(review): the default "88-4c-sd" contains neither "884" nor "888", so the frame-count check
+                is skipped unless the caller passes a matching value - confirm this is intended.
+
+        Raises:
+            ValueError: On the first violated constraint.
+        """
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(
+                f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
+            )
+
+        if video_length is not None:
+            if "884" in vae_ver:
+                temporal_stride = 4
+            elif "888" in vae_ver:
+                temporal_stride = 8
+            else:
+                temporal_stride = None
+            # video_length == 1 satisfies the rule trivially, so no special case is needed.
+            if temporal_stride is not None and (video_length - 1) % temporal_stride != 0:
+                raise ValueError(
+                    f"`video_length` has to be of the form {temporal_stride}*n + 1"
+                    f" (1, {temporal_stride + 1}, {2 * temporal_stride + 1}, ...) but is {video_length}."
+                )
+
+        if callback_steps is not None and (
+            not isinstance(callback_steps, int) or callback_steps <= 0
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if callback_on_step_end_tensor_inputs is not None:
+            unknown_inputs = [
+                k
+                for k in callback_on_step_end_tensor_inputs
+                if k not in self._callback_tensor_inputs
+            ]
+            if unknown_inputs:
+                raise ValueError(
+                    f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {unknown_inputs}"
+                )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and not isinstance(prompt, (str, list)):
+            raise ValueError(
+                f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
+            )
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if pixel_value_llava is not None and uncond_pixel_value_llava is not None:
+            if len(pixel_value_llava) != len(uncond_pixel_value_llava):
+                raise ValueError(
+                    "`pixel_value_llava` and `uncond_pixel_value_llava` must have the same length when passed directly, but"
+                    f" got: `pixel_value_llava` {len(pixel_value_llava)} != `uncond_pixel_value_llava`"
+                    f" {len(uncond_pixel_value_llava)}."
+                )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    def get_timesteps(self, num_inference_steps, strength, device):
+        """
+        Return the tail of the scheduler's timesteps that corresponds to `strength`.
+
+        Args:
+            num_inference_steps: Total number of steps the scheduler was configured with.
+            strength: Fraction of the schedule to run; `int(num_inference_steps * strength)` steps are kept,
+                capped at `num_inference_steps`.
+            device: Device the returned timesteps are moved to.
+
+        Returns:
+            `tuple`: the remaining timesteps and the number of steps that will actually run.
+        """
+        steps_to_run = min(int(num_inference_steps * strength), num_inference_steps)
+        first_step = max(num_inference_steps - steps_to_run, 0)
+
+        # Schedulers of higher order hold `order` entries per step.
+        remaining = self.scheduler.timesteps[first_step * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(first_step * self.scheduler.order)
+
+        return remaining.to(device), num_inference_steps - first_step
+
+
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        num_inference_steps,
+        height,
+        width,
+        video_length,
+        dtype,
+        device,
+        timesteps,
+        generator,
+        latents=None,
+        denoise_strength=1.0,
+        img_latents=None,
+        i2v_mode=False,
+        i2v_condition_type=None,
+        i2v_stability=True,
+    ):
+        """
+        Build the initial latents for the denoising loop.
+
+        Args:
+            batch_size: First dimension of the latent shape.
+            num_channels_latents: Latent channel count; replaced by `(n - 1) // 2` when `i2v_mode` is on and
+                `i2v_condition_type == "latent_concat"`.
+            num_inference_steps: Passed to `get_timesteps` when `denoise_strength != 0`.
+            height, width: Pixel sizes; divided by `self.vae_scale_factor` for the latent shape.
+            video_length: Used as-is for the frame axis of the latent shape.
+            dtype, device: Dtype and device of the sampled tensors.
+            timesteps: Returned unchanged when `denoise_strength == 0`; otherwise recomputed.
+            generator: A generator or a list of `batch_size` generators.
+            latents: Optional starting latents. Overwritten when `i2v_mode and i2v_stability`.
+            denoise_strength: `0` keeps `latents` as they are (or samples pure noise when none are given);
+                any other value trims the schedule and, when latents exist, blends them with fresh noise.
+            img_latents: Image latents, required when `i2v_mode and i2v_stability`.
+            i2v_mode, i2v_condition_type, i2v_stability: Image-to-video switches (see above).
+
+        Returns:
+            `tuple`: `(latents, original_latents, noise, timesteps)`. `original_latents` and `noise` are `None`
+            when they were not needed.
+
+        Raises:
+            ValueError: If the generator list length differs from `batch_size`, or if `img_latents` is missing
+                while `i2v_mode` and `i2v_stability` are both enabled.
+        """
+        if i2v_mode and i2v_condition_type == "latent_concat":
+            num_channels_latents = (num_channels_latents - 1) // 2
+        shape = (
+            batch_size,
+            num_channels_latents,
+            video_length,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if i2v_mode and i2v_stability:
+            if img_latents is None:
+                # Fail with a clear message instead of an AttributeError on `None.shape`.
+                raise ValueError(
+                    "`img_latents` must be provided when `i2v_mode` and `i2v_stability` are both enabled."
+                )
+            if img_latents.shape[2] == 1:
+                # Broadcast a single-frame image latent across all frames.
+                img_latents = img_latents.repeat(1, 1, video_length, 1, 1)
+            x0 = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            x1 = img_latents
+
+            # Start almost entirely from noise, with a 0.1% contribution of the image latents.
+            t = torch.tensor([0.999]).to(device=device)
+            latents = x0 * t + x1 * (1 - t)
+            latents = latents.to(dtype=dtype)
+
+        if denoise_strength == 0:
+            if latents is None:
+                latents = randn_tensor(
+                    shape, generator=generator, device=device, dtype=dtype
+                )
+            else:
+                latents = latents.to(device)
+            original_latents = None
+            noise = None
+        else:
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            # Only the trimmed schedule is used here; the adjusted step count is discarded.
+            timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, denoise_strength, device)
+
+            if latents is None:
+                latents = noise
+                original_latents = None
+            else:
+                latents = latents.to(device)
+                latent_timestep = timesteps[:1]
+                frames_needed = noise.shape[2]
+                current_frames = latents.shape[2]
+
+                if frames_needed > current_frames:
+                    # Pad at the front with random frames.
+                    # NOTE(review): this draw uses the global RNG and ignores `generator`, so it is not
+                    # reproducible from the seed - confirm whether that is intended.
+                    repeat_factor = frames_needed - current_frames
+                    additional_frame = torch.randn((latents.size(0), latents.size(1),repeat_factor, latents.size(3), latents.size(4)), dtype=latents.dtype, device=latents.device)
+                    latents = torch.cat((additional_frame, latents), dim=2)
+                    self.additional_frames = repeat_factor
+                elif frames_needed < current_frames:
+                    latents = latents[:, :, :frames_needed, :, :]
+
+                original_latents = latents.clone()
+                # Linear blend towards noise, with the first timestep rescaled from [0, 1000] to [0, 1].
+                latents = latents * (1 - latent_timestep / 1000) + latent_timestep / 1000 * noise
+
+        # Check existence to make it compatible with FlowMatchEulerDiscreteScheduler
+        if hasattr(self.scheduler, "init_noise_sigma"):
+            # scale the initial noise by the standard deviation required by the scheduler
+            latents = latents * self.scheduler.init_noise_sigma
+        return latents, original_latents, noise, timesteps
+
+ # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(
+        self,
+        w: torch.Tensor,
+        embedding_dim: int = 512,
+        dtype: torch.dtype = torch.float32,
+    ) -> torch.Tensor:
+        """
+        Build sinusoidal embeddings of the guidance scale values in `w`.
+
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            w (`torch.Tensor`):
+                1-D tensor of guidance scale values; each one is multiplied by 1000 before being embedded.
+            embedding_dim (`int`, *optional*, defaults to 512):
+                Width of the embedding. Odd widths are zero-padded by one column.
+            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
+                Data type of the generated embeddings.
+
+        Returns:
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`, sines first, then cosines.
+        """
+        assert len(w.shape) == 1
+        scaled_w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        # Geometric frequency ladder from 1 down to 1/10000.
+        log_step = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        frequencies = torch.exp(torch.arange(half_dim, dtype=dtype) * -log_step)
+        angles = scaled_w.to(dtype)[:, None] * frequencies[None, :]
+        embedding = torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            embedding = torch.nn.functional.pad(embedding, (0, 1))
+        assert embedding.shape == (w.shape[0], embedding_dim)
+        return embedding
+
+    # Read-only accessors over private attributes. None of these attributes is assigned in `__init__`;
+    # presumably they are set at the start of `__call__` - TODO confirm.
+    @property
+    def guidance_scale(self):
+        """Return the stored guidance scale (`self._guidance_scale`)."""
+        return self._guidance_scale
+
+    @property
+    def guidance_rescale(self):
+        """Return the stored guidance rescale factor (`self._guidance_rescale`)."""
+        return self._guidance_rescale
+
+    @property
+    def clip_skip(self):
+        """Return the stored `clip_skip` value (`self._clip_skip`)."""
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    @property
+    def do_classifier_free_guidance(self):
+        """Return `True` when the stored guidance scale is strictly greater than 1."""
+        # return self._guidance_scale > 1 and self.transformer.config.time_cond_proj_dim is None
+        return self._guidance_scale > 1
+
+    @property
+    def cross_attention_kwargs(self):
+        """Return the stored cross-attention keyword arguments (`self._cross_attention_kwargs`)."""
+        return self._cross_attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        """Return the stored number of timesteps (`self._num_timesteps`)."""
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        """Return the stored interrupt flag (`self._interrupt`)."""
+        return self._interrupt
+
+ @torch.no_grad()
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
+ def __call__(
+ self,
+ prompt: Union[str, List[str]],
+ height: int,
+ width: int,
+ video_length: int,
+ name: Union[str, List[str]] = None,
+ data_type: str = "video",
+ num_inference_steps: int = 50,
+ timesteps: List[int] = None,
+ sigmas: List[float] = None,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ pixel_value_ref=None,
+ # ref_latents: Optional[torch.Tensor] = None,
+ # uncond_ref_latents: Optional[torch.Tensor] = None,
+ pixel_value_llava: Optional[torch.Tensor] = None,
+ uncond_pixel_value_llava: Optional[torch.Tensor] = None,
+ ip_cfg_scale: float = 0.0,
+ use_deepcache: int = 1,
+ num_videos_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+ latents: Optional[torch.Tensor] = None,
+ prompt_embeds: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
+ negative_attention_mask: Optional[torch.Tensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ guidance_rescale: float = 0.0,
+ clip_skip: Optional[int] = None,
+ callback_on_step_end: Optional[
+ Union[
+ Callable[[int, int, Dict], None],
+ PipelineCallback,
+ MultiPipelineCallbacks,
+ ]
+ ] = None,
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+ freqs_cis: Tuple[torch.Tensor, torch.Tensor] = None,
+ vae_ver: str = "88-4c-sd",
+ enable_tiling: bool = False,
+ n_tokens: Optional[int] = None,
+ video_val_flag: bool=False,
+ denoise_strength: float = 1.0,
+ mask = None,
+ embedded_guidance_scale: Optional[float] = None,
+ i2v_mode: bool = False,
+ i2v_condition_type: str = None,
+ i2v_stability: bool = True,
+ img_latents: Optional[torch.Tensor] = None,
+ semantic_images=None,
+ joint_pass = False,
+ cfg_star_rescale = False,
+ callback = None,
+ **kwargs,
+ ):
+ r"""
+ The call function to the pipeline for generation.
+
+ Args:
+ prompt (`str` or `List[str]`):
+ The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+ height (`int`):
+ The height in pixels of the generated image.
+ width (`int`):
+ The width in pixels of the generated image.
+ video_length (`int`):
+ The number of frames in the generated video.
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference.
+ timesteps (`List[int]`, *optional*):
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+ passed will be used. Must be in descending order.
+ sigmas (`List[float]`, *optional*):
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+ will be used.
+ guidance_scale (`float`, *optional*, defaults to 7.5):
+ A higher guidance scale value encourages the model to generate images closely linked to the text
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+ ref_latents (`torch.Tensor`, *optional*):
+ The image tensor for time-concat.
+ uncond_ref_latents (`torch.Tensor`, *optional*):
+ The image tensor for time-concat. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+ less than `1`).
+ pixel_value_llava (`torch.Tensor`, *optional*):
+ The image tensor for llava.
+ uncond_pixel_value_llava (`torch.Tensor`, *optional*):
+ The image tensor for llava. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+ less than `1`).
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ eta (`float`, *optional*, defaults to 0.0):
+ Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+ to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+ generation deterministic.
+ latents (`torch.Tensor`, *optional*):
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+ tensor is generated by sampling using the supplied random `generator`.
+ prompt_embeds (`torch.Tensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+ provided, text embeddings are generated from the `prompt` input argument.
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+ not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+
+ output_type (`str`, *optional*, defaults to `"pil"`):
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`HunyuanVideoPipelineOutput`] instead of a
+ plain tuple.
+ cross_attention_kwargs (`dict`, *optional*):
+ A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+ [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
+ Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
+ using zero terminal SNR.
+ clip_skip (`int`, *optional*):
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+ the output of the pre-final layer will be used for computing the prompt embeddings.
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+ each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+ `._callback_tensor_inputs` attribute of your pipeline class.
+
+ Examples:
+
+ Returns:
+ [`~HunyuanVideoPipelineOutput`] or `tuple`:
+ If `return_dict` is `True`, [`HunyuanVideoPipelineOutput`] is returned,
+ otherwise a `tuple` is returned where the first element is a list with the generated images and the
+ second element is a list of `bool`s indicating whether the corresponding generated image contains
+ "not-safe-for-work" (nsfw) content.
+ """
+ callback_steps = kwargs.pop("callback_steps", None)
+
+ # if callback is not None:
+ # deprecate(
+ # "callback",
+ # "1.0.0",
+ # "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+ # )
+ # if callback_steps is not None:
+ # deprecate(
+ # "callback_steps",
+ # "1.0.0",
+ # "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+ # )
+
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+ if pixel_value_ref != None:
+ pixel_value_ref = pixel_value_ref * 2 - 1.
+ pixel_value_ref_for_vae = rearrange(pixel_value_ref,"b c h w -> b c 1 h w")
+
+ ref_latents = self.vae.encode(pixel_value_ref_for_vae.clone()).latent_dist.sample()
+ uncond_ref_latents = self.vae.encode(torch.ones_like(pixel_value_ref_for_vae)).latent_dist.sample()
+ ref_latents.mul_(self.vae.config.scaling_factor)
+ uncond_ref_latents.mul_(self.vae.config.scaling_factor)
+ else:
+ ref_latents = None
+ uncond_ref_latents = None
+
+
+ # 0. Default height and width to unet
+ # height = height or self.transformer.config.sample_size * self.vae_scale_factor
+ # width = width or self.transformer.config.sample_size * self.vae_scale_factor
+ # to deal with lora scaling and other possible forward hooks
+ trans = self.transformer
+ if trans.enable_teacache:
+ teacache_multiplier = trans.teacache_multiplier
+ trans.accumulated_rel_l1_distance = 0
+ trans.rel_l1_thresh = 0.1 if teacache_multiplier < 2 else 0.15
+ # trans.teacache_start_step = int(tea_cache_start_step_perc*num_inference_steps/100)
+ # 1. Check inputs. Raise error if not correct
+ self.check_inputs(
+ prompt,
+ height,
+ width,
+ video_length,
+ callback_steps,
+ negative_prompt,
+ pixel_value_llava,
+ uncond_pixel_value_llava,
+ prompt_embeds,
+ negative_prompt_embeds,
+ callback_on_step_end_tensor_inputs,
+ vae_ver=vae_ver,
+ )
+
+ self._guidance_scale = guidance_scale
+ self._guidance_rescale = guidance_rescale
+ self._clip_skip = clip_skip
+ self._cross_attention_kwargs = cross_attention_kwargs
+ self._interrupt = False
+
+ # 2. Define call parameters
+ if prompt is not None and isinstance(prompt, str):
+ batch_size = 1
+ elif prompt is not None and isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ batch_size = prompt_embeds.shape[0]
+
+ device = torch.device(f"cuda:{dist.get_rank()}") if dist.is_initialized() else self._execution_device
+
+ # 3. Encode input prompt
+ lora_scale = (
+ self.cross_attention_kwargs.get("scale", None)
+ if self.cross_attention_kwargs is not None
+ else None
+ )
+
+ (
+ prompt_embeds,
+ negative_prompt_embeds,
+ prompt_mask,
+ negative_prompt_mask,
+ ) = self.encode_prompt(
+ prompt,
+ name,
+ device,
+ num_videos_per_prompt,
+ self.do_classifier_free_guidance,
+ negative_prompt,
+ pixel_value_llava=pixel_value_llava,
+ uncond_pixel_value_llava=uncond_pixel_value_llava,
+ prompt_embeds=prompt_embeds,
+ attention_mask=attention_mask,
+ negative_prompt_embeds=negative_prompt_embeds,
+ negative_attention_mask=negative_attention_mask,
+ lora_scale=lora_scale,
+ clip_skip=self.clip_skip,
+ data_type=data_type,
+ semantic_images=semantic_images
+ )
+ if self.text_encoder_2 is not None:
+ (
+ prompt_embeds_2,
+ negative_prompt_embeds_2,
+ prompt_mask_2,
+ negative_prompt_mask_2,
+ ) = self.encode_prompt(
+ prompt,
+ name,
+ device,
+ num_videos_per_prompt,
+ self.do_classifier_free_guidance,
+ negative_prompt,
+ prompt_embeds=None,
+ attention_mask=None,
+ negative_prompt_embeds=None,
+ negative_attention_mask=None,
+ lora_scale=lora_scale,
+ clip_skip=self.clip_skip,
+ text_encoder=self.text_encoder_2,
+ data_type=data_type,
+ )
+ else:
+ prompt_embeds_2 = None
+ negative_prompt_embeds_2 = None
+ prompt_mask_2 = None
+ negative_prompt_mask_2 = None
+
+ # For classifier free guidance, we need to do two forward passes.
+ # Here we concatenate the unconditional and text embeddings into a single batch
+ # to avoid doing two forward passes
+ if self.do_classifier_free_guidance:
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+ if prompt_mask is not None:
+ prompt_mask = torch.cat([negative_prompt_mask, prompt_mask])
+ if prompt_embeds_2 is not None:
+ prompt_embeds_2 = torch.cat([negative_prompt_embeds_2, prompt_embeds_2])
+ if prompt_mask_2 is not None:
+ prompt_mask_2 = torch.cat([negative_prompt_mask_2, prompt_mask_2])
+
+ if self.do_classifier_free_guidance:
+ if ref_latents is not None:
+ ref_latents = torch.cat([ref_latents, ref_latents], dim=0)
+ if prompt_mask[0].sum() > 575:
+ prompt_mask[0] = torch.cat([torch.ones((1, prompt_mask[0].sum() - 575)).to(prompt_mask),
+ torch.zeros((1, prompt_mask.shape[1] - prompt_mask[0].sum() + 575)).to(prompt_mask)], dim=1)
+
+ if ip_cfg_scale>0:
+ prompt_embeds = torch.cat([prompt_embeds, prompt_embeds[1:]])
+ prompt_embeds_2 = torch.cat([prompt_embeds_2, prompt_embeds_2[1:]])
+ prompt_mask = torch.cat([prompt_mask, prompt_mask[1:]], dim=0)
+ ref_latents = torch.cat([uncond_ref_latents, uncond_ref_latents, ref_latents[1:]], dim=0)
+
+
+ # 4. Prepare timesteps
+ extra_set_timesteps_kwargs = self.prepare_extra_func_kwargs(
+ self.scheduler.set_timesteps, {"n_tokens": n_tokens}
+ )
+ timesteps, num_inference_steps = retrieve_timesteps(
+ self.scheduler,
+ num_inference_steps,
+ device,
+ timesteps,
+ sigmas,
+ **extra_set_timesteps_kwargs,
+ )
+
+ if "884" in vae_ver:
+ video_length = (video_length - 1) // 4 + 1
+ elif "888" in vae_ver:
+ video_length = (video_length - 1) // 8 + 1
+ else:
+ video_length = video_length
+
+ if self.transformer.mixed_precision:
+ latent_dtype = torch.float32
+ else:
+ latent_dtype = torch.bfloat16
+ if prompt_embeds != None:
+ prompt_embeds = prompt_embeds.to(torch.bfloat16)
+ if prompt_embeds_2 != None:
+ prompt_embeds_2 = prompt_embeds_2.to(torch.bfloat16)
+ # if prompt_mask != None:
+ # prompt_mask = prompt_mask.to(torch.bfloat16)
+ # 5. Prepare latent variables
+ num_channels_latents = self.transformer.config.in_channels
+ latents, original_latents, noise, timesteps = self.prepare_latents(
+ batch_size * num_videos_per_prompt,
+ num_channels_latents,
+ num_inference_steps,
+ height,
+ width,
+ video_length,
+ latent_dtype, #prompt_embeds.dtype,
+ device,
+ timesteps,
+ generator,
+ latents,
+ denoise_strength,
+ img_latents=img_latents,
+ i2v_mode=i2v_mode,
+ i2v_condition_type=i2v_condition_type,
+ i2v_stability=i2v_stability
+ )
+
+ if i2v_mode and i2v_condition_type == "latent_concat":
+ if img_latents.shape[2] == 1:
+ img_latents_concat = img_latents.repeat(1, 1, video_length, 1, 1)
+ else:
+ img_latents_concat = img_latents
+ img_latents_concat[:, :, 1:, ...] = 0
+
+ i2v_mask = torch.zeros(video_length)
+ i2v_mask[0] = 1
+
+ mask_concat = torch.ones(img_latents_concat.shape[0], 1, img_latents_concat.shape[2], img_latents_concat.shape[3],
+ img_latents_concat.shape[4]).to(device=img_latents.device)
+ mask_concat[:, :, 1:, ...] = 0
+
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+ extra_step_kwargs = self.prepare_extra_func_kwargs(
+ self.scheduler.step,
+ {"generator": generator, "eta": eta},
+ )
+
+ vae_precision = "fp16" # torch.float16
+ precision = "bf16" # torch.bfloat16
+
+ disable_autocast = True
+
+ target_dtype = PRECISION_TO_TYPE[precision]
+ autocast_enabled = target_dtype != torch.float32 and not disable_autocast
+ vae_dtype = PRECISION_TO_TYPE[vae_precision]
+ vae_autocast_enabled = vae_dtype != torch.float32 and not disable_autocast
+
+ # 7. Denoising loop
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+ self._num_timesteps = len(timesteps)
+ start_scale = ip_cfg_scale # 3.0
+ end_scale = 1.0
+ step_scale = (start_scale - end_scale) / (self._num_timesteps - 1 + 1e-3)
+
+ # print('sigmas used in generation:', self.scheduler.sigmas)
+ # print('inference timesteps used in generation:', timesteps)
+
+
+ # 8. Mask latents
+ mask_latents = None
+ if mask is not None:
+ target_video_length = mask.shape[0]
+ target_height = mask.shape[1]
+ target_width = mask.shape[2]
+
+ mask_length = (target_video_length - 1) // 4 + 1
+ mask_height = target_height // 8
+ mask_width = target_width // 8
+
+ mask = mask[...,0:1]
+ mask = mask.unsqueeze(0)
+ mask = rearrange(mask, "b t h w c -> b c t h w")
+
+ mask_latents = torch.nn.functional.interpolate(mask, size=(mask_length, mask_height, mask_width))
+ mask_latents = mask_latents.to(device)
+
+ if mask_latents is not None:
+ mask_latents_model_input = (
+ torch.cat([mask_latents] * 2)
+ if self.do_classifier_free_guidance
+ else mask_latents
+ )
+ print(f'maskinfo, mask={mask.shape}, mask_latents_model_input={mask_latents_model_input.shape} ')
+
+
+ if callback != None:
+ callback(-1, None, True)
+
+ load_latent = True
+ load_latent = False
+
+ multi_passes_free_guidance = not joint_pass
+ if load_latent:
+ timesteps = []
+
+ latent_items = 2 if self.do_classifier_free_guidance else 1
+ if ip_cfg_scale>0:
+ latent_items += 1
+
+ if self.transformer.enable_teacache:
+ self.transformer.previous_residual = [None] * latent_items
+
+ # if is_progress_bar:
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
+ for i, t in enumerate(timesteps):
+ offload.set_step_no_for_lora(self.transformer, i)
+ if self.interrupt:
+ continue
+ if i2v_mode and i2v_condition_type == "token_replace":
+ latents = torch.concat([img_latents, latents[:, :, 1:, :, :]], dim=2)
+
+ # expand the latents if we are doing classifier free guidance
+ if i2v_mode and i2v_condition_type == "latent_concat":
+ latent_model_input = torch.concat([latents, img_latents_concat, mask_concat], dim=1)
+ else:
+ latent_model_input = latents
+
+ latent_model_input = torch.cat([latent_model_input] * latent_items) if latent_items > 1 else latent_model_input
+
+ latent_model_input = self.scheduler.scale_model_input(
+ latent_model_input, t
+ )
+
+ if mask_latents is not None:
+ original_latents_noise = original_latents * (1 - t / 1000.0) + t / 1000.0 * noise
+ original_latent_noise_model_input = (
+ torch.cat([original_latents_noise] * 2)
+ if self.do_classifier_free_guidance
+ else original_latents_noise
+ )
+ original_latent_noise_model_input = self.scheduler.scale_model_input(original_latent_noise_model_input, t)
+ latent_model_input = mask_latents_model_input * latent_model_input + (1 - mask_latents_model_input) * original_latent_noise_model_input
+
+ t_expand = t.repeat(latent_model_input.shape[0])
+ guidance_expand = (
+ torch.tensor(
+ [embedded_guidance_scale] * latent_model_input.shape[0],
+ dtype=torch.float32,
+ device=device,
+ ).to(latent_dtype)
+ * 1000.0
+ if embedded_guidance_scale is not None
+ else None
+ )
+
+ # predict the noise residual
+ with torch.autocast(
+ device_type="cuda", dtype=target_dtype, enabled=autocast_enabled
+ ):
+
+ if self.do_classifier_free_guidance and multi_passes_free_guidance:
+ for j in range(len(latent_model_input)):
+ ret = self.transformer( # For an input image (129, 192, 336) (1, 256, 256)
+ latent_model_input[j].unsqueeze(0), # [2, 16, 33, 24, 42]
+ t_expand[j].unsqueeze(0), # [2]
+ text_states=prompt_embeds[j].unsqueeze(0), # [2, 256, 4096]
+ text_mask=prompt_mask[j].unsqueeze(0), # [2, 256]
+ text_states_2=prompt_embeds_2[j].unsqueeze(0), # [2, 768]
+ ref_latents=ref_latents[j].unsqueeze(0),
+ freqs_cos=freqs_cis[0], # [seqlen, head_dim]
+ freqs_sin=freqs_cis[1], # [seqlen, head_dim]
+ guidance=guidance_expand,
+ pipeline=self,
+ x_id=j,
+ callback = callback,
+ )
+ if self._interrupt:
+ return [None]
+ if j==0:
+ noise_pred_uncond= ret[0]
+ elif j==1:
+ noise_pred_text= ret[0]
+ else:
+ noise_pred_ip = ret[0]
+ ret = None
+ else:
+ # if self.do_classifier_free_guidance:
+ # noise_pred_uncond = self.transformer(latent_model_input[:1], t_expand[:1], ref_latents=ref_latents[:1], text_states=prompt_embeds[:1], text_mask=prompt_mask[:1], text_states_2=prompt_embeds_2[:1], freqs_cos=freqs_cis[0],freqs_sin=freqs_cis[1], guidance=guidance_expand,return_dict=True)['x']
+ # noise_pred_text = self.transformer(latent_model_input[1:], t_expand[1:], ref_latents=ref_latents[1:], text_states=prompt_embeds[1:], text_mask=prompt_mask[1:], text_states_2=prompt_embeds_2[1:], freqs_cos=freqs_cis[0],freqs_sin=freqs_cis[1], guidance=guidance_expand,return_dict=True)['x']
+ # noise_pred = torch.cat([noise_pred_uncond, noise_pred_text], dim=0)
+ # else:
+ ret = self.transformer( # For an input image (129, 192, 336) (1, 256, 256)
+ latent_model_input, # [2, 16, 33, 24, 42]
+ t_expand, # [2]
+ text_states=prompt_embeds, # [2, 256, 4096]
+ text_mask=prompt_mask, # [2, 256]
+ text_states_2=prompt_embeds_2, # [2, 768]
+ ref_latents=ref_latents,
+ freqs_cos=freqs_cis[0], # [seqlen, head_dim]
+ freqs_sin=freqs_cis[1], # [seqlen, head_dim]
+ guidance=guidance_expand,
+ pipeline=self,
+ callback = callback,
+ )
+ if self._interrupt:
+ return [None]
+ if self.do_classifier_free_guidance :
+ if ip_cfg_scale > 0:
+ noise_pred_uncond, noise_pred_text, noise_pred_ip = ret
+ else:
+ noise_pred_uncond, noise_pred_text = noise_pred = ret
+ else:
+ noise_pred = ret[0]
+
+ # perform guidance
+ if self.do_classifier_free_guidance:
+ if cfg_star_rescale:
+ batch_size = noise_pred_text.shape[0]
+ positive_flat = noise_pred_text.view(batch_size, -1)
+ negative_flat = noise_pred_uncond.view(batch_size, -1)
+ dot_product = torch.sum(
+ positive_flat * negative_flat, dim=1, keepdim=True
+ )
+ squared_norm = torch.sum(negative_flat**2, dim=1, keepdim=True) + 1e-8
+ positive_flat, negative_flat = None, None
+ alpha = dot_product / squared_norm
+ noise_pred_uncond *= alpha
+
+ if ip_cfg_scale > 0:
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + start_scale * (noise_pred_ip-noise_pred_text)
+ start_scale -= step_scale
+ if i==0:
+ print(f'i={i}, noise_pred shape={noise_pred.shape}')
+ else:
+ noise_pred = noise_pred_uncond + self.guidance_scale * ( noise_pred_text - noise_pred_uncond)
+
+
+ if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+ noise_pred = rescale_noise_cfg( noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale, )
+
+ # compute the previous noisy sample x_t -> x_t-1
+ if i2v_mode and i2v_condition_type == "token_replace":
+ latents = self.scheduler.step(
+ noise_pred[:, :, 1:, :, :], t, latents[:, :, 1:, :, :], **extra_step_kwargs, return_dict=False
+ )[0]
+ latents = torch.concat(
+ [img_latents, latents], dim=2
+ )
+ else:
+ latents = self.scheduler.step(
+ noise_pred, t, latents, **extra_step_kwargs, return_dict=False
+ )[0]
+
+
+ noise_pred_uncond, noise_pred_text, noise_pred, noise_pred_ip, ret = None, None, None, None, None
+
+ if callback is not None:
+ callback(i, latents.squeeze(0), False)
+
+ if self.interrupt:
+ return [None]
+
+ # if load_latent:
+ # latents = torch.load("latent.pt")
+ # else:
+ # torch.save(latents, "latent.pt")
+
+
+ if mask_latents is not None:
+ latents = mask_latents * latents + (1 - mask_latents) * original_latents
+
+ if not output_type == "latent":
+ expand_temporal_dim = False
+ if len(latents.shape) == 4:
+ if isinstance(self.vae, AutoencoderKLCausal3D):
+ latents = latents.unsqueeze(2)
+ expand_temporal_dim = True
+ elif len(latents.shape) == 5:
+ pass
+ else:
+ raise ValueError(
+ f"Only support latents with shape (b, c, h, w) or (b, c, f, h, w), but got {latents.shape}."
+ )
+
+ if (
+ hasattr(self.vae.config, "shift_factor")
+ and self.vae.config.shift_factor
+ ):
+ latents = (
+ latents / self.vae.config.scaling_factor
+ + self.vae.config.shift_factor
+ )
+ else:
+ latents = latents / self.vae.config.scaling_factor
+
+ with torch.autocast(
+ device_type="cuda", dtype=vae_dtype, enabled=vae_autocast_enabled
+ ):
+ if enable_tiling:
+ self.vae.enable_tiling()
+ image = self.vae.decode(
+ latents, return_dict=False, generator=generator
+ )[0]
+ else:
+ image = self.vae.decode(
+ latents, return_dict=False, generator=generator
+ )[0]
+
+ if expand_temporal_dim or image.shape[2] == 1:
+ image = image.squeeze(2)
+
+ else:
+ image = latents
+
+ image = (image / 2 + 0.5).clamp(0, 1)
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+ image = image.cpu().float()
+
+ if i2v_mode and i2v_condition_type == "latent_concat":
+ image = image[:, :, 4:, :, :]
+
+ # Offload all models
+ self.maybe_free_model_hooks()
+
+ if not return_dict:
+ return image
+
+ return HunyuanVideoPipelineOutput(videos=image)
diff --git a/hyvideo/diffusion/schedulers/__init__.py b/hyvideo/diffusion/schedulers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..14f2ba33feb0a1a802a9a86818781a2a15140bd6
--- /dev/null
+++ b/hyvideo/diffusion/schedulers/__init__.py
@@ -0,0 +1 @@
+from .scheduling_flow_match_discrete import FlowMatchDiscreteScheduler
diff --git a/hyvideo/diffusion/schedulers/scheduling_flow_match_discrete.py b/hyvideo/diffusion/schedulers/scheduling_flow_match_discrete.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce4c0f2b1736739fd8da9612b74fcdd49a36e349
--- /dev/null
+++ b/hyvideo/diffusion/schedulers/scheduling_flow_match_discrete.py
@@ -0,0 +1,255 @@
+# Copyright 2024 Stability AI, Katherine Crowson and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Modified from diffusers==0.29.2
+#
+# ==============================================================================
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.utils import BaseOutput, logging
+from diffusers.schedulers.scheduling_utils import SchedulerMixin
+
+
+
+ @dataclass
+ class FlowMatchDiscreteSchedulerOutput(BaseOutput):
+ """
+ Output class for the scheduler's `step` function output.
+
+ Args:
+ prev_sample (`torch.FloatTensor`, normally the same shape as the `sample` passed to `step`):
+ Computed sample `(x_{t-1})` of previous timestep, in float32 (`step` upcasts before integrating).
+ `prev_sample` should be used as next model input in the denoising loop.
+ """
+
+ prev_sample: torch.FloatTensor
+
+
+class FlowMatchDiscreteScheduler(SchedulerMixin, ConfigMixin):
+ """
+ Euler scheduler.
+
+ This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+ methods the library implements for all schedulers such as loading and saving.
+
+ Args:
+ num_train_timesteps (`int`, defaults to 1000):
+ The number of diffusion steps to train the model.
+ timestep_spacing (`str`, defaults to `"linspace"`):
+ The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+ Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+ shift (`float`, defaults to 1.0):
+ The shift value for the timestep schedule.
+ reverse (`bool`, defaults to `True`):
+ Whether to reverse the timestep schedule.
+ """
+
+ _compatibles = []
+ order = 1
+
+ @register_to_config
+ def __init__(
+ self,
+ num_train_timesteps: int = 1000,
+ shift: float = 1.0,
+ reverse: bool = True,
+ solver: str = "euler",
+ n_tokens: Optional[int] = None,
+ ):
+ sigmas = torch.linspace(1, 0, num_train_timesteps + 1)
+
+ if not reverse:
+ sigmas = sigmas.flip(0)
+
+ self.sigmas = sigmas
+ # the value fed to model
+ self.timesteps = (sigmas[:-1] * num_train_timesteps).to(dtype=torch.float32)
+
+ self._step_index = None
+ self._begin_index = None
+
+ self.supported_solver = ["euler"]
+ if solver not in self.supported_solver:
+ raise ValueError(
+ f"Solver {solver} not supported. Supported solvers: {self.supported_solver}"
+ )
+
+ @property
+ def step_index(self):
+ """
+ The index counter for current timestep. It will increase 1 after each scheduler step.
+ """
+ return self._step_index
+
+ @property
+ def begin_index(self):
+ """
+ The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+ """
+ return self._begin_index
+
+ # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+ def set_begin_index(self, begin_index: int = 0):
+ """
+ Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
+
+ Args:
+ begin_index (`int`):
+ The begin index for the scheduler.
+ """
+ self._begin_index = begin_index
+
+ def _sigma_to_t(self, sigma):
+ return sigma * self.config.num_train_timesteps
+
+ def set_timesteps(
+ self,
+ num_inference_steps: int,
+ device: Union[str, torch.device] = None,
+ n_tokens: int = None,
+ ):
+ """
+ Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+ Args:
+ num_inference_steps (`int`):
+ The number of diffusion steps used when generating samples with a pre-trained model.
+ device (`str` or `torch.device`, *optional*):
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+ n_tokens (`int`, *optional*):
+ Number of tokens in the input sequence.
+ """
+ self.num_inference_steps = num_inference_steps
+
+ sigmas = torch.linspace(1, 0, num_inference_steps + 1)
+ sigmas = self.sd3_time_shift(sigmas)
+
+ if not self.config.reverse:
+ sigmas = 1 - sigmas
+
+ self.sigmas = sigmas
+ self.timesteps = (sigmas[:-1] * self.config.num_train_timesteps).to(
+ dtype=torch.float32, device=device
+ )
+
+ # Reset step index
+ self._step_index = None
+
+ def index_for_timestep(self, timestep, schedule_timesteps=None):
+ if schedule_timesteps is None:
+ schedule_timesteps = self.timesteps
+
+ indices = (schedule_timesteps == timestep).nonzero()
+
+ # The sigma index that is taken for the **very** first `step`
+ # is always the second index (or the last index if there is only 1)
+ # This way we can ensure we don't accidentally skip a sigma in
+ # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+ pos = 1 if len(indices) > 1 else 0
+
+ return indices[pos].item()
+
+ def _init_step_index(self, timestep):
+ if self.begin_index is None:
+ if isinstance(timestep, torch.Tensor):
+ timestep = timestep.to(self.timesteps.device)
+ self._step_index = self.index_for_timestep(timestep)
+ else:
+ self._step_index = self._begin_index
+
+ def scale_model_input(
+ self, sample: torch.Tensor, timestep: Optional[int] = None
+ ) -> torch.Tensor:
+ return sample
+
+ def sd3_time_shift(self, t: torch.Tensor):
+ return (self.config.shift * t) / (1 + (self.config.shift - 1) * t)
+
+ def step(
+ self,
+ model_output: torch.FloatTensor,
+ timestep: Union[float, torch.FloatTensor],
+ sample: torch.FloatTensor,
+ return_dict: bool = True,
+ ) -> Union[FlowMatchDiscreteSchedulerOutput, Tuple]:
+ """
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+ process from the learned model outputs (most often the predicted noise).
+
+ Args:
+ model_output (`torch.FloatTensor`):
+ The direct output from learned diffusion model.
+ timestep (`float`):
+ The current discrete timestep in the diffusion chain.
+ sample (`torch.FloatTensor`):
+ A current instance of a sample created by the diffusion process.
+ generator (`torch.Generator`, *optional*):
+ A random number generator.
+ n_tokens (`int`, *optional*):
+ Number of tokens in the input sequence.
+ return_dict (`bool`):
+ Whether or not to return a [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or
+ tuple.
+
+ Returns:
+ [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or `tuple`:
+ If return_dict is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is
+ returned, otherwise a tuple is returned where the first element is the sample tensor.
+ """
+
+ if (
+ isinstance(timestep, int)
+ or isinstance(timestep, torch.IntTensor)
+ or isinstance(timestep, torch.LongTensor)
+ ):
+ raise ValueError(
+ (
+ "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
+ " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
+ " one of the `scheduler.timesteps` as a timestep."
+ ),
+ )
+
+ if self.step_index is None:
+ self._init_step_index(timestep)
+
+ # Upcast to avoid precision issues when computing prev_sample
+ sample = sample.to(torch.float32)
+
+ dt = self.sigmas[self.step_index + 1] - self.sigmas[self.step_index]
+
+ if self.config.solver == "euler":
+ prev_sample = sample + model_output.to(torch.float32) * dt
+ else:
+ raise ValueError(
+ f"Solver {self.config.solver} not supported. Supported solvers: {self.supported_solver}"
+ )
+
+ # upon completion increase step index by one
+ self._step_index += 1
+
+ if not return_dict:
+ return (prev_sample,)
+
+ return FlowMatchDiscreteSchedulerOutput(prev_sample=prev_sample)
+
+ def __len__(self):
+ return self.config.num_train_timesteps
diff --git a/hyvideo/hunyuan.py b/hyvideo/hunyuan.py
new file mode 100644
index 0000000000000000000000000000000000000000..63bcdae23afb4f2e1171927f7aefc565f419aa5b
--- /dev/null
+++ b/hyvideo/hunyuan.py
@@ -0,0 +1,830 @@
+import os
+import time
+import random
+import functools
+from typing import List, Optional, Tuple, Union
+
+from pathlib import Path
+
+import torch
+import torch.distributed as dist
+from hyvideo.constants import PROMPT_TEMPLATE, NEGATIVE_PROMPT, PRECISION_TO_TYPE, NEGATIVE_PROMPT_I2V
+from hyvideo.vae import load_vae
+from hyvideo.modules import load_model
+from hyvideo.text_encoder import TextEncoder
+from hyvideo.utils.data_utils import align_to, get_closest_ratio, generate_crop_size_list
+from hyvideo.modules.posemb_layers import get_nd_rotary_pos_embed, get_nd_rotary_pos_embed_new
+from hyvideo.diffusion.schedulers import FlowMatchDiscreteScheduler
+from hyvideo.diffusion.pipelines import HunyuanVideoPipeline
+from PIL import Image
+import numpy as np
+import torchvision.transforms as transforms
+import cv2
+
def pad_image(crop_img, size, color=(255, 255, 255), resize_ratio=1):
    """Resize an image to fit inside `size` and pad it with a constant border to exactly `size`.

    Args:
        crop_img: Image array; only `shape[:2]` (height, width) is read here.
        size: Target `(width, height)`.
        color: Border value handed to `cv2.copyMakeBorder`.
        resize_ratio: Fraction of the limiting target dimension the resized
            image should occupy before padding.

    Returns:
        The resized image surrounded by a constant border so that it measures
        `size`; any odd leftover pixel goes to the right/bottom side.
    """
    src_h, src_w = crop_img.shape[:2]
    dst_w, dst_h = size
    height_scale, width_scale = dst_h / src_h, dst_w / src_w

    if width_scale > height_scale:
        # Height is the limiting axis: fix it, derive the width (truncated to int).
        new_h = int(dst_h * resize_ratio)
        new_w = int(src_w / src_h * new_h)
    else:
        # Width is the limiting axis: fix it, derive the height (truncated to int).
        new_w = int(dst_w * resize_ratio)
        new_h = int(src_h / src_w * new_w)

    resized = cv2.resize(crop_img, (new_w, new_h))

    # Split the leftover space between the two sides of each axis.
    spare_w = dst_w - new_w
    spare_h = dst_h - new_h
    left = spare_w // 2
    top = spare_h // 2
    right = spare_w - left
    bottom = spare_h - top

    return cv2.copyMakeBorder(resized, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
+
+
+
+
def _merge_input_ids_with_image_features(self, image_features, inputs_embeds, input_ids, attention_mask, labels):
    """Splice image features into the text embeddings at the image-token positions.

    `Inference.from_pretrained` installs this function as
    `LlavaForConditionalGeneration._merge_input_ids_with_image_features`.

    Args:
        image_features: Tensor unpacked as (num_images, num_image_patches, embed_dim).
        inputs_embeds: Text embeddings, indexed as [batch, position].
        input_ids: Token ids, unpacked as (batch_size, sequence_length).
        attention_mask: Mask indexed as [batch, position].
        labels: Optional labels indexed as [batch, position]; may be None.

    Returns:
        Tuple `(final_embedding, final_attention_mask, final_labels, position_ids)`.
        `final_labels` is None when `labels` is None.

    Raises:
        ValueError: If the number of slots reserved for image features differs
            from the number of image feature vectors provided.
    """
    num_images, num_image_patches, embed_dim = image_features.shape
    batch_size, sequence_length = input_ids.shape
    # The batch is treated as left-padded when no row ends with the pad token.
    left_padding = not torch.sum(input_ids[:, -1] == torch.tensor(self.pad_token_id))
    # 1. Create a mask to know where special image tokens are
    special_image_token_mask = input_ids == self.config.image_token_index
    num_special_image_tokens = torch.sum(special_image_token_mask, dim=-1)
    # Compute the maximum merged sequence length: every image token grows from
    # one position to `num_image_patches` positions.
    max_embed_dim = (num_special_image_tokens.max() * (num_image_patches - 1)) + sequence_length
    batch_indices, non_image_indices = torch.where(input_ids != self.config.image_token_index)

    # 2. Compute the positions where text should be written
    # Calculate new positions for text tokens in merged image-text sequence.
    # `special_image_token_mask` identifies image tokens. Each image token occupies `num_image_patches`
    # positions in the merged sequence, i.e. it adds `num_image_patches - 1` extra positions.
    # `torch.cumsum` computes how each image token shifts subsequent text token positions.
    # - 1 to adjust for zero-based indexing, as `cumsum` inherently increases indices by one.
    new_token_positions = torch.cumsum((special_image_token_mask * (num_image_patches - 1) + 1), -1) - 1
    # Number of unused slots per row (rows with fewer image tokens than the batch maximum).
    nb_image_pad = max_embed_dim - 1 - new_token_positions[:, -1]
    if left_padding:
        new_token_positions += nb_image_pad[:, None]  # offset for left padding
    text_to_overwrite = new_token_positions[batch_indices, non_image_indices]

    # 3. Create the full embedding, already padded to the maximum position
    final_embedding = torch.zeros(
        batch_size, max_embed_dim, embed_dim, dtype=inputs_embeds.dtype, device=inputs_embeds.device
    )
    final_attention_mask = torch.zeros(
        batch_size, max_embed_dim, dtype=attention_mask.dtype, device=inputs_embeds.device
    )
    if labels is not None:
        final_labels = torch.full(
            (batch_size, max_embed_dim), self.config.ignore_index, dtype=input_ids.dtype, device=input_ids.device
        )
    # In case the Vision model or the Language model has been offloaded to CPU, we need to manually
    # set the corresponding tensors into their correct target device.
    target_device = inputs_embeds.device
    batch_indices, non_image_indices, text_to_overwrite = (
        batch_indices.to(target_device),
        non_image_indices.to(target_device),
        text_to_overwrite.to(target_device),
    )
    attention_mask = attention_mask.to(target_device)

    # 4. Fill the embeddings based on the mask. If we have ["hey" "<image>", "how", "are"]
    # we need to index copy on [0, 577, 578, 579] for the text and [1:576] for the image features
    final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_image_indices]
    final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_image_indices]
    if labels is not None:
        final_labels[batch_indices, text_to_overwrite] = labels[batch_indices, non_image_indices]

    # 5. Fill the embeddings corresponding to the images. Anything that is not `text_positions` needs filling (#29835)
    image_to_overwrite = torch.full(
        (batch_size, max_embed_dim), True, dtype=torch.bool, device=inputs_embeds.device
    )
    image_to_overwrite[batch_indices, text_to_overwrite] = False
    # Drop the first `nb_image_pad` non-text slots of each row: those are padding, not image slots.
    image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device)

    if image_to_overwrite.sum() != image_features.shape[:-1].numel():
        raise ValueError(
            f"The input provided to the model are wrong. The number of image tokens is {torch.sum(special_image_token_mask)} while"
            f" the number of image given to the model is {num_images}. This prevents correct indexing and breaks batch generation."
        )

    final_embedding[image_to_overwrite] = image_features.contiguous().reshape(-1, embed_dim).to(target_device)
    final_attention_mask |= image_to_overwrite
    # Positions count attended tokens only; masked-out positions are filled with 1.
    position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_((final_attention_mask == 0), 1)

    # 6. Mask out the embedding at padding positions, as we later use the past_key_value value to determine the non-attended tokens.
    batch_indices, pad_indices = torch.where(input_ids == self.pad_token_id)
    indices_to_mask = new_token_positions[batch_indices, pad_indices]

    final_embedding[batch_indices, indices_to_mask] = 0

    if labels is None:
        final_labels = None

    return final_embedding, final_attention_mask, final_labels, position_ids
+
def patched_llava_forward(
    self,
    input_ids: torch.LongTensor = None,
    pixel_values: torch.FloatTensor = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[List[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    vision_feature_layer: Optional[int] = None,
    vision_feature_select_strategy: Optional[str] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
    num_logits_to_keep: int = 0,
):
    """Replacement `forward` for `LlavaForConditionalGeneration`.

    `Inference.from_pretrained` assigns this function to the transformers class.
    When `pixel_values` is given, image features are computed and merged into
    the text embeddings through `self._merge_input_ids_with_image_features`;
    the merge also replaces `attention_mask`, `labels` and `position_ids`, and
    `cache_position` is rebuilt from the merged sequence length.

    No loss is computed here: `loss` is always None, so `labels` only flows
    through the merge step.

    Returns:
        A `LlavaCausalLMOutputWithPast` when `return_dict` is truthy, otherwise
        the tuple `(logits,) + outputs[1:]` from the language model.

    Raises:
        ValueError: If not exactly one of `input_ids` / `inputs_embeds` is
            given, or if both `pixel_values` and `inputs_embeds` are given.
    """
    from transformers.models.llava.modeling_llava import LlavaCausalLMOutputWithPast


    # Fall back to the model config for every option left unset by the caller.
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
    vision_feature_layer = (
        vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
    )
    vision_feature_select_strategy = (
        vision_feature_select_strategy
        if vision_feature_select_strategy is not None
        else self.config.vision_feature_select_strategy
    )

    # XOR is true when both are missing or both are provided.
    if (input_ids is None) ^ (inputs_embeds is not None):
        raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

    if pixel_values is not None and inputs_embeds is not None:
        raise ValueError(
            "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
        )

    if inputs_embeds is None:
        inputs_embeds = self.get_input_embeddings()(input_ids)

    image_features = None
    if pixel_values is not None:
        image_features = self.get_image_features(
            pixel_values=pixel_values,
            vision_feature_layer=vision_feature_layer,
            vision_feature_select_strategy=vision_feature_select_strategy,
        )


        # NOTE(review): the merge is assumed to be nested under `pixel_values is not None`
        # (it reads `image_features.shape`, which would fail on None) — confirm against the
        # original indentation.
        inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features(
            image_features, inputs_embeds, input_ids, attention_mask, labels
        )
        # The merged sequence is longer than the input, so the cache positions are rebuilt.
        cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)


    outputs = self.language_model(
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
        cache_position=cache_position,
        num_logits_to_keep=num_logits_to_keep,
    )

    logits = outputs[0]

    # Loss is never computed in this patched forward.
    loss = None

    if not return_dict:
        output = (logits,) + outputs[1:]
        return (loss,) + output if loss is not None else output

    return LlavaCausalLMOutputWithPast(
        loss=loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
        image_hidden_states=image_features if pixel_values is not None else None,
    )
+
class DataPreprocess(object):
    """Turn one reference image into the three tensors used for reference-conditioned generation."""

    def __init__(self):
        # Fixed square size the image is padded to before the LLaVA transform.
        self.llava_size = (336, 336)
        # NOTE(review): the mean/std constants look like the CLIP image statistics — confirm.
        self.llava_transform = transforms.Compose(
            [
                transforms.Resize(self.llava_size, interpolation=transforms.InterpolationMode.BILINEAR),
                transforms.ToTensor(),
                transforms.Normalize((0.48145466, 0.4578275, 0.4082107), (0.26862954, 0.26130258, 0.27577711)),
            ]
        )

    def get_batch(self, image, size):
        """Build the conditional, unconditional and reference tensors for `image`.

        Args:
            image: Anything `np.asarray` converts to an H x W x C array
                (presumably a PIL image — verify against the caller).
            size: Target `(width, height)` used when padding the reference image.

        Returns:
            Tuple `(pixel_value_llava, uncond_pixel_value_llava, pixel_value_ref)`,
            each with a leading dimension of size 1 added by `unsqueeze(0)`:
            the transformed padded image, the transformed all-255 image of the
            same size, and the padded reference image divided by 255 with its
            channel axis moved first.
        """
        image = np.asarray(image)
        llava_item_image = pad_image(image.copy(), self.llava_size)
        # Build the blank image as uint8 directly. The previous `np.ones_like(...) * 255`
        # inherited the padded image's dtype, which `Image.fromarray` rejects for
        # non-uint8 inputs even though the conditional image is cast to uint8.
        uncond_llava_item_image = np.full_like(llava_item_image, 255, dtype=np.uint8)
        cat_item_image = pad_image(image.copy(), size)

        llava_item_tensor = self.llava_transform(Image.fromarray(llava_item_image.astype(np.uint8)))
        uncond_llava_item_tensor = self.llava_transform(Image.fromarray(uncond_llava_item_image))
        cat_item_tensor = torch.from_numpy(cat_item_image.copy()).permute((2, 0, 1)) / 255.0

        return llava_item_tensor.unsqueeze(0), uncond_llava_item_tensor.unsqueeze(0), cat_item_tensor.unsqueeze(0)
+
class Inference(object):
    """Container for the components of a Hunyuan video model (transformer, VAE, text encoders).

    Use `from_pretrained` to build the components from checkpoint files.
    """

    def __init__(
        self,
        i2v,
        enable_cfg,
        vae,
        vae_kwargs,
        text_encoder,
        model,
        text_encoder_2=None,
        pipeline=None,
        device=None,
    ):
        """Store the given components on the instance.

        Note: the `device` argument is accepted but not used; `self.device` is
        always set to "cuda".
        """
        self.i2v = i2v
        self.enable_cfg = enable_cfg
        self.vae = vae
        self.vae_kwargs = vae_kwargs

        self.text_encoder = text_encoder
        self.text_encoder_2 = text_encoder_2

        self.model = model
        self.pipeline = pipeline

        # Hard-coded: the `device` parameter is ignored.
        self.device = "cuda"



    @classmethod
    def from_pretrained(cls, model_filepath, text_encoder_filepath, dtype = torch.bfloat16, VAE_dtype = torch.float16, mixed_precision_transformer =torch.bfloat16 , **kwargs):
        """Build the transformer, VAE and text encoders and return `cls(...)`.

        Global side effects: replaces `LlavaForConditionalGeneration.forward` and
        `_merge_input_ids_with_image_features` in transformers, and disables
        autograd globally via `torch.set_grad_enabled(False)`.

        Args:
            model_filepath: Indexed with `[0]` to detect the model variant
                ("i2v" / "custom" substring); presumably a sequence of
                checkpoint paths — verify against the caller. Passed as-is to
                `offload.load_model_data`.
            text_encoder_filepath: Forwarded to the first `TextEncoder` as
                `text_encoder_path`.
            dtype: Not referenced in this method (precision is fixed to "bf16").
            VAE_dtype: `torch.float32` selects "fp32" VAE precision; any other
                value selects "bf16".
            mixed_precision_transformer: Stored on the model as
                `mixed_precision`; when truthy, `lock_layers_dtypes(torch.float32)`
                is called on the model.
            **kwargs: `pinToMemory` and `partialPinning` are popped and passed to
                `offload.load_model_data`; the rest is merged into the model's
                factory kwargs.
        """

        device = "cuda"

        import transformers
        transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.forward = patched_llava_forward # force legacy behaviour to be able to use transformers v>(4.47)
        transformers.models.llava.modeling_llava.LlavaForConditionalGeneration._merge_input_ids_with_image_features = _merge_input_ids_with_image_features

        torch.set_grad_enabled(False)
        text_len = 512
        latent_channels = 16
        precision = "bf16"
        vae_precision = "fp32" if VAE_dtype == torch.float32 else "bf16"
        embedded_cfg_scale = 6
        i2v_condition_type = None
        # The model variant is inferred from the first entry of `model_filepath`.
        i2v_mode = "i2v" in model_filepath[0]
        custom = False
        if i2v_mode:
            model_id = "HYVideo-T/2"
            i2v_condition_type = "token_replace"
        elif "custom" in model_filepath[0]:
            model_id = "HYVideo-T/2-custom"
            custom = True
        else:
            model_id = "HYVideo-T/2-cfgdistill"

        # `i2v_condition_type` is only ever None or "token_replace" above, so the
        # "latent_concat" branch is not reachable from this method as written.
        if i2v_mode and i2v_condition_type == "latent_concat":
            in_channels = latent_channels * 2 + 1
            image_embed_interleave = 2
        elif i2v_mode and i2v_condition_type == "token_replace":
            in_channels = latent_channels
            image_embed_interleave = 4
        else:
            in_channels = latent_channels
            image_embed_interleave = 1
        out_channels = latent_channels
        pinToMemory = kwargs.pop("pinToMemory", False)
        partialPinning = kwargs.pop("partialPinning", False)
        # The model is created on the "meta" device; weights are loaded afterwards by mmgp.
        factor_kwargs = kwargs | {"device": "meta", "dtype": PRECISION_TO_TYPE[precision]}

        if embedded_cfg_scale and i2v_mode:
            factor_kwargs["guidance_embed"] = True

        model = load_model(
            model = model_id,
            i2v_condition_type = i2v_condition_type,
            in_channels=in_channels,
            out_channels=out_channels,
            factor_kwargs=factor_kwargs,
        )


        from mmgp import offload
        # model = Inference.load_state_dict(args, model, model_filepath)

        # model_filepath ="c:/temp/hc/mp_rank_00_model_states.pt"
        offload.load_model_data(model, model_filepath, pinToMemory = pinToMemory, partialPinning = partialPinning)
        pass
        # offload.save_model(model, "hunyuan_video_custom_720_bf16.safetensors")
        # offload.save_model(model, "hunyuan_video_custom_720_quanto_bf16_int8.safetensors", do_quantize= True)

        model.mixed_precision = mixed_precision_transformer

        if model.mixed_precision :
            model._lock_dtype = torch.float32
            model.lock_layers_dtypes(torch.float32)
        # NOTE(review): `model.eval()` is assumed to run unconditionally (outside the
        # mixed-precision branch) — confirm against the original indentation.
        model.eval()

        # ============================= Build extra models ========================
        # VAE
        if custom:
            vae_configpath = "ckpts/hunyuan_video_custom_VAE_config.json"
            vae_filepath = "ckpts/hunyuan_video_custom_VAE_fp32.safetensors"
        else:
            vae_configpath = "ckpts/hunyuan_video_VAE_config.json"
            vae_filepath = "ckpts/hunyuan_video_VAE_fp32.safetensors"

        # config = AutoencoderKLCausal3D.load_config("ckpts/hunyuan_video_VAE_config.json")
        # config = AutoencoderKLCausal3D.load_config("c:/temp/hvae/config_vae.json")

        vae, _, s_ratio, t_ratio = load_vae( "884-16c-hy", vae_path= vae_filepath, vae_config_path= vae_configpath, vae_precision= vae_precision, device= "cpu", )

        vae._model_dtype = torch.float32 if VAE_dtype == torch.float32 else torch.bfloat16
        vae_kwargs = {"s_ratio": s_ratio, "t_ratio": t_ratio}
        # Classifier-free guidance is only enabled for the "custom" variant below.
        enable_cfg = False
        # Text encoder
        if i2v_mode:
            text_encoder = "llm-i2v"
            tokenizer = "llm-i2v"
            prompt_template = "dit-llm-encode-i2v"
            prompt_template_video = "dit-llm-encode-video-i2v"
        elif custom :
            text_encoder = "llm-i2v"
            tokenizer = "llm-i2v"
            prompt_template = "dit-llm-encode"
            prompt_template_video = "dit-llm-encode-video"
            enable_cfg = True
        else:
            text_encoder = "llm"
            tokenizer = "llm"
            prompt_template = "dit-llm-encode"
            prompt_template_video = "dit-llm-encode-video"

        # The template's `crop_start` (default 0) is added to `text_len` to get the
        # encoder `max_length`; the video template takes precedence.
        if prompt_template_video is not None:
            crop_start = PROMPT_TEMPLATE[prompt_template_video].get( "crop_start", 0 )
        elif prompt_template is not None:
            crop_start = PROMPT_TEMPLATE[prompt_template].get("crop_start", 0)
        else:
            crop_start = 0
        max_length = text_len + crop_start

        # prompt_template: replace the template name by its PROMPT_TEMPLATE entry
        prompt_template = PROMPT_TEMPLATE[prompt_template] if prompt_template is not None else None

        # prompt_template_video: replace the template name by its PROMPT_TEMPLATE entry
        prompt_template_video = PROMPT_TEMPLATE[prompt_template_video] if prompt_template_video is not None else None


        # `text_encoder` is rebound from the encoder type name to the encoder object.
        text_encoder = TextEncoder(
            text_encoder_type=text_encoder,
            max_length=max_length,
            text_encoder_precision="fp16",
            tokenizer_type=tokenizer,
            i2v_mode=i2v_mode,
            prompt_template=prompt_template,
            prompt_template_video=prompt_template_video,
            hidden_state_skip_layer=2,
            apply_final_norm=False,
            reproduce=True,
            device="cpu",
            image_embed_interleave=image_embed_interleave,
            text_encoder_path = text_encoder_filepath
        )

        text_encoder_2 = TextEncoder(
            text_encoder_type="clipL",
            max_length=77,
            text_encoder_precision="fp16",
            tokenizer_type="clipL",
            reproduce=True,
            device="cpu",
        )

        return cls(
            i2v=i2v_mode,
            enable_cfg = enable_cfg,
            vae=vae,
            vae_kwargs=vae_kwargs,
            text_encoder=text_encoder,
            text_encoder_2=text_encoder_2,
            model=model,
            device=device,
        )
+
+
+
class HunyuanVideoSampler(Inference):
    """Sampler that wires the `Inference` components into a `HunyuanVideoPipeline` and runs it."""

    def __init__(
        self,
        i2v,
        enable_cfg,
        vae,
        vae_kwargs,
        text_encoder,
        model,
        text_encoder_2=None,
        pipeline=None,
        device=0,
    ):
        """Store the components, build the diffusion pipeline and pick the default negative prompt."""
        super().__init__(
            i2v,
            enable_cfg,
            vae,
            vae_kwargs,
            text_encoder,
            model,
            text_encoder_2=text_encoder_2,
            pipeline=pipeline,
            device=device,
        )

        self.i2v_mode = i2v
        self.enable_cfg = enable_cfg
        self.pipeline = self.load_diffusion_pipeline(
            vae=self.vae,
            text_encoder=self.text_encoder,
            text_encoder_2=self.text_encoder_2,
            model=self.model,
            device=self.device,
        )

        if self.i2v_mode:
            self.default_negative_prompt = NEGATIVE_PROMPT_I2V
        else:
            self.default_negative_prompt = NEGATIVE_PROMPT

    @property
    def _interrupt(self):
        """Proxy for the pipeline's `_interrupt` attribute."""
        return self.pipeline._interrupt

    @_interrupt.setter
    def _interrupt(self, value):
        self.pipeline._interrupt = value

    def load_diffusion_pipeline(
        self,
        vae,
        text_encoder,
        text_encoder_2,
        model,
        scheduler=None,
        device=None,
        progress_bar_config=None,
        #data_type="video",
    ):
        """Build a `HunyuanVideoPipeline` from the given components.

        When `scheduler` is None a `FlowMatchDiscreteScheduler(shift=6.0,
        reverse=True, solver="euler")` is created. `device` is accepted for
        interface compatibility but is not used here.
        """
        if scheduler is None:
            scheduler = FlowMatchDiscreteScheduler(
                shift=6.0,
                reverse=True,
                solver="euler",
            )

        pipeline = HunyuanVideoPipeline(
            vae=vae,
            text_encoder=text_encoder,
            text_encoder_2=text_encoder_2,
            transformer=model,
            scheduler=scheduler,
            progress_bar_config=progress_bar_config,
        )

        return pipeline

    def _rope_layout(self, latents_size, target_ndim=3):
        """Return `(rope_sizes, rope_dim_list)` for a latent grid of `latents_size`.

        `rope_sizes` is the latent grid divided by the model patch size (left-padded
        with 1s up to `target_ndim` entries); `rope_dim_list` comes from the model
        or defaults to an even split of the attention head dimension.
        """
        ndim = 5 - 2
        patch_size = self.model.patch_size
        if isinstance(patch_size, int):
            assert all(s % patch_size == 0 for s in latents_size), (
                f"Latent size(last {ndim} dimensions) should be divisible by patch size({self.model.patch_size}), "
                f"but got {latents_size}."
            )
            rope_sizes = [s // patch_size for s in latents_size]
        elif isinstance(patch_size, (list, tuple)):
            assert all(s % patch_size[idx] == 0 for idx, s in enumerate(latents_size)), (
                f"Latent size(last {ndim} dimensions) should be divisible by patch size({self.model.patch_size}), "
                f"but got {latents_size}."
            )
            rope_sizes = [s // patch_size[idx] for idx, s in enumerate(latents_size)]
        else:
            # Previously this case fell through and failed later on an unbound `rope_sizes`.
            raise TypeError(f"Unsupported patch_size type: {type(patch_size)}")

        if len(rope_sizes) != target_ndim:
            rope_sizes = [1] * (target_ndim - len(rope_sizes)) + rope_sizes  # time axis

        head_dim = self.model.hidden_size // self.model.heads_num
        rope_dim_list = self.model.rope_dim_list
        if rope_dim_list is None:
            rope_dim_list = [head_dim // target_ndim for _ in range(target_ndim)]
        assert sum(rope_dim_list) == head_dim, "sum(rope_dim_list) should equal to head_dim of attention layer"
        return rope_sizes, rope_dim_list

    def get_rotary_pos_embed_new(self, video_length, height, width, concat_dict=None):
        """Compute rotary position embeddings with `get_nd_rotary_pos_embed_new`.

        Args:
            video_length: Number of frames; the temporal latent size is `(video_length - 1) // 4 + 1`.
            height, width: Pixel sizes; the spatial latent sizes are `// 8`.
            concat_dict: Options forwarded as `concat_dict`; an empty dict is used when None.

        Returns:
            `(freqs_cos, freqs_sin)` as returned by `get_nd_rotary_pos_embed_new`.
        """
        # A fresh dict per call avoids sharing one mutable default between calls.
        if concat_dict is None:
            concat_dict = {}
        latents_size = [(video_length - 1) // 4 + 1, height // 8, width // 8]
        rope_sizes, rope_dim_list = self._rope_layout(latents_size)
        freqs_cos, freqs_sin = get_nd_rotary_pos_embed_new(
            rope_dim_list,
            rope_sizes,
            theta=256,
            use_real=True,
            theta_rescale_factor=1,
            concat_dict=concat_dict,
        )
        return freqs_cos, freqs_sin

    def get_rotary_pos_embed(self, video_length, height, width, enable_riflex = False):
        """Compute rotary position embeddings with `get_nd_rotary_pos_embed`.

        The latent grid uses the "884" layout that this module always selects
        (time compressed by 4, space by 8).

        Returns:
            `(freqs_cos, freqs_sin)` as returned by `get_nd_rotary_pos_embed`.
        """
        latent_frames = (video_length - 1) // 4 + 1
        latents_size = [latent_frames, height // 8, width // 8]
        rope_sizes, rope_dim_list = self._rope_layout(latents_size)
        freqs_cos, freqs_sin = get_nd_rotary_pos_embed(
            rope_dim_list,
            rope_sizes,
            theta=256,
            use_real=True,
            theta_rescale_factor=1,
            L_test = latent_frames,
            enable_riflex = enable_riflex
        )
        return freqs_cos, freqs_sin

    @staticmethod
    def _resolve_seeds(seed, batch_size, num_videos_per_prompt):
        """Expand `seed` (None, int, tensor, list or tuple) into one seed per generated video."""
        if isinstance(seed, torch.Tensor):
            seed = seed.tolist()
        if seed is None:
            return [
                random.randint(0, 1_000_000)
                for _ in range(batch_size * num_videos_per_prompt)
            ]
        if isinstance(seed, int):
            return [
                seed + i
                for _ in range(batch_size)
                for i in range(num_videos_per_prompt)
            ]
        if isinstance(seed, (list, tuple)):
            if len(seed) == batch_size:
                return [
                    int(seed[i]) + j
                    for i in range(batch_size)
                    for j in range(num_videos_per_prompt)
                ]
            if len(seed) == batch_size * num_videos_per_prompt:
                return [int(s) for s in seed]
            raise ValueError(
                f"Length of seed must be equal to number of prompt(batch_size) or "
                f"batch_size * num_videos_per_prompt ({batch_size} * {num_videos_per_prompt}), got {seed}."
            )
        raise ValueError(
            f"Seed must be an integer, a list of integers, or None, got {seed}."
        )

    def generate(
        self,
        input_prompt,
        input_ref_images = None,
        height=192,
        width=336,
        frame_num=129,
        seed=None,
        n_prompt=None,
        sampling_steps=50,
        guide_scale=1.0,
        shift=5.0,
        embedded_guidance_scale=6.0,
        batch_size=1,
        num_videos_per_prompt=1,
        i2v_resolution="720p",
        image_start=None,
        enable_riflex = False,
        i2v_condition_type: str = "token_replace",
        i2v_stability=True,
        VAE_tile_size = None,
        joint_pass = False,
        cfg_star_switch = False,
        **kwargs,
    ):
        """Run the diffusion pipeline for one prompt and return the generated samples.

        Args:
            input_prompt: Prompt string (anything else raises TypeError).
            input_ref_images: Optional sequence of reference images; only the
                first element is used.
            height, width, frame_num: Requested output size; height and width
                are passed through `align_to(..., 16)`, and `frame_num - 1`
                must be a multiple of 4.
            seed: None, int, tensor, or list/tuple of seeds (see `_resolve_seeds`).
            n_prompt: Negative prompt; the default negative prompt is used when
                empty, and it is forced to "" when `guide_scale == 1.0`.
            sampling_steps: Number of inference steps.
            guide_scale: Guidance scale; forced to 1.0 when `enable_cfg` is False.
            shift: `shift` of the `FlowMatchDiscreteScheduler` created for this call.
            i2v_resolution: One of "360p", "540p", "720p" (image-to-video mode only).
            image_start: Start image, required in image-to-video mode (must
                provide `.convert('RGB')`).
            VAE_tile_size: Optional dict with the VAE tiling attributes to set.
            **kwargs: `callback` and `callback_steps` are popped and forwarded.
            Remaining arguments are forwarded to the pipeline unchanged.

        Returns:
            The first pipeline output after `sub_(0.5).mul_(2).squeeze(0)`, or
            None when that output is None.

        Raises:
            ValueError: On invalid sizes, seeds, `i2v_resolution`, or a missing
                `image_start` in image-to-video mode.
            TypeError: When the prompt or negative prompt is not a string.
        """
        if VAE_tile_size is not None:
            self.vae.tile_sample_min_tsize = VAE_tile_size["tile_sample_min_tsize"]
            self.vae.tile_latent_min_tsize = VAE_tile_size["tile_latent_min_tsize"]
            self.vae.tile_sample_min_size = VAE_tile_size["tile_sample_min_size"]
            self.vae.tile_latent_min_size = VAE_tile_size["tile_latent_min_size"]
            self.vae.tile_overlap_factor = VAE_tile_size["tile_overlap_factor"]

        i2v_mode = self.i2v_mode
        if not self.enable_cfg:
            guide_scale = 1.0

        # ========================================================================
        # Arguments: seed
        # ========================================================================
        seeds = self._resolve_seeds(seed, batch_size, num_videos_per_prompt)
        from wan.utils.utils import seed_everything
        # An int seed is forwarded unchanged. For None / list / tensor seeds the first
        # resolved per-sample seed is used, so the global RNG state matches the
        # generators below instead of receiving None or a list.
        seed_everything(seed if isinstance(seed, int) else seeds[0])
        generator = [torch.Generator("cuda").manual_seed(s) for s in seeds]

        # ========================================================================
        # Arguments: target_width, target_height, target_frame_num
        # ========================================================================
        if width <= 0 or height <= 0 or frame_num <= 0:
            raise ValueError(
                f"`height` and `width` and `frame_num` must be positive integers, got height={height}, width={width}, frame_num={frame_num}"
            )
        if (frame_num - 1) % 4 != 0:
            raise ValueError(
                f"`frame_num-1` must be a multiple of 4, got {frame_num}"
            )

        target_height = align_to(height, 16)
        target_width = align_to(width, 16)
        target_frame_num = frame_num

        # Decide once whether reference images are used; comparing an image object
        # with `== None` later on is ambiguous for array-like inputs.
        use_ref_images = input_ref_images is not None
        name = None
        if use_ref_images:
            name = "person"
            input_ref_images = input_ref_images[0]

        # ========================================================================
        # Arguments: prompt, new_prompt, negative_prompt
        # ========================================================================
        if not isinstance(input_prompt, str):
            raise TypeError(f"`prompt` must be a string, but got {type(input_prompt)}")
        input_prompt = [input_prompt.strip()]

        # negative prompt
        if n_prompt is None or n_prompt == "":
            n_prompt = self.default_negative_prompt
        if guide_scale == 1.0:
            n_prompt = ""
        if not isinstance(n_prompt, str):
            raise TypeError(
                f"`negative_prompt` must be a string, but got {type(n_prompt)}"
            )
        n_prompt = [n_prompt.strip()]

        # ========================================================================
        # Scheduler
        # ========================================================================
        self.pipeline.scheduler = FlowMatchDiscreteScheduler(
            shift=shift,
            reverse=True,
            solver="euler"
        )

        # ---------------------------------
        # Reference condition
        # ---------------------------------
        img_latents = None
        semantic_images = None
        # NOTE(review): the previous code set `denoise_strength = 1` when reference images
        # were given and then unconditionally reset it to 0 here before use; the effective
        # value (0) is kept — confirm that this is intended.
        denoise_strength = 0
        ip_cfg_scale = 0
        if i2v_mode:
            bucket_sizes = {"720p": 960, "540p": 720, "360p": 480}
            if i2v_resolution not in bucket_sizes:
                raise ValueError(f"i2v_resolution: {i2v_resolution} must be in [360p, 540p, 720p]")
            bucket_hw_base_size = bucket_sizes[i2v_resolution]

            if image_start is None:
                raise ValueError("`image_start` must be provided in image-to-video mode")
            semantic_images = [image_start.convert('RGB')]

            # presumably a PIL image, whose `size` is (width, height) — verify against caller
            origin_size = semantic_images[0].size

            crop_size_list = generate_crop_size_list(bucket_hw_base_size, 32)
            aspect_ratios = np.array([round(float(h)/float(w), 5) for h, w in crop_size_list])
            closest_size, closest_ratio = get_closest_ratio(origin_size[1], origin_size[0], aspect_ratios, crop_size_list)
            ref_image_transform = transforms.Compose([
                transforms.Resize(closest_size),
                transforms.CenterCrop(closest_size),
                transforms.ToTensor(),
                transforms.Normalize([0.5], [0.5])
            ])

            semantic_image_pixel_values = [ref_image_transform(semantic_image) for semantic_image in semantic_images]
            semantic_image_pixel_values = torch.cat(semantic_image_pixel_values).unsqueeze(0).unsqueeze(2).to(self.device)

            # NOTE(review): the latent scaling is assumed to sit inside the autocast block —
            # confirm against the original indentation.
            with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=True):
                img_latents = self.pipeline.vae.encode(semantic_image_pixel_values).latent_dist.mode() # B, C, F, H, W
                img_latents.mul_(self.pipeline.vae.config.scaling_factor)

            # The output size follows the bucket chosen for the start image.
            target_height, target_width = closest_size

        # ========================================================================
        # Build Rope freqs
        # ========================================================================
        if use_ref_images:
            concat_dict = {'mode': 'timecat-w', 'bias': -1}
            freqs_cos, freqs_sin = self.get_rotary_pos_embed_new(target_frame_num, target_height, target_width, concat_dict)
        else:
            freqs_cos, freqs_sin = self.get_rotary_pos_embed(target_frame_num, target_height, target_width, enable_riflex)

        n_tokens = freqs_cos.shape[0]

        callback = kwargs.pop("callback", None)
        callback_steps = kwargs.pop("callback_steps", None)

        # ========================================================================
        # Pipeline inference
        # ========================================================================
        if use_ref_images:
            pixel_value_llava, uncond_pixel_value_llava, pixel_value_ref = DataPreprocess().get_batch(input_ref_images, (target_width, target_height))
        else:
            pixel_value_llava, uncond_pixel_value_llava, pixel_value_ref = None, None, None

        samples = self.pipeline(
            prompt=input_prompt,
            height=target_height,
            width=target_width,
            video_length=target_frame_num,
            num_inference_steps=sampling_steps,
            guidance_scale=guide_scale,
            negative_prompt=n_prompt,
            num_videos_per_prompt=num_videos_per_prompt,
            generator=generator,
            output_type="pil",
            name = name,
            pixel_value_llava = pixel_value_llava,
            uncond_pixel_value_llava=uncond_pixel_value_llava,
            pixel_value_ref=pixel_value_ref,
            denoise_strength=denoise_strength,
            ip_cfg_scale=ip_cfg_scale,
            freqs_cis=(freqs_cos, freqs_sin),
            n_tokens=n_tokens,
            embedded_guidance_scale=embedded_guidance_scale,
            data_type="video" if target_frame_num > 1 else "image",
            is_progress_bar=True,
            vae_ver="884-16c-hy",
            enable_tiling=True,
            i2v_mode=i2v_mode,
            i2v_condition_type=i2v_condition_type,
            i2v_stability=i2v_stability,
            img_latents=img_latents,
            semantic_images=semantic_images,
            joint_pass = joint_pass,
            cfg_star_rescale = cfg_star_switch,
            callback = callback,
            callback_steps = callback_steps,
        )[0]
        if samples is None:
            return None
        # In-place rescale x -> (x - 0.5) * 2, then drop the leading dimension.
        samples = samples.sub_(0.5).mul_(2).squeeze(0)

        return samples
diff --git a/hyvideo/modules/__init__.py b/hyvideo/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..45d8819812fcdab40bfc7fe6fc61835dcc1274e0
--- /dev/null
+++ b/hyvideo/modules/__init__.py
@@ -0,0 +1,26 @@
+from .models import HYVideoDiffusionTransformer, HUNYUAN_VIDEO_CONFIG
+
+
def load_model(model, i2v_condition_type, in_channels, out_channels, factor_kwargs):
    """Instantiate a Hunyuan video diffusion transformer from a named configuration.

    Args:
        model (str): key of `HUNYUAN_VIDEO_CONFIG` selecting the architecture
        i2v_condition_type: forwarded to `HYVideoDiffusionTransformer` as-is
        in_channels (int): input channels number
        out_channels (int): output channels number
        factor_kwargs (dict): extra keyword arguments forwarded to the model constructor

    Returns:
        model (nn.Module): The hunyuan video model

    Raises:
        NotImplementedError: if `model` is not a key of `HUNYUAN_VIDEO_CONFIG`.
    """
    if model not in HUNYUAN_VIDEO_CONFIG:
        # Same exception type as before, now naming the offending key.
        raise NotImplementedError(
            f"Unknown Hunyuan video model '{model}'. Supported models: {list(HUNYUAN_VIDEO_CONFIG)}"
        )
    return HYVideoDiffusionTransformer(
        i2v_condition_type = i2v_condition_type,
        in_channels=in_channels,
        out_channels=out_channels,
        **HUNYUAN_VIDEO_CONFIG[model],
        **factor_kwargs,
    )
diff --git a/hyvideo/modules/activation_layers.py b/hyvideo/modules/activation_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8774c26ceef6081482ca0dbbf930b207d4ac03b
--- /dev/null
+++ b/hyvideo/modules/activation_layers.py
@@ -0,0 +1,23 @@
+import torch.nn as nn
+
+
def get_activation_layer(act_type):
    """Look up a factory for an activation module.

    Args:
        act_type (str): one of "gelu", "gelu_tanh", "relu", "silu"

    Returns:
        A zero-argument callable that builds the activation module when called
        (a lambda for the GELU variants, the module class for ReLU/SiLU).

    Raises:
        ValueError: if `act_type` is not one of the supported names.
    """
    factories = (
        ("gelu", lambda: nn.GELU()),
        # Approximate `tanh` requires torch >= 1.13
        ("gelu_tanh", lambda: nn.GELU(approximate="tanh")),
        ("relu", nn.ReLU),
        ("silu", nn.SiLU),
    )
    for known_type, factory in factories:
        if act_type == known_type:
            return factory
    raise ValueError(f"Unknown activation type: {act_type}")
diff --git a/hyvideo/modules/attenion.py b/hyvideo/modules/attenion.py
new file mode 100644
index 0000000000000000000000000000000000000000..611fe02978c4a5f69e4160dc31bde92651c1a58b
--- /dev/null
+++ b/hyvideo/modules/attenion.py
@@ -0,0 +1,362 @@
+import importlib.metadata
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from importlib.metadata import version
+
+def clear_list(l):
+    """Overwrite every slot of ``l`` with None in place, keeping its length."""
+    l[:] = [None] * len(l)
+
+try:
+ import flash_attn
+ from flash_attn.flash_attn_interface import _flash_attn_forward
+ from flash_attn.flash_attn_interface import flash_attn_varlen_func
+except ImportError:
+ flash_attn = None
+ flash_attn_varlen_func = None
+ _flash_attn_forward = None
+
+try:
+ from xformers.ops import memory_efficient_attention
+except ImportError:
+ memory_efficient_attention = None
+
+try:
+    from sageattention import sageattn_varlen
+
+    def sageattn_varlen_wrapper(
+        q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv
+    ):
+        """Forward q/k/v and the sequence-length arguments unchanged to ``sageattn_varlen``."""
+        return sageattn_varlen(
+            q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv
+        )
+
+except ImportError:
+    # sageattention is optional: a None wrapper is how the rest of the module detects its absence.
+    sageattn_varlen_wrapper = None
+
+try:
+    from sageattention import sageattn
+    @torch.compiler.disable()
+    def sageattn_wrapper(qkv_list, attention_length):
+        """Run ``sageattn`` (NHD layout) on the first ``attention_length`` positions of dim 1.
+
+        ``qkv_list`` ([q, k, v]) gets its entries set to None by ``clear_list``. The result is
+        padded back to the input length along dim 1 with uninitialized values, and
+        ``attention_length=None`` is treated as "use the full sequence".
+        """
+        q, k, v = qkv_list
+        padding_length = 0 if attention_length is None else q.shape[1] - attention_length
+        q = q[:, :attention_length, :, :]
+        k = k[:, :attention_length, :, :]
+        v = v[:, :attention_length, :, :]
+        o = sageattn(q, k, v, tensor_layout="NHD")
+        del q, k, v
+        clear_list(qkv_list)
+        if padding_length > 0:
+            o = torch.cat([o, torch.empty((o.shape[0], padding_length, *o.shape[-2:]), dtype=o.dtype, device=o.device)], 1)
+        return o
+
+except ImportError:
+    sageattn = sageattn_wrapper = None  # both names defined so availability checks never hit a NameError
+
+
+def get_attention_modes():
+    """Return the attention mode names usable given which optional backends were imported."""
+    ret = ["sdpa", "auto"]
+    if flash_attn is not None:
+        ret.append("flash")
+    if memory_efficient_attention is not None:
+        ret.append("xformers")
+    if sageattn_varlen_wrapper is not None:
+        ret.append("sage")
+    if sageattn is not None and version("sageattention").startswith("2"):
+        ret.append("sage2")
+    return ret
+
+
+
+MEMORY_LAYOUT = {  # mode -> (layout applied to q/k/v before the kernel, layout applied to its output)
+    "sdpa": (
+        lambda x: x.transpose(1, 2),  # swap dims 1 and 2 on the way in
+        lambda x: x.transpose(1, 2),  # swap them back on the way out
+    ),
+    "xformers": (
+        lambda x: x,  # used as-is
+        lambda x: x,
+    ),
+    "sage2": (
+        lambda x: x,  # used as-is
+        lambda x: x,
+    ),
+    "sage": (
+        lambda x: x.view(x.shape[0] * x.shape[1], *x.shape[2:]),  # merge dims 0 and 1
+        lambda x: x,  # attention() restores the leading dims itself with view()
+    ),
+    "flash": (
+        lambda x: x.view(x.shape[0] * x.shape[1], *x.shape[2:]),  # merge dims 0 and 1
+        lambda x: x,  # attention() restores the leading dims itself with view()
+    ),
+    "torch": (
+        lambda x: x.transpose(1, 2),  # swap dims 1 and 2 on the way in
+        lambda x: x.transpose(1, 2),  # swap them back on the way out
+    ),
+    "vanilla": (
+        lambda x: x.transpose(1, 2),  # swap dims 1 and 2 on the way in
+        lambda x: x.transpose(1, 2),  # swap them back on the way out
+    ),
+}
+
+@torch.compiler.disable()
+def sdpa_wrapper(qkv_list, attention_length):
+    """Run unmasked, non-causal SDPA on the first ``attention_length`` positions of dim 2.
+
+    ``qkv_list`` ([q, k, v]) gets its entries set to None by ``clear_list``. The result is
+    padded back to the input length along dim 2 with uninitialized values.
+    """
+    q, k, v = qkv_list
+    # None (the default that attention() forwards) means "use the full sequence";
+    # slicing with None already keeps everything, only the subtraction needed a guard.
+    padding_length = 0 if attention_length is None else q.shape[2] - attention_length
+    q = q[:, :, :attention_length, :]
+    k = k[:, :, :attention_length, :]
+    v = v[:, :, :attention_length, :]
+    o = F.scaled_dot_product_attention(q, k, v, attn_mask=None, is_causal=False)
+    del q, k, v
+    clear_list(qkv_list)
+    if padding_length > 0:
+        pad = torch.empty((*o.shape[:2], padding_length, o.shape[-1]), dtype=o.dtype, device=o.device)
+        o = torch.cat([o, pad], 2)
+    return o
+
+def get_cu_seqlens(text_mask, img_len):
+    """Calculate cu_seqlens_q, cu_seqlens_kv using text_mask and img_len
+
+    Every sample contributes two boundaries inside a slot of width
+    ``text_mask.shape[1] + img_len``: the end of its image + counted text tokens,
+    then the end of the slot itself.
+
+    Args:
+        text_mask (torch.Tensor): the mask of text, indexed as [batch, text position]
+        img_len (int): the length of image
+
+    Returns:
+        torch.Tensor: int32 tensor of ``2 * batch + 1`` offsets for flash attention, created on
+            ``text_mask.device`` (it used to be hard-coded to "cuda")
+    """
+    batch_size = text_mask.shape[0]
+    text_len = text_mask.sum(dim=1)
+    max_len = text_mask.shape[1] + img_len
+    device = text_mask.device
+    cu_seqlens = torch.zeros([2 * batch_size + 1], dtype=torch.int32, device=device)
+    slot_start = torch.arange(batch_size, device=device) * max_len
+    cu_seqlens[1::2] = slot_start + text_len + img_len
+    cu_seqlens[2::2] = slot_start + max_len
+    return cu_seqlens
+
+
+def attention(
+    qkv_list,
+    mode="flash",
+    drop_rate=0,
+    attn_mask=None,
+    causal=False,
+    cu_seqlens_q=None,
+    cu_seqlens_kv=None,
+    max_seqlen_q=None,
+    max_seqlen_kv=None,
+    batch_size=1,
+):
+    """
+    Perform QKV self attention.
+
+    Args:
+        qkv_list (list): [q, k, v]. Its entries are set to None as soon as they are unpacked.
+            q (torch.Tensor): Query tensor with shape [b, s, a, d], where a is the number of heads.
+            k (torch.Tensor): Key tensor with shape [b, s1, a, d]
+            v (torch.Tensor): Value tensor with shape [b, s1, a, d]
+        mode (str): Attention mode, one of the MEMORY_LAYOUT keys: 'sdpa', 'xformers', 'sage2',
+            'sage', 'flash', 'torch' and 'vanilla'. (default: 'flash')
+        drop_rate (float): Dropout rate in attention map. Only read by the 'torch' and 'vanilla'
+            modes. (default: 0)
+        attn_mask (torch.Tensor): Attention mask with shape [b, s1] (cross_attn), or [b, a, s, s1] (torch or vanilla).
+            Must be None in 'sdpa' mode; forwarded as attn_bias in 'xformers' mode. (default: None)
+        causal (bool): Whether to use causal attention. Only read by the 'torch' and 'vanilla'
+            modes. (default: False)
+        cu_seqlens_q (torch.Tensor): dtype torch.int32. The cumulative sequence lengths of the sequences in the batch,
+            used to index into q. In 'sdpa' and 'sage2' modes the value is instead forwarded as the
+            attention_length argument of sdpa_wrapper / sageattn_wrapper.
+        cu_seqlens_kv (torch.Tensor): dtype torch.int32. The cumulative sequence lengths of the sequences in the batch,
+            used to index into kv. Only read by the 'sage' and 'flash' modes.
+        max_seqlen_q (int): The maximum sequence length in the batch of q.
+        max_seqlen_kv (int): The maximum sequence length in the batch of k and v.
+        batch_size (int): Batch size used to un-flatten the 'sage' / 'flash' output. (default: 1)
+
+    Returns:
+        torch.Tensor: Output tensor after self attention with shape [b, s, ad]
+
+    Raises:
+        KeyError: If mode is not a key of MEMORY_LAYOUT (the lookup runs before the mode dispatch).
+        AssertionError: If attn_mask is given in 'sdpa' mode, or together with causal in 'vanilla' mode.
+
+    Note:
+        The xformers, sageattention and flash_attn backends are optional imports of this module;
+        get_attention_modes() reports which modes can actually be used.
+    """
+    pre_attn_layout, post_attn_layout = MEMORY_LAYOUT[mode]
+    q, k, v = qkv_list
+    # Blank the caller's list so it no longer holds references to the original tensors.
+    clear_list(qkv_list)
+    del qkv_list
+
+    q = pre_attn_layout(q)
+    k = pre_attn_layout(k)
+    v = pre_attn_layout(v)
+
+    if mode == "torch":
+        if attn_mask is not None and attn_mask.dtype != torch.bool:
+            attn_mask = attn_mask.to(q.dtype)
+        x = F.scaled_dot_product_attention(
+            q, k, v, attn_mask=attn_mask, dropout_p=drop_rate, is_causal=causal
+        )
+
+    elif mode == "sdpa":
+        # Mask-free path: cu_seqlens_q is handed over as the number of positions to attend over.
+        assert attn_mask is None
+        qkv_list = [q, k, v]
+        del q, k, v
+        x = sdpa_wrapper(qkv_list, cu_seqlens_q)
+
+    elif mode == "xformers":
+        x = memory_efficient_attention(q, k, v, attn_bias=attn_mask)
+
+    elif mode == "sage2":
+        qkv_list = [q, k, v]
+        del q, k, v
+        x = sageattn_wrapper(qkv_list, cu_seqlens_q)
+
+    elif mode == "sage":
+        x = sageattn_varlen_wrapper(
+            q,
+            k,
+            v,
+            cu_seqlens_q,
+            cu_seqlens_kv,
+            max_seqlen_q,
+            max_seqlen_kv,
+        )
+        # x with shape [(bxs), a, d]
+        x = x.view(
+            batch_size, max_seqlen_q, x.shape[-2], x.shape[-1]
+        )  # reshape x to [b, s, a, d]
+
+    elif mode == "flash":
+        x = flash_attn_varlen_func(
+            q,
+            k,
+            v,
+            cu_seqlens_q,
+            cu_seqlens_kv,
+            max_seqlen_q,
+            max_seqlen_kv,
+        )
+        # x with shape [(bxs), a, d]
+        x = x.view(
+            batch_size, max_seqlen_q, x.shape[-2], x.shape[-1]
+        )  # reshape x to [b, s, a, d]
+
+    elif mode == "vanilla":
+        scale_factor = 1 / math.sqrt(q.size(-1))
+
+        b, a, s, _ = q.shape
+        s1 = k.size(2)
+        attn_bias = torch.zeros(b, a, s, s1, dtype=q.dtype, device=q.device)
+        if causal:
+            # Only applied to self attention
+            assert (
+                attn_mask is None
+            ), "Causal mask and attn_mask cannot be used together"
+            temp_mask = torch.ones(b, a, s, s, dtype=torch.bool, device=q.device).tril(
+                diagonal=0
+            )
+            attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
+
+        if attn_mask is not None:
+            if attn_mask.dtype == torch.bool:
+                attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
+            else:
+                attn_bias += attn_mask
+
+        # TODO: Maybe force q and k to be float32 to avoid numerical overflow
+        attn = (q @ k.transpose(-2, -1)) * scale_factor
+        attn += attn_bias
+        attn = attn.softmax(dim=-1)
+        attn = torch.dropout(attn, p=drop_rate, train=True)  # train=True: dropout is applied whenever drop_rate > 0
+        x = attn @ v
+    else:
+        raise NotImplementedError(f"Unsupported attention mode: {mode}")
+
+    # Undo the per-mode layout, then merge the last two dims (heads and head_dim).
+    x = post_attn_layout(x)
+    b, s, a, d = x.shape
+    return x.reshape(b, s, -1)
+
+
+def parallel_attention(
+    hybrid_seq_parallel_attn,
+    q,
+    k,
+    v,
+    img_q_len,
+    img_kv_len,
+    cu_seqlens_q,
+    cu_seqlens_kv
+):
+    """Attend over the sequence in two segments and return the heads-flattened result.
+
+    Positions before ``cu_seqlens_q[1]`` go through ``hybrid_seq_parallel_attn``: the first
+    ``img_q_len`` / ``img_kv_len`` tokens as the main q / k / v and the remainder as the
+    "rear" joint tensors. Positions from ``cu_seqlens_q[1]`` onwards go through flash-attn's
+    ``_flash_attn_forward``. Both outputs are concatenated along dim 1 and the last two
+    dims are merged, giving a [b, s, a * d] tensor.
+    """
+    attn1 = hybrid_seq_parallel_attn(
+        None,
+        q[:, :img_q_len, :, :],
+        k[:, :img_kv_len, :, :],
+        v[:, :img_kv_len, :, :],
+        dropout_p=0.0,
+        causal=False,
+        joint_tensor_query=q[:,img_q_len:cu_seqlens_q[1]],
+        joint_tensor_key=k[:,img_kv_len:cu_seqlens_kv[1]],
+        joint_tensor_value=v[:,img_kv_len:cu_seqlens_kv[1]],
+        joint_strategy="rear",
+    )
+    # Compare the release numerically: as strings, "2.10.0" >= "2.7.0" is False.
+    # flash-attn >= 2.7 takes window_size_left / window_size_right instead of window_size.
+    major, minor = (int(part) for part in flash_attn.__version__.split(".")[:2])
+    if (major, minor) >= (2, 7):
+        window_kwargs = {"window_size_left": -1, "window_size_right": -1}
+    else:
+        window_kwargs = {"window_size": (-1, -1)}
+    attn2, *_ = _flash_attn_forward(
+        q[:,cu_seqlens_q[1]:],
+        k[:,cu_seqlens_kv[1]:],
+        v[:,cu_seqlens_kv[1]:],
+        dropout_p=0.0,
+        softmax_scale=q.shape[-1] ** (-0.5),
+        causal=False,
+        **window_kwargs,
+        softcap=0.0,
+        alibi_slopes=None,
+        return_softmax=False,
+    )
+    attn = torch.cat([attn1, attn2], dim=1)
+    b, s, a, d = attn.shape
+    attn = attn.reshape(b, s, -1)
+
+    return attn
diff --git a/hyvideo/modules/embed_layers.py b/hyvideo/modules/embed_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d65ed1a43e9c20219a19a90d1ffa84a765a1872
--- /dev/null
+++ b/hyvideo/modules/embed_layers.py
@@ -0,0 +1,157 @@
+import math
+import torch
+import torch.nn as nn
+from einops import rearrange, repeat
+
+from ..utils.helpers import to_2tuple
+
+
+class PatchEmbed(nn.Module):
+    """Patch embedding built on a strided ``nn.Conv3d``.
+
+    A convolution whose kernel size equals its stride projects every patch of the input to
+    ``embed_dim`` channels; the result can be flattened to a token sequence and normalized.
+
+    Based on the impl in https://github.com/google-research/vision_transformer
+
+    Hacked together by / Copyright 2020 Ross Wightman
+
+    Remove the _assert function in forward function to be compatible with multi-resolution images.
+    """
+
+    def __init__(
+        self,
+        patch_size=16,
+        in_chans=3,
+        embed_dim=768,
+        norm_layer=None,
+        flatten=True,
+        bias=True,
+        dtype=None,
+        device=None,
+    ):
+        super().__init__()
+        self.patch_size = to_2tuple(patch_size)
+        self.flatten = flatten
+        self.proj = nn.Conv3d(
+            in_chans,
+            embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            bias=bias,
+            dtype=dtype,
+            device=device,
+        )
+        # Xavier init over the kernel viewed as (out_channels, everything else).
+        flat_weight = self.proj.weight.view(self.proj.weight.size(0), -1)
+        nn.init.xavier_uniform_(flat_weight)
+        if bias:
+            nn.init.zeros_(self.proj.bias)
+
+        self.norm = nn.Identity() if not norm_layer else norm_layer(embed_dim)
+
+    def forward(self, x):
+        """Project ``x`` with the convolution, optionally flatten it to tokens, then normalize."""
+        x = self.proj(x)
+        if self.flatten:
+            # Collapse every dim after channels into one, then move it in front of channels.
+            x = x.flatten(2).transpose(1, 2)
+        return self.norm(x)
+
+
+class TextProjection(nn.Module):
+    """
+    Projects text embeddings through Linear -> activation -> Linear.
+
+    Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
+    """
+
+    def __init__(self, in_channels, hidden_size, act_layer, dtype=None, device=None):
+        super().__init__()
+        self.linear_1 = nn.Linear(
+            in_features=in_channels,
+            out_features=hidden_size,
+            bias=True,
+            dtype=dtype,
+            device=device,
+        )
+        # act_layer is called with no arguments and must return the activation module.
+        self.act_1 = act_layer()
+        self.linear_2 = nn.Linear(
+            in_features=hidden_size,
+            out_features=hidden_size,
+            bias=True,
+            dtype=dtype,
+            device=device,
+        )
+
+    def forward(self, caption):
+        """Return ``linear_2(act_1(linear_1(caption)))``."""
+        return self.linear_2(self.act_1(self.linear_1(caption)))
+
+
+def timestep_embedding(t, dim, max_period=10000):
+    """
+    Build sinusoidal embeddings for a batch of (possibly fractional) timesteps.
+
+    Args:
+        t (torch.Tensor): a 1-D Tensor of N indices, one per batch element.
+        dim (int): the dimension of the output.
+        max_period (int): controls the minimum frequency of the embeddings.
+
+    Returns:
+        embedding (torch.Tensor): An (N, D) Tensor laid out as [cos | sin | one zero column if dim is odd].
+
+    .. ref_link: https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
+    """
+    half = dim // 2
+    # Frequencies max_period ** (-i / half) for i in [0, half): they start at 1 and decay.
+    exponent = -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
+    freqs = torch.exp(exponent).to(device=t.device)
+    # Outer product: one row per timestep, one column per frequency.
+    args = t[:, None].float() * freqs[None]
+    pieces = [torch.cos(args), torch.sin(args)]
+    if dim % 2:
+        # Odd dim: a trailing zero column makes up the missing width.
+        pieces.append(torch.zeros_like(pieces[0][:, :1]))
+    return torch.cat(pieces, dim=-1)
+
+
+class TimestepEmbedder(nn.Module):
+    """
+    Embeds scalar timesteps into vector representations.
+
+    Sinusoidal features of width ``frequency_embedding_size`` feed a Linear -> activation -> Linear MLP.
+    """
+
+    def __init__(
+        self,
+        hidden_size,
+        act_layer,
+        frequency_embedding_size=256,
+        max_period=10000,
+        out_size=None,
+        dtype=None,
+        device=None,
+    ):
+        super().__init__()
+        self.frequency_embedding_size = frequency_embedding_size
+        self.max_period = max_period
+        # The output width falls back to hidden_size when out_size is not given.
+        out_size = hidden_size if out_size is None else out_size
+        factory_kwargs = {"dtype": dtype, "device": device}
+
+        self.mlp = nn.Sequential(
+            nn.Linear(frequency_embedding_size, hidden_size, bias=True, **factory_kwargs),
+            act_layer(),
+            nn.Linear(hidden_size, out_size, bias=True, **factory_kwargs),
+        )
+        # Re-draw both Linear weights with std 0.02; index 1 is the activation.
+        for index in (0, 2):
+            nn.init.normal_(self.mlp[index].weight, std=0.02)
+
+    def forward(self, t):
+        """Embed ``t`` with ``timestep_embedding``, cast to the MLP weight dtype, run the MLP."""
+        weight_dtype = self.mlp[0].weight.dtype
+        t_freq = timestep_embedding(t, self.frequency_embedding_size, self.max_period)
+        return self.mlp(t_freq.type(weight_dtype))
diff --git a/hyvideo/modules/mlp_layers.py b/hyvideo/modules/mlp_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fa53872bd40230c1bf317208108c20a984efa2a
--- /dev/null
+++ b/hyvideo/modules/mlp_layers.py
@@ -0,0 +1,131 @@
+# Modified from timm library:
+# https://github.com/huggingface/pytorch-image-models/blob/648aaa41233ba83eb38faf5ba9d415d574823241/timm/layers/mlp.py#L13
+
+from functools import partial
+
+import torch
+import torch.nn as nn
+
+from .modulate_layers import modulate_
+from ..utils.helpers import to_2tuple
+
+
+class MLP(nn.Module):
+    """MLP as used in Vision Transformer, MLP-Mixer and related networks"""
+
+    def __init__(
+        self,
+        in_channels,
+        hidden_channels=None,
+        out_features=None,
+        act_layer=nn.GELU,
+        norm_layer=None,
+        bias=True,
+        drop=0.0,
+        use_conv=False,
+        device=None,
+        dtype=None,
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        out_features = out_features or in_channels
+        hidden_channels = hidden_channels or in_channels
+        bias = to_2tuple(bias)
+        drop_probs = to_2tuple(drop)
+        linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear
+
+        self.fc1 = linear_layer(
+            in_channels, hidden_channels, bias=bias[0], **factory_kwargs
+        )
+        self.act = act_layer()
+        self.drop1 = nn.Dropout(drop_probs[0])
+        self.norm = (
+            norm_layer(hidden_channels, **factory_kwargs)
+            if norm_layer is not None
+            else nn.Identity()
+        )
+        self.fc2 = linear_layer(
+            hidden_channels, out_features, bias=bias[1], **factory_kwargs
+        )
+        self.drop2 = nn.Dropout(drop_probs[1])
+
+    def forward(self, x):
+        """Apply fc1 -> act -> drop1 -> norm -> fc2 -> drop2 and return the result."""
+        x = self.drop1(self.act(self.fc1(x)))
+        x = self.norm(x)
+        x = self.drop2(self.fc2(x))
+        return x
+
+    def apply_(self, x, divide=4):
+        """Run the MLP over ``x`` in place, one chunk of rows at a time.
+
+        ``x`` is flattened to 2-D with ``view`` and every chunk of rows is overwritten by its
+        MLP output, so the output width has to equal the input width. The chunk size is
+        ``x.shape[1] / divide`` rows, clamped to at least one row so that inputs with fewer
+        than ``divide`` positions no longer produce a zero-sized split. Presumably chunking
+        is there to limit peak memory. Returns the flattened 2-D view of ``x``.
+        """
+        x_shape = x.shape
+        x = x.view(-1, x.shape[-1])
+        chunk_size = max(1, int(x_shape[1] / divide))
+        for x_chunk in torch.split(x, chunk_size):
+            x_chunk[...] = self.forward(x_chunk)
+        return x
+
+#
+class MLPEmbedder(nn.Module):
+    """Linear -> SiLU -> Linear embedder, copied from https://github.com/black-forest-labs/flux/blob/main/src/flux/modules/layers.py"""
+    def __init__(self, in_dim: int, hidden_dim: int, device=None, dtype=None):
+        super().__init__()
+        self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True, device=device, dtype=dtype)
+        self.silu = nn.SiLU()
+        self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True, device=device, dtype=dtype)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        hidden = self.silu(self.in_layer(x))
+        return self.out_layer(hidden)
+
+
+class FinalLayer(nn.Module):
+    """The final layer of DiT."""
+
+    def __init__(
+        self, hidden_size, patch_size, out_channels, act_layer, device=None, dtype=None
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+
+        # Just use LayerNorm for the final layer
+        self.norm_final = nn.LayerNorm(
+            hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs
+        )
+
+        # patch_size is either a single int (squared) or an indexable with three entries.
+        if isinstance(patch_size, int):
+            patch_volume = patch_size * patch_size
+        else:
+            patch_volume = patch_size[0] * patch_size[1] * patch_size[2]
+        # One constructor call for both forms, so device / dtype are always honored;
+        # the three-entry form used to build the layer without factory_kwargs.
+        self.linear = nn.Linear(
+            hidden_size, patch_volume * out_channels, bias=True, **factory_kwargs
+        )
+        # Zero weight and bias: the projection outputs zeros until weights are trained or loaded.
+        nn.init.zeros_(self.linear.weight)
+        nn.init.zeros_(self.linear.bias)
+
+        # Here we don't distinguish between the modulate types. Just use the simple one.
+        self.adaLN_modulation = nn.Sequential(
+            act_layer(),
+            nn.Linear(hidden_size, 2 * hidden_size, bias=True, **factory_kwargs),
+        )
+        # Zero-initialize the modulation
+        nn.init.zeros_(self.adaLN_modulation[1].weight)
+        nn.init.zeros_(self.adaLN_modulation[1].bias)
+
+    def forward(self, x, c):
+        """Modulate the normalized ``x`` with the shift / scale halves of ``adaLN_modulation(c)``, then project."""
+        shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
+        x = modulate_(self.norm_final(x), shift=shift, scale=scale)
+        x = self.linear(x)
+        return x
diff --git a/hyvideo/modules/models.py b/hyvideo/modules/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8276fd9d435233c48c0087de500a0b0ce9c395d
--- /dev/null
+++ b/hyvideo/modules/models.py
@@ -0,0 +1,1020 @@
+from typing import Any, List, Tuple, Optional, Union, Dict
+from einops import rearrange
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from diffusers.models import ModelMixin
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+
+from .activation_layers import get_activation_layer
+from .norm_layers import get_norm_layer
+from .embed_layers import TimestepEmbedder, PatchEmbed, TextProjection
+from .attenion import attention, parallel_attention, get_cu_seqlens
+from .posemb_layers import apply_rotary_emb
+from .mlp_layers import MLP, MLPEmbedder, FinalLayer
+from .modulate_layers import ModulateDiT, modulate, modulate_ , apply_gate, apply_gate_and_accumulate_
+from .token_refiner import SingleTokenRefiner
+import numpy as np
+from mmgp import offload
+from wan.modules.attention import pay_attention
+
+def get_linear_split_map():
+    """Describe how the fused "img_attn_qkv" and "linear1" projections split into named sub-modules."""
+    hidden_size = 3072
+    return {
+        "img_attn_qkv": {"mapped_modules": ["img_attn_q", "img_attn_k", "img_attn_v"], "split_sizes": [hidden_size] * 3},
+        "linear1": {"mapped_modules": ["linear1_attn_q", "linear1_attn_k", "linear1_attn_v", "linear1_mlp"], "split_sizes": [hidden_size] * 3 + [7 * hidden_size - 3 * hidden_size]},
+    }
+try:
+ from xformers.ops.fmha.attn_bias import BlockDiagonalPaddedKeysMask
+except ImportError:
+ BlockDiagonalPaddedKeysMask = None
+
+
+class MMDoubleStreamBlock(nn.Module):
+    """
+    A multimodal dit block with separate modulation for
+    text and image/video, see more details (SD3): https://arxiv.org/abs/2403.03206
+    (Flux.1): https://github.com/black-forest-labs/flux
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        heads_num: int,
+        mlp_width_ratio: float,
+        mlp_act_type: str = "gelu_tanh",
+        qk_norm: bool = True,
+        qk_norm_type: str = "rms",
+        qkv_bias: bool = False,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+        attention_mode: str = "sdpa",
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+
+        self.attention_mode = attention_mode  # stored only; not read anywhere in this class
+        self.deterministic = False  # toggled by enable_/disable_deterministic; not read in this class
+        self.heads_num = heads_num
+        head_dim = hidden_size // heads_num
+        mlp_hidden_dim = int(hidden_size * mlp_width_ratio)
+
+        self.img_mod = ModulateDiT(
+            hidden_size,
+            factor=6,  # forward() chunks the output into 6 parts: shift/scale/gate for attention and for the MLP
+            act_layer=get_activation_layer("silu"),
+            **factory_kwargs,
+        )
+        self.img_norm1 = nn.LayerNorm(
+            hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs
+        )
+
+        self.img_attn_qkv = nn.Linear(
+            hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs
+        )
+        qk_norm_layer = get_norm_layer(qk_norm_type)
+        self.img_attn_q_norm = (
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+            if qk_norm
+            else nn.Identity()  # NOTE(review): forward() calls .apply_() on this norm; nn.Identity has no such method - confirm qk_norm=False is unused
+        )
+        self.img_attn_k_norm = (
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+            if qk_norm
+            else nn.Identity()
+        )
+        self.img_attn_proj = nn.Linear(
+            hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs
+        )
+
+        self.img_norm2 = nn.LayerNorm(
+            hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs
+        )
+        self.img_mlp = MLP(
+            hidden_size,
+            mlp_hidden_dim,
+            act_layer=get_activation_layer(mlp_act_type),
+            bias=True,
+            **factory_kwargs,
+        )
+
+        self.txt_mod = ModulateDiT(
+            hidden_size,
+            factor=6,  # same 6-way split as img_mod, for the text stream
+            act_layer=get_activation_layer("silu"),
+            **factory_kwargs,
+        )
+        self.txt_norm1 = nn.LayerNorm(
+            hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs
+        )
+
+        self.txt_attn_qkv = nn.Linear(
+            hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs
+        )
+        self.txt_attn_q_norm = (
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+            if qk_norm
+            else nn.Identity()
+        )
+        self.txt_attn_k_norm = (
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+            if qk_norm
+            else nn.Identity()
+        )
+        self.txt_attn_proj = nn.Linear(
+            hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs
+        )
+
+        self.txt_norm2 = nn.LayerNorm(
+            hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs
+        )
+        self.txt_mlp = MLP(
+            hidden_size,
+            mlp_hidden_dim,
+            act_layer=get_activation_layer(mlp_act_type),
+            bias=True,
+            **factory_kwargs,
+        )
+        self.hybrid_seq_parallel_attn = None  # placeholder attribute; not read in this class
+
+    def enable_deterministic(self):
+        self.deterministic = True
+
+    def disable_deterministic(self):
+        self.deterministic = False
+
+    def forward(
+        self,
+        img: torch.Tensor,
+        txt: torch.Tensor,
+        vec: torch.Tensor,
+        attn_mask = None,
+        seqlens_q: Optional[torch.Tensor] = None,
+        seqlens_kv: Optional[torch.Tensor] = None,
+        freqs_cis: tuple = None,
+        condition_type: str = None,
+        token_replace_vec: torch.Tensor = None,
+        frist_frame_token_num: int = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Modulate img / txt, attend over their concatenation, run each stream's MLP, and return (img, txt)."""
+        if condition_type == "token_replace":
+            img_mod1, token_replace_img_mod1 = self.img_mod(vec, condition_type=condition_type, \
+                                                            token_replace_vec=token_replace_vec)
+            (img_mod1_shift,
+             img_mod1_scale,
+             img_mod1_gate,
+             img_mod2_shift,
+             img_mod2_scale,
+             img_mod2_gate) = img_mod1.chunk(6, dim=-1)
+            (tr_img_mod1_shift,
+             tr_img_mod1_scale,
+             tr_img_mod1_gate,
+             tr_img_mod2_shift,
+             tr_img_mod2_scale,
+             tr_img_mod2_gate) = token_replace_img_mod1.chunk(6, dim=-1)
+        else:
+            (
+                img_mod1_shift,
+                img_mod1_scale,
+                img_mod1_gate,
+                img_mod2_shift,
+                img_mod2_scale,
+                img_mod2_gate,
+            ) = self.img_mod(vec).chunk(6, dim=-1)
+        (
+            txt_mod1_shift,
+            txt_mod1_scale,
+            txt_mod1_gate,
+            txt_mod2_shift,
+            txt_mod2_scale,
+            txt_mod2_gate,
+        ) = self.txt_mod(vec).chunk(6, dim=-1)
+
+        ##### Enjoy these spaghetti VRAM optimizations done by DeepBeepMeep !
+        # I am sure you are a nice person and as you copy this code, you will give me officially proper credits:
+        # Please link to https://github.com/deepbeepmeep/HunyuanVideoGP and @deepbeepmeep on twitter
+
+        # Prepare image for attention.
+        img_modulated = self.img_norm1(img)
+        img_modulated = img_modulated.to(torch.bfloat16)  # hard-coded bfloat16, independent of the dtype given to __init__
+
+        if condition_type == "token_replace":
+            modulate_(img_modulated[:, :frist_frame_token_num], shift=tr_img_mod1_shift, scale=tr_img_mod1_scale)  # first-frame tokens use the token_replace modulation
+            modulate_(img_modulated[:, frist_frame_token_num:], shift=img_mod1_shift, scale=img_mod1_scale)
+        else:
+            modulate_( img_modulated, shift=img_mod1_shift, scale=img_mod1_scale )  # return value unused; presumably modifies its first argument in place - confirm
+
+        shape = (*img_modulated.shape[:2], self.heads_num, int(img_modulated.shape[-1] / self.heads_num) )
+        img_q = self.img_attn_q(img_modulated).view(*shape)  # NOTE(review): img_attn_q/k/v are not created in __init__; presumably split out of img_attn_qkv per get_linear_split_map() - confirm
+        img_k = self.img_attn_k(img_modulated).view(*shape)
+        img_v = self.img_attn_v(img_modulated).view(*shape)
+        del img_modulated
+
+        # Apply QK-Norm if needed
+        self.img_attn_q_norm.apply_(img_q).to(img_v)  # the result of .to(...) is discarded; apply_ presumably normalizes in place - confirm
+        img_q_len = img_q.shape[1]  # not used below
+        self.img_attn_k_norm.apply_(img_k).to(img_v)
+        img_kv_len= img_k.shape[1]  # not used below
+        batch_size = img_k.shape[0]  # not used below
+        # Apply RoPE if needed.
+        qklist = [img_q, img_k]
+        del img_q, img_k
+        img_q, img_k = apply_rotary_emb(qklist, freqs_cis, head_first=False)
+        # Prepare txt for attention.
+        txt_modulated = self.txt_norm1(txt)
+        modulate_(txt_modulated, shift=txt_mod1_shift, scale=txt_mod1_scale )
+
+        txt_qkv = self.txt_attn_qkv(txt_modulated)
+        del txt_modulated
+        txt_q, txt_k, txt_v = rearrange(
+            txt_qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num
+        )
+        del txt_qkv
+        # Apply QK-Norm if needed.
+        self.txt_attn_q_norm.apply_(txt_q).to(txt_v)
+        self.txt_attn_k_norm.apply_(txt_k).to(txt_v)
+
+        # Run actual attention.
+        q = torch.cat((img_q, txt_q), dim=1)  # image tokens first, text tokens after them
+        del img_q, txt_q
+        k = torch.cat((img_k, txt_k), dim=1)
+        del img_k, txt_k
+        v = torch.cat((img_v, txt_v), dim=1)
+        del img_v, txt_v
+
+        # attention computation start
+        qkv_list = [q,k,v]  # handed over as a list and the local names deleted, so only the list refers to q/k/v
+        del q, k, v
+
+        attn = pay_attention(
+            qkv_list,
+            attention_mask=attn_mask,
+            q_lens=seqlens_q,
+            k_lens=seqlens_kv,
+        )
+        b, s, a, d = attn.shape
+        attn = attn.reshape(b, s, -1)  # merge the last two dims
+        del qkv_list
+
+        # attention computation end
+
+        img_attn, txt_attn = attn[:, : img.shape[1]], attn[:, img.shape[1] :]  # split back at the image length
+        del attn
+        # Calculate the img blocks.
+
+        if condition_type == "token_replace":
+            img_attn = self.img_attn_proj(img_attn)
+            apply_gate_and_accumulate_(img[:, :frist_frame_token_num], img_attn[:, :frist_frame_token_num], gate=tr_img_mod1_gate)
+            apply_gate_and_accumulate_(img[:, frist_frame_token_num:], img_attn[:, frist_frame_token_num:], gate=img_mod1_gate)
+            del img_attn
+            img_modulated = self.img_norm2(img)
+            img_modulated = img_modulated.to(torch.bfloat16)
+            modulate_( img_modulated[:, :frist_frame_token_num], shift=tr_img_mod2_shift, scale=tr_img_mod2_scale)
+            modulate_( img_modulated[:, frist_frame_token_num:], shift=img_mod2_shift, scale=img_mod2_scale)
+            self.img_mlp.apply_(img_modulated)
+            apply_gate_and_accumulate_(img[:, :frist_frame_token_num], img_modulated[:, :frist_frame_token_num], gate=tr_img_mod2_gate)
+            apply_gate_and_accumulate_(img[:, frist_frame_token_num:], img_modulated[:, frist_frame_token_num:], gate=img_mod2_gate)
+            del img_modulated
+        else:
+            img_attn = self.img_attn_proj(img_attn)
+            apply_gate_and_accumulate_(img, img_attn, gate=img_mod1_gate)  # return value unused; img is presumably updated in place - confirm
+            del img_attn
+            img_modulated = self.img_norm2(img)
+            img_modulated = img_modulated.to(torch.bfloat16)
+            modulate_( img_modulated , shift=img_mod2_shift, scale=img_mod2_scale)
+            self.img_mlp.apply_(img_modulated)  # chunked MLP that overwrites img_modulated with its own output
+            apply_gate_and_accumulate_(img, img_modulated, gate=img_mod2_gate)
+            del img_modulated
+
+        # Calculate the txt blocks.
+        txt_attn = self.txt_attn_proj(txt_attn)
+        apply_gate_and_accumulate_(txt, txt_attn, gate=txt_mod1_gate)
+        del txt_attn
+        txt_modulated = self.txt_norm2(txt)
+        txt_modulated = txt_modulated.to(torch.bfloat16)
+        modulate_(txt_modulated, shift=txt_mod2_shift, scale=txt_mod2_scale)
+        txt_mlp = self.txt_mlp(txt_modulated)  # the text stream uses the regular, non-chunked MLP call
+        del txt_modulated
+        apply_gate_and_accumulate_(txt, txt_mlp, gate=txt_mod2_gate)
+        return img, txt
+
+
+class MMSingleStreamBlock(nn.Module):
+    """
+    A DiT block with parallel linear layers as described in
+    https://arxiv.org/abs/2302.05442 and adapted modulation interface.
+    Also refer to (SD3): https://arxiv.org/abs/2403.03206
+    (Flux.1): https://github.com/black-forest-labs/flux
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        heads_num: int,
+        mlp_width_ratio: float = 4.0,
+        mlp_act_type: str = "gelu_tanh",
+        qk_norm: bool = True,
+        qk_norm_type: str = "rms",
+        qk_scale: float = None,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+        attention_mode: str = "sdpa",
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.attention_mode = attention_mode  # stored only; not read anywhere in this class
+        self.deterministic = False  # toggled by enable_/disable_deterministic; not read in this class
+        self.hidden_size = hidden_size
+        self.heads_num = heads_num
+        head_dim = hidden_size // heads_num
+        mlp_hidden_dim = int(hidden_size * mlp_width_ratio)
+        self.mlp_hidden_dim = mlp_hidden_dim
+        self.scale = qk_scale or head_dim ** -0.5  # stored only; forward() does not pass it to pay_attention
+
+        # qkv and mlp_in
+        self.linear1 = nn.Linear(
+            hidden_size, hidden_size * 3 + mlp_hidden_dim, **factory_kwargs
+        )
+        # proj and mlp_out
+        self.linear2 = nn.Linear(
+            hidden_size + mlp_hidden_dim, hidden_size, **factory_kwargs
+        )
+
+        qk_norm_layer = get_norm_layer(qk_norm_type)
+        self.q_norm = (
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+            if qk_norm
+            else nn.Identity()  # NOTE(review): forward() calls .apply_() on this norm; nn.Identity has no such method - confirm qk_norm=False is unused
+        )
+        self.k_norm = (
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+            if qk_norm
+            else nn.Identity()
+        )
+
+        self.pre_norm = nn.LayerNorm(
+            hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs
+        )
+
+        self.mlp_act = get_activation_layer(mlp_act_type)()
+        self.modulation = ModulateDiT(
+            hidden_size,
+            factor=3,  # forward() chunks the output into 3 parts: shift, scale and gate
+            act_layer=get_activation_layer("silu"),
+            **factory_kwargs,
+        )
+        self.hybrid_seq_parallel_attn = None  # placeholder attribute; not read in this class
+
+    def enable_deterministic(self):
+        self.deterministic = True
+
+    def disable_deterministic(self):
+        self.deterministic = False
+
+    def forward(
+        self,
+        # img and txt are passed as two separate tensors rather than one concatenated x
+        img: torch.Tensor,
+        txt: torch.Tensor,
+        vec: torch.Tensor,
+        txt_len: int,
+        attn_mask= None,
+        seqlens_q: Optional[torch.Tensor] = None,
+        seqlens_kv: Optional[torch.Tensor] = None,
+        freqs_cis: Tuple[torch.Tensor, torch.Tensor] = None,
+        condition_type: str = None,
+        token_replace_vec: torch.Tensor = None,
+        frist_frame_token_num: int = None,
+    ) -> torch.Tensor:  # NOTE(review): the method actually returns the tuple (img, txt)
+        """Modulate img / txt with shared weights, attend over their concatenation, fuse attention and MLP through linear2, and return (img, txt)."""
+        ##### More spaghetti VRAM optimizations done by DeepBeepMeep !
+        # I am sure you are a nice person and as you copy this code, you will give me proper credits:
+        # Please link to https://github.com/deepbeepmeep/HunyuanVideoGP and @deepbeepmeep on twitter
+
+        if condition_type == "token_replace":
+            mod, tr_mod = self.modulation(vec,
+                                          condition_type=condition_type,
+                                          token_replace_vec=token_replace_vec)
+            (mod_shift,
+             mod_scale,
+             mod_gate) = mod.chunk(3, dim=-1)
+            (tr_mod_shift,
+             tr_mod_scale,
+             tr_mod_gate) = tr_mod.chunk(3, dim=-1)
+        else:
+            mod_shift, mod_scale, mod_gate = self.modulation(vec).chunk(3, dim=-1)
+
+        img_mod = self.pre_norm(img)
+        img_mod = img_mod.to(torch.bfloat16)  # hard-coded bfloat16, independent of the dtype given to __init__
+        if condition_type == "token_replace":
+            modulate_(img_mod[:, :frist_frame_token_num], shift=tr_mod_shift, scale=tr_mod_scale)  # first-frame tokens use the token_replace modulation
+            modulate_(img_mod[:, frist_frame_token_num:], shift=mod_shift, scale=mod_scale)
+        else:
+            modulate_(img_mod, shift=mod_shift, scale=mod_scale)  # return value unused; presumably modifies its first argument in place - confirm
+        txt_mod = self.pre_norm(txt)
+        txt_mod = txt_mod.to(torch.bfloat16)
+        modulate_(txt_mod, shift=mod_shift, scale=mod_scale)
+
+        shape = (*img_mod.shape[:2], self.heads_num, int(img_mod.shape[-1] / self.heads_num) )
+        img_q = self.linear1_attn_q(img_mod).view(*shape)  # NOTE(review): linear1_attn_q/k/v and linear1_mlp are not created in __init__; presumably split out of linear1 per get_linear_split_map() - confirm
+        img_k = self.linear1_attn_k(img_mod).view(*shape)
+        img_v = self.linear1_attn_v(img_mod).view(*shape)
+
+        shape = (*txt_mod.shape[:2], self.heads_num, int(txt_mod.shape[-1] / self.heads_num) )
+        txt_q = self.linear1_attn_q(txt_mod).view(*shape)
+        txt_k = self.linear1_attn_k(txt_mod).view(*shape)
+        txt_v = self.linear1_attn_v(txt_mod).view(*shape)
+
+        batch_size = img_mod.shape[0]  # not used below
+
+        # Apply QK-Norm if needed.
+        # apply_ is called for its effect on the argument; its return value is discarded (presumably in place - confirm).
+        self.q_norm.apply_(img_q)
+        self.k_norm.apply_(img_k)
+        self.q_norm.apply_(txt_q)
+        self.k_norm.apply_(txt_k)
+
+        qklist = [img_q, img_k]
+        del img_q, img_k
+        img_q, img_k = apply_rotary_emb(qklist, freqs_cis, head_first=False)  # rotary embedding is applied to the image q/k only
+        img_q_len=img_q.shape[1]  # not used below
+        q = torch.cat((img_q, txt_q), dim=1)  # image tokens first, text tokens after them
+        del img_q, txt_q
+        k = torch.cat((img_k, txt_k), dim=1)
+        img_kv_len=img_k.shape[1]  # not used below
+        del img_k, txt_k
+
+        v = torch.cat((img_v, txt_v), dim=1)
+        del img_v, txt_v
+
+        # attention computation start
+        qkv_list = [q,k,v]  # handed over as a list and the local names deleted, so only the list refers to q/k/v
+        del q, k, v
+        attn = pay_attention(
+            qkv_list,
+            attention_mask=attn_mask,
+            q_lens = seqlens_q,
+            k_lens = seqlens_kv,
+        )
+        b, s, a, d = attn.shape
+        attn = attn.reshape(b, s, -1)  # merge the last two dims
+        del qkv_list
+        # attention computation end
+
+        x_mod = torch.cat((img_mod, txt_mod), 1)
+        del img_mod, txt_mod
+        x_mod_shape = x_mod.shape
+        x_mod = x_mod.view(-1, x_mod.shape[-1])  # flatten to rows so the MLP can run chunk by chunk
+        chunk_size = int(x_mod_shape[1]/6)  # NOTE(review): evaluates to 0 when the joint sequence has fewer than 6 positions
+        x_chunks = torch.split(x_mod, chunk_size)
+        attn = attn.view(-1, attn.shape[-1])
+        attn_chunks =torch.split(attn, chunk_size)
+        for x_chunk, attn_chunk in zip(x_chunks, attn_chunks):
+            mlp_chunk = self.linear1_mlp(x_chunk)
+            mlp_chunk = self.mlp_act(mlp_chunk)
+            attn_mlp_chunk = torch.cat((attn_chunk, mlp_chunk), -1)
+            del attn_chunk, mlp_chunk
+            x_chunk[...] = self.linear2(attn_mlp_chunk)  # the block output overwrites the modulated input rows
+            del attn_mlp_chunk
+        x_mod = x_mod.view(x_mod_shape)
+
+        if condition_type == "token_replace":
+            apply_gate_and_accumulate_(img[:, :frist_frame_token_num, :], x_mod[:, :frist_frame_token_num, :], gate=tr_mod_gate)
+            apply_gate_and_accumulate_(img[:, frist_frame_token_num:, :], x_mod[:, frist_frame_token_num:-txt_len, :], gate=mod_gate)
+        else:
+            apply_gate_and_accumulate_(img, x_mod[:, :-txt_len, :], gate=mod_gate)  # everything except the last txt_len rows belongs to img
+
+        apply_gate_and_accumulate_(txt, x_mod[:, -txt_len:, :], gate=mod_gate)  # the last txt_len rows belong to txt
+
+        return img, txt
+
+class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
+ @staticmethod
+ def preprocess_loras(model_filename, sd):
+ """Rename LoRA state-dict keys for model files whose name contains "i2v".
+
+ Returns ``sd`` itself when "i2v" does not occur in ``model_filename``; otherwise
+ returns a new dict that maps converted key names to the original values.
+
+ NOTE(review): indentation is not visible in this patch text, so which of the
+ statements below are guarded by the ``startswith`` check (and by the
+ "individual_token_refiner" check) must be confirmed against the real file.
+ """
+ if not "i2v" in model_filename:
+ return sd
+ new_sd = {}
+ for k,v in sd.items():
+ # Module-name fragments that are joined with "_" in the source naming and
+ # with "." in the target naming; src_list/tgt_list pair both join positions.
+ repl_list = ["double_blocks", "single_blocks", "final_layer", "img_mlp", "img_attn_qkv", "img_attn_proj","img_mod", "txt_mlp", "txt_attn_qkv","txt_attn_proj", "txt_mod", "linear1",
+ "linear2", "modulation", "mlp_fc1"]
+ src_list = [k +"_" for k in repl_list] + ["_" + k for k in repl_list]
+ tgt_list = [k +"." for k in repl_list] + ["." + k for k in repl_list]
+ if k.startswith("Hunyuan_video_I2V_lora_"):
+ # crappy conversion script for non reversible lora naming
+ k = k.replace("Hunyuan_video_I2V_lora_","diffusion_model.")
+ k = k.replace("lora_up","lora_B")
+ k = k.replace("lora_down","lora_A")
+ # No-op branch: nothing is done for "txt_in_individual" keys here.
+ if "txt_in_individual" in k:
+ pass
+ for s,t in zip(src_list, tgt_list):
+ k = k.replace(s,t)
+ if "individual_token_refiner" in k:
+ k = k.replace("txt_in_individual_token_refiner_blocks_", "txt_in.individual_token_refiner.blocks.")
+ k = k.replace("_mlp_fc", ".mlp.fc",)
+ k = k.replace(".mlp_fc", ".mlp.fc",)
+ new_sd[k] = v
+ return new_sd
+ """
+ HunyuanVideo Transformer backbone
+
+ Inherited from ModelMixin and ConfigMixin for compatibility with diffusers' sampler StableDiffusionPipeline.
+
+ Reference:
+ [1] Flux.1: https://github.com/black-forest-labs/flux
+ [2] MMDiT: http://arxiv.org/abs/2403.03206
+
+ Parameters
+ ----------
+ args: argparse.Namespace
+ The arguments parsed by argparse.
+ patch_size: list
+ The size of the patch.
+ in_channels: int
+ The number of input channels.
+ out_channels: int
+ The number of output channels.
+ hidden_size: int
+ The hidden size of the transformer backbone.
+ heads_num: int
+ The number of attention heads.
+ mlp_width_ratio: float
+ The ratio of the hidden size of the MLP in the transformer block.
+ mlp_act_type: str
+ The activation function of the MLP in the transformer block.
+ mm_double_blocks_depth: int
+ The number of transformer blocks in the double blocks.
+ mm_single_blocks_depth: int
+ The number of transformer blocks in the single blocks.
+ rope_dim_list: list
+ The dimension of the rotary embedding for t, h, w.
+ qkv_bias: bool
+ Whether to use bias in the qkv linear layer.
+ qk_norm: bool
+ Whether to use qk norm.
+ qk_norm_type: str
+ The type of qk norm.
+ guidance_embed: bool
+ Whether to use guidance embedding for distillation.
+ text_projection: str
+ The type of the text projection, default is single_refiner.
+ use_attention_mask: bool
+ Whether to use attention mask for text encoder.
+ dtype: torch.dtype
+ The dtype of the model.
+ device: torch.device
+ The device of the model.
+ """
+
+ @register_to_config
+ def __init__(
+ self,
+ i2v_condition_type,
+ patch_size: list = [1, 2, 2],
+ in_channels: int = 4, # Should be VAE.config.latent_channels.
+ out_channels: int = None,
+ hidden_size: int = 3072,
+ heads_num: int = 24,
+ mlp_width_ratio: float = 4.0,
+ mlp_act_type: str = "gelu_tanh",
+ mm_double_blocks_depth: int = 20,
+ mm_single_blocks_depth: int = 40,
+ rope_dim_list: List[int] = [16, 56, 56],
+ qkv_bias: bool = True,
+ qk_norm: bool = True,
+ qk_norm_type: str = "rms",
+ guidance_embed: bool = False, # For modulation.
+ text_projection: str = "single_refiner",
+ use_attention_mask: bool = True,
+ dtype: Optional[torch.dtype] = None,
+ device: Optional[torch.device] = None,
+ attention_mode: Optional[str] = "sdpa"
+ ):
+ """Build the embedders, the double/single block stacks and the final layer.
+
+ ``out_channels`` falls back to ``in_channels`` when it is ``None``.
+ ``device`` and ``dtype`` are forwarded to every sub-module via ``factory_kwargs``.
+
+ Raises:
+ ValueError: if ``hidden_size`` is not divisible by ``heads_num``, or if
+ ``sum(rope_dim_list)`` differs from ``hidden_size // heads_num``.
+ NotImplementedError: if ``text_projection`` is neither "linear" nor
+ "single_refiner".
+ """
+ factory_kwargs = {"device": device, "dtype": dtype}
+ super().__init__()
+
+ # mm_double_blocks_depth , mm_single_blocks_depth = 5, 5
+
+ self.patch_size = patch_size
+ self.in_channels = in_channels
+ self.out_channels = in_channels if out_channels is None else out_channels
+ self.unpatchify_channels = self.out_channels
+ self.guidance_embed = guidance_embed
+ self.rope_dim_list = rope_dim_list
+ self.i2v_condition_type = i2v_condition_type
+ self.attention_mode = attention_mode
+
+ # Text projection. Default to linear projection.
+ # Alternative: TokenRefiner. See more details (LI-DiT): http://arxiv.org/abs/2406.11831
+ self.use_attention_mask = use_attention_mask
+ self.text_projection = text_projection
+
+ # Input widths of the two text embeddings: text_states_dim feeds txt_in,
+ # text_states_dim_2 feeds vector_in.
+ self.text_states_dim = 4096
+ self.text_states_dim_2 = 768
+
+ if hidden_size % heads_num != 0:
+ raise ValueError(
+ f"Hidden size {hidden_size} must be divisible by heads_num {heads_num}"
+ )
+ # The rotary dimensions must add up to the per-head width.
+ pe_dim = hidden_size // heads_num
+ if sum(rope_dim_list) != pe_dim:
+ raise ValueError(
+ f"Got {rope_dim_list} but expected positional dim {pe_dim}"
+ )
+ self.hidden_size = hidden_size
+ self.heads_num = heads_num
+
+ # image projection
+ self.img_in = PatchEmbed(
+ self.patch_size, self.in_channels, self.hidden_size, **factory_kwargs
+ )
+
+ # text projection
+ if self.text_projection == "linear":
+ self.txt_in = TextProjection(
+ self.text_states_dim,
+ self.hidden_size,
+ get_activation_layer("silu"),
+ **factory_kwargs,
+ )
+ elif self.text_projection == "single_refiner":
+ self.txt_in = SingleTokenRefiner(
+ self.text_states_dim, hidden_size, heads_num, depth=2, **factory_kwargs
+ )
+ else:
+ raise NotImplementedError(
+ f"Unsupported text_projection: {self.text_projection}"
+ )
+
+ # time modulation
+ self.time_in = TimestepEmbedder(
+ self.hidden_size, get_activation_layer("silu"), **factory_kwargs
+ )
+
+ # text modulation
+ self.vector_in = MLPEmbedder(
+ self.text_states_dim_2, self.hidden_size, **factory_kwargs
+ )
+
+ # guidance modulation
+ # guidance_in stays None unless guidance_embed is true.
+ self.guidance_in = (
+ TimestepEmbedder(
+ self.hidden_size, get_activation_layer("silu"), **factory_kwargs
+ )
+ if guidance_embed
+ else None
+ )
+
+ # double blocks
+ self.double_blocks = nn.ModuleList(
+ [
+ MMDoubleStreamBlock(
+ self.hidden_size,
+ self.heads_num,
+ mlp_width_ratio=mlp_width_ratio,
+ mlp_act_type=mlp_act_type,
+ qk_norm=qk_norm,
+ qk_norm_type=qk_norm_type,
+ qkv_bias=qkv_bias,
+ attention_mode = attention_mode,
+ **factory_kwargs,
+ )
+ for _ in range(mm_double_blocks_depth)
+ ]
+ )
+
+ # single blocks
+ self.single_blocks = nn.ModuleList(
+ [
+ MMSingleStreamBlock(
+ self.hidden_size,
+ self.heads_num,
+ mlp_width_ratio=mlp_width_ratio,
+ mlp_act_type=mlp_act_type,
+ qk_norm=qk_norm,
+ qk_norm_type=qk_norm_type,
+ attention_mode = attention_mode,
+ **factory_kwargs,
+ )
+ for _ in range(mm_single_blocks_depth)
+ ]
+ )
+
+ self.final_layer = FinalLayer(
+ self.hidden_size,
+ self.patch_size,
+ self.out_channels,
+ get_activation_layer("silu"),
+ **factory_kwargs,
+ )
+
+
+ def lock_layers_dtypes(self, dtype = torch.float32):
+ """Tag the final-layer modules with ``_lock_dtype`` and convert their parameters to ``dtype``.
+
+ The affected modules are ``final_layer``, ``final_layer.linear`` and
+ ``final_layer.adaLN_modulation[1]``; ``self._lock_dtype`` is set as well.
+
+ NOTE(review): nesting is not visible in this patch text; confirm whether the bias
+ conversion sits inside the weight-dtype check. ``hasattr(layer, "bias")`` is
+ presumably also true for a module whose ``bias`` is ``None``, in which case
+ ``.data`` would fail; verify the listed layers always carry a bias.
+ """
+ layer_list = [self.final_layer, self.final_layer.linear, self.final_layer.adaLN_modulation[1]]
+ target_dype= dtype
+
+ # The zip pairs a single layer list with a single dtype, so the outer loop runs once.
+ for current_layer_list, current_dtype in zip([layer_list], [target_dype]):
+ for layer in current_layer_list:
+ layer._lock_dtype = dtype
+
+ if hasattr(layer, "weight") and layer.weight.dtype != current_dtype :
+ layer.weight.data = layer.weight.data.to(current_dtype)
+ if hasattr(layer, "bias"):
+ layer.bias.data = layer.bias.data.to(current_dtype)
+
+ self._lock_dtype = dtype
+
+ def enable_deterministic(self):
+     """Call ``enable_deterministic()`` on every double block, then on every single block."""
+     for stack in (self.double_blocks, self.single_blocks):
+         for blk in stack:
+             blk.enable_deterministic()
+
+ def disable_deterministic(self):
+     """Call ``disable_deterministic()`` on every double block, then on every single block."""
+     for stack in (self.double_blocks, self.single_blocks):
+         for blk in stack:
+             blk.disable_deterministic()
+
+ def forward(
+ self,
+ x: torch.Tensor,
+ t: torch.Tensor, # Should be in range(0, 1000).
+ ref_latents: torch.Tensor=None,
+ text_states: torch.Tensor = None,
+ text_mask: torch.Tensor = None, # Now we don't use it.
+ text_states_2: Optional[torch.Tensor] = None, # Text embedding for modulation.
+ freqs_cos: Optional[torch.Tensor] = None,
+ freqs_sin: Optional[torch.Tensor] = None,
+ guidance: torch.Tensor = None, # Guidance for modulation, should be cfg_scale x 1000.
+ pipeline=None,
+ x_id = 0,
+ callback = None,
+ ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
+ """Run the transformer on latents ``x`` conditioned on ``text_states`` and timestep ``t``.
+
+ ``x`` is unpacked as a 5-D tensor ``(batch, _, ot, oh, ow)``. The result is the
+ ``unpatchify`` output of the final layer, or ``None`` when ``pipeline._interrupt``
+ is set while iterating over the blocks.
+
+ Raises:
+ ValueError: if ``self.guidance_embed`` is true and ``guidance`` is ``None``.
+ NotImplementedError: if ``self.text_projection`` is unsupported.
+
+ NOTE(review): the method reads ``self.enable_teacache``, ``self.teacache_counter``,
+ ``self.teacache_start_step``, ``self.num_steps``, ``self.rel_l1_thresh``,
+ ``self.teacache_skipped_steps`` and ``self.previous_residual``, none of which
+ ``__init__`` assigns; presumably the caller sets them before the first call.
+ NOTE(review): ``pipeline`` and ``text_mask`` default to ``None`` but are used
+ without a ``None`` check (``pipeline._interrupt``, ``text_mask.sum(1)``).
+ NOTE(review): nesting is not visible in this patch text; the teacache section in
+ particular should be read against the real file.
+ """
+
+ img = x
+ batch_no, _, ot, oh, ow = x.shape
+ del x
+ txt = text_states
+ # Token-grid extents after patchification; reused by unpatchify at the end.
+ tt, th, tw = (
+ ot // self.patch_size[0],
+ oh // self.patch_size[1],
+ ow // self.patch_size[2],
+ )
+
+ # Prepare modulation vectors.
+ vec = self.time_in(t)
+
+ if self.i2v_condition_type == "token_replace":
+ # A second modulation vector is embedded from an all-zero timestep and used
+ # for the first th * tw tokens.
+ token_replace_t = torch.zeros_like(t)
+ token_replace_vec = self.time_in(token_replace_t)
+ frist_frame_token_num = th * tw
+ else:
+ token_replace_vec = None
+ frist_frame_token_num = None
+ # token_replace_mask_img = None
+ # token_replace_mask_txt = None
+
+ # text modulation
+ # vec = vec + self.vector_in(text_states_2)
+ vec_2 = self.vector_in(text_states_2)
+ del text_states_2
+ vec += vec_2
+ if self.i2v_condition_type == "token_replace":
+ token_replace_vec += vec_2
+ del vec_2
+
+ # guidance modulation
+ if self.guidance_embed:
+ if guidance is None:
+ raise ValueError(
+ "Didn't get guidance strength for guidance distilled model."
+ )
+
+ # our timestep_embedding is merged into guidance_in(TimestepEmbedder)
+ vec = vec + self.guidance_in(guidance)
+
+ # Embed image and text.
+ img = self.img_in(img)
+ if ref_latents != None:
+ ref_latents = self.img_in(ref_latents)
+ if self.text_projection == "linear":
+ txt = self.txt_in(txt)
+ elif self.text_projection == "single_refiner":
+ txt = self.txt_in(txt, t, text_mask if self.use_attention_mask else None)
+ else:
+ raise NotImplementedError(
+ f"Unsupported text_projection: {self.text_projection}"
+ )
+
+ # Reference tokens, when given, are prepended to the image tokens and removed
+ # again before the final layer.
+ if ref_latents == None:
+ ref_length = None
+ else:
+ ref_length = ref_latents.shape[-2]
+ img = torch.cat([ref_latents, img], dim=-2) # t c
+ txt_seq_len = txt.shape[1]
+ img_seq_len = img.shape[1]
+
+ text_len = text_mask.sum(1)
+ total_len = text_len + img_seq_len
+ seqlens_q = seqlens_kv = total_len
+ attn_mask = None
+
+ freqs_cis = (freqs_cos, freqs_sin) if freqs_cos is not None else None
+
+
+ # Teacache: decide from the change of the first block's modulated input whether
+ # the blocks must be evaluated (should_calc) or the stored residual can be reused.
+ if self.enable_teacache:
+ if x_id == 0:
+ self.should_calc = True
+ inp = img[0:1]
+ vec_ = vec
+ ( img_mod1_shift, img_mod1_scale, _ , _ , _ , _ , ) = self.double_blocks[0].img_mod(vec_).chunk(6, dim=-1)
+ normed_inp = self.double_blocks[0].img_norm1(inp)
+ normed_inp = normed_inp.to(torch.bfloat16)
+ modulated_inp = modulate( normed_inp, shift=img_mod1_shift, scale=img_mod1_scale )
+ del normed_inp, img_mod1_shift, img_mod1_scale
+ if self.teacache_counter <= self.teacache_start_step or self.teacache_counter == self.num_steps-1:
+ self.accumulated_rel_l1_distance = 0
+ else:
+ # Polynomial applied to the relative L1 change before it is accumulated.
+ coefficients = [7.33226126e+02, -4.01131952e+02, 6.75869174e+01, -3.14987800e+00, 9.61237896e-02]
+ rescale_func = np.poly1d(coefficients)
+ self.accumulated_rel_l1_distance += rescale_func(((modulated_inp-self.previous_modulated_input).abs().mean() / self.previous_modulated_input.abs().mean()).cpu().item())
+ if self.accumulated_rel_l1_distance < self.rel_l1_thresh:
+ self.should_calc = False
+ self.teacache_skipped_steps += 1
+ else:
+ self.accumulated_rel_l1_distance = 0
+ self.previous_modulated_input = modulated_inp
+ self.teacache_counter += 1
+ if self.teacache_counter == self.num_steps:
+ self.teacache_counter = 0
+ else:
+ self.should_calc = True
+
+ if not self.should_calc:
+ img += self.previous_residual[x_id]
+ else:
+ if self.enable_teacache:
+ self.previous_residual[x_id] = None
+ ori_img = img[0:1].clone()
+ # --------------------- Pass through DiT blocks ------------------------
+ # Each block is applied to one batch item at a time (slices img[i:i+1]).
+ for _, block in enumerate(self.double_blocks):
+ for i in range(len(img)):
+ if callback != None:
+ callback(-1, None, False, True)
+ if pipeline._interrupt:
+ return None
+ double_block_args = [
+ img[i:i+1],
+ txt[i:i+1],
+ vec[i:i+1],
+ attn_mask,
+ seqlens_q[i:i+1],
+ seqlens_kv[i:i+1],
+ freqs_cis,
+ self.i2v_condition_type,
+ token_replace_vec,
+ frist_frame_token_num,
+ ]
+
+ img[i], txt[i] = block(*double_block_args)
+ double_block_args = None
+
+ for _, block in enumerate(self.single_blocks):
+ for i in range(len(img)):
+ if callback != None:
+ callback(-1, None, False, True)
+ if pipeline._interrupt:
+ return None
+ # Unlike the double blocks, the raw (freqs_cos, freqs_sin) tuple is passed
+ # here rather than freqs_cis.
+ single_block_args = [
+ # x,
+ img[i:i+1],
+ txt[i:i+1],
+ vec[i:i+1],
+ txt_seq_len,
+ attn_mask,
+ seqlens_q[i:i+1],
+ seqlens_kv[i:i+1],
+ (freqs_cos, freqs_sin),
+ self.i2v_condition_type,
+ token_replace_vec,
+ frist_frame_token_num,
+ ]
+
+ img[i], txt[i] = block(*single_block_args)
+ single_block_args = None
+
+ # img = x[:, :img_seq_len, ...]
+ # Store (block output - ori_img) as the residual to reuse on skipped steps;
+ # ori_img's storage is reused for the last (or only) residual.
+ if self.enable_teacache:
+ if len(img) > 1:
+ self.previous_residual[0] = torch.empty_like(img)
+ for i, (x, residual) in enumerate(zip(img, self.previous_residual[0])):
+ if i < len(img) - 1:
+ residual[...] = torch.sub(x, ori_img)
+ else:
+ residual[...] = ori_img
+ torch.sub(x, ori_img, out=residual)
+ x = None
+ else:
+ self.previous_residual[x_id] = ori_img
+ torch.sub(img, ori_img, out=self.previous_residual[x_id])
+
+
+ if ref_length != None:
+ img = img[:, ref_length:]
+ # ---------------------------- Final layer ------------------------------
+ # The final layer runs in the dtype of its own linear weight, one batch item at a time.
+ out_dtype = self.final_layer.linear.weight.dtype
+ vec = vec.to(out_dtype)
+ img_list = []
+ for img_chunk, vec_chunk in zip(img,vec):
+ img_list.append( self.final_layer(img_chunk.to(out_dtype).unsqueeze(0), vec_chunk.unsqueeze(0))) # (N, T, patch_size ** 2 * out_channels)
+ img = torch.cat(img_list)
+ img_list = None
+
+ img = self.unpatchify(img, tt, th, tw)
+
+ return img
+
+ def unpatchify(self, x, t, h, w):
+     """
+     Fold a sequence of patch tokens back into a 5-D tensor.
+
+     x: (N, t * h * w, C * pt * ph * pw), where C is ``self.unpatchify_channels``
+        and (pt, ph, pw) is ``self.patch_size``.
+     returns: (N, C, t * pt, h * ph, w * pw)
+     """
+     channels = self.unpatchify_channels
+     p_t, p_h, p_w = self.patch_size
+     assert t * h * w == x.shape[1]
+
+     batch = x.shape[0]
+     grid = x.reshape(shape=(batch, t, h, w, channels, p_t, p_h, p_w))
+     # Move channels forward and interleave each grid axis with its patch axis.
+     grid = torch.einsum("nthwcopq->nctohpwq", grid)
+     return grid.reshape(shape=(batch, channels, t * p_t, h * p_h, w * p_w))
+
+ def params_count(self):
+     """Return parameter counts keyed by "double", "single", "total" and "attn+mlp".
+
+     "double" sums the attention qkv/proj and MLP parameters of every double block,
+     "single" sums linear1/linear2 of every single block, "total" counts all
+     parameters of the model and "attn+mlp" is "double" + "single".
+     """
+     def _numel(module):
+         # Number of scalar parameters held by one sub-module.
+         return sum(p.numel() for p in module.parameters())
+
+     double_total = sum(
+         _numel(blk.img_attn_qkv)
+         + _numel(blk.img_attn_proj)
+         + _numel(blk.img_mlp)
+         + _numel(blk.txt_attn_qkv)
+         + _numel(blk.txt_attn_proj)
+         + _numel(blk.txt_mlp)
+         for blk in self.double_blocks
+     )
+     single_total = sum(
+         _numel(blk.linear1) + _numel(blk.linear2) for blk in self.single_blocks
+     )
+     counts = {
+         "double": double_total,
+         "single": single_total,
+         "total": sum(p.numel() for p in self.parameters()),
+     }
+     counts["attn+mlp"] = counts["double"] + counts["single"]
+     return counts
+
+
+#################################################################################
+# HunyuanVideo Configs #
+#################################################################################
+
+# Named presets; the keys of each entry match keyword parameters of
+# HYVideoDiffusionTransformer.__init__ (block depths, rotary dims, widths).
+HUNYUAN_VIDEO_CONFIG = {
+ "HYVideo-T/2": {
+ "mm_double_blocks_depth": 20,
+ "mm_single_blocks_depth": 40,
+ "rope_dim_list": [16, 56, 56],
+ "hidden_size": 3072,
+ "heads_num": 24,
+ "mlp_width_ratio": 4,
+ },
+ # Same sizes as "HYVideo-T/2" plus the guidance embedding.
+ "HYVideo-T/2-cfgdistill": {
+ "mm_double_blocks_depth": 20,
+ "mm_single_blocks_depth": 40,
+ "rope_dim_list": [16, 56, 56],
+ "hidden_size": 3072,
+ "heads_num": 24,
+ "mlp_width_ratio": 4,
+ "guidance_embed": True,
+ },
+ "HYVideo-S/2": {
+ "mm_double_blocks_depth": 6,
+ "mm_single_blocks_depth": 12,
+ "rope_dim_list": [12, 42, 42],
+ "hidden_size": 480,
+ "heads_num": 5,
+ "mlp_width_ratio": 4,
+ },
+ 'HYVideo-T/2-custom': { # 9.0B / 12.5B
+ "mm_double_blocks_depth": 20,
+ "mm_single_blocks_depth": 40,
+ "rope_dim_list": [16, 56, 56],
+ "hidden_size": 3072,
+ "heads_num": 24,
+ "mlp_width_ratio": 4,
+ },
+
+}
\ No newline at end of file
diff --git a/hyvideo/modules/modulate_layers.py b/hyvideo/modules/modulate_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..df1cf602ec9e215c4277991e952fb9aff4cefa65
--- /dev/null
+++ b/hyvideo/modules/modulate_layers.py
@@ -0,0 +1,136 @@
+from typing import Callable
+
+import torch
+import torch.nn as nn
+import math
+
+class ModulateDiT(nn.Module):
+    """Activation followed by a zero-initialised linear layer of width ``factor * hidden_size``."""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        factor: int,
+        act_layer: Callable,
+        dtype=None,
+        device=None,
+    ):
+        super().__init__()
+        self.act = act_layer()
+        self.linear = nn.Linear(
+            hidden_size, factor * hidden_size, bias=True, dtype=dtype, device=device
+        )
+        # Zero-initialize the modulation
+        for param in (self.linear.weight, self.linear.bias):
+            nn.init.zeros_(param)
+
+    def forward(self, x: torch.Tensor, condition_type=None, token_replace_vec=None) -> torch.Tensor:
+        """Project ``x``; for "token_replace" also project ``token_replace_vec``.
+
+        Returns a single tensor, or the pair ``(x_out, x_token_replace_out)`` when
+        ``condition_type == "token_replace"``.
+        """
+        x_out = self.linear(self.act(x))
+        if condition_type != "token_replace":
+            return x_out
+        return x_out, self.linear(self.act(token_replace_vec))
+
+def modulate(x, shift=None, scale=None):
+    """Scale and/or shift ``x``: ``x * (1 + scale) + shift``.
+
+    ``shift`` and ``scale`` get a new axis at position 1 before being applied;
+    either may be ``None`` to skip that step, and ``x`` is returned unchanged when
+    both are ``None``.
+
+    NOTE: a second ``modulate`` defined later in this module rebinds the name, so
+    this definition is not the one importers receive.
+
+    Args:
+        x (torch.Tensor): input tensor.
+        shift (torch.Tensor, optional): shift tensor. Defaults to None.
+        scale (torch.Tensor, optional): scale tensor. Defaults to None.
+
+    Returns:
+        torch.Tensor: the modulated tensor.
+    """
+    if scale is not None:
+        x = x * (1 + scale.unsqueeze(1))
+    if shift is not None:
+        x = x + shift.unsqueeze(1)
+    return x
+
+def modulate_(x, shift=None, scale=None):
+    """In-place variant of ``modulate``: overwrite ``x`` with ``x * (1 + scale) + shift``.
+
+    ``shift`` and ``scale`` get a new axis at position 1 before being applied.
+    Either may be ``None`` to skip that step; ``x`` is left untouched when both are
+    ``None``.
+
+    Args:
+        x (torch.Tensor): tensor modified in place.
+        shift (torch.Tensor, optional): shift tensor. Defaults to None.
+        scale (torch.Tensor, optional): scale tensor. Defaults to None.
+
+    Returns:
+        torch.Tensor: ``x`` itself.
+    """
+    if scale is None and shift is None:
+        return x
+    if scale is None:
+        # Shift-only case: previously computed out of place (`x + shift`), which
+        # left `x` unmodified for callers that ignore the return value.
+        return x.add_(shift.unsqueeze(1))
+
+    scale = (scale + 1).unsqueeze(1)
+    if shift is None:
+        return x.mul_(scale)
+
+    # x = shift + x * scale, written straight into x's storage.
+    torch.addcmul(shift.unsqueeze(1), x, scale, out=x)
+    return x
+
+def modulate(x, shift=None, scale=None, condition_type=None,
+             tr_shift=None, tr_scale=None,
+             frist_frame_token_num=None):
+    """Scale and/or shift ``x``, optionally with separate parameters for the leading tokens.
+
+    With ``condition_type == "token_replace"`` the first ``frist_frame_token_num``
+    positions along axis 1 use ``tr_scale``/``tr_shift`` and the remaining positions
+    use ``scale``/``shift``; the two parts are concatenated back along axis 1.
+    Otherwise ``x * (1 + scale) + shift`` is returned, skipping whichever of the two
+    is ``None``.
+    """
+    if condition_type == "token_replace":
+        split = frist_frame_token_num
+        leading = x[:, :split] * (1 + tr_scale.unsqueeze(1)) + tr_shift.unsqueeze(1)
+        trailing = x[:, split:] * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
+        return torch.concat((leading, trailing), dim=1)
+
+    if scale is not None:
+        x = x * (1 + scale.unsqueeze(1))
+    if shift is not None:
+        x = x + shift.unsqueeze(1)
+    return x
+
+def apply_gate(x, gate=None, tanh=False, condition_type=None, tr_gate=None, frist_frame_token_num=None):
+    """Multiply ``x`` by a gate, optionally passing the gate through ``tanh`` first.
+
+    Args:
+        x (torch.Tensor): input tensor.
+        gate (torch.Tensor, optional): gate tensor; ``x`` is returned unchanged when None.
+        tanh (bool, optional): whether to apply ``tanh`` to the gate(s). Defaults to False.
+        condition_type (str, optional): "token_replace" enables the split gating below.
+        tr_gate (torch.Tensor, optional): gate for the first ``frist_frame_token_num``
+            positions along axis 1 in "token_replace" mode.
+        frist_frame_token_num (int, optional): split position along axis 1.
+
+    Returns:
+        torch.Tensor: the gated tensor.
+    """
+    if gate is None:
+        return x
+
+    main_factor = gate.unsqueeze(1)
+    if condition_type != "token_replace":
+        return x * (main_factor.tanh() if tanh else main_factor)
+
+    tr_factor = tr_gate.unsqueeze(1)
+    if tanh:
+        tr_factor = tr_factor.tanh()
+        main_factor = main_factor.tanh()
+    split = frist_frame_token_num
+    leading = x[:, :split] * tr_factor
+    trailing = x[:, split:] * main_factor
+    return torch.concat((leading, trailing), dim=1)
+
+def apply_gate_and_accumulate_(accumulator, x, gate=None, tanh=False):
+    """Add ``x * gate`` into ``accumulator`` in place and return ``accumulator``.
+
+    The gate gets a new axis at position 1 and, when ``tanh`` is true, is passed
+    through ``tanh`` first. With ``gate`` None the accumulator is returned untouched.
+    """
+    if gate is None:
+        return accumulator
+    factor = gate.unsqueeze(1)
+    if tanh:
+        factor = factor.tanh()
+    return accumulator.addcmul_(x, factor)
+
+def ckpt_wrapper(module):
+    """Return a function that forwards its positional arguments to ``module``."""
+    def ckpt_forward(*inputs):
+        return module(*inputs)
+
+    return ckpt_forward
diff --git a/hyvideo/modules/norm_layers.py b/hyvideo/modules/norm_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..baed267717ff325126ae692855c16c568e8fb2c5
--- /dev/null
+++ b/hyvideo/modules/norm_layers.py
@@ -0,0 +1,88 @@
+import torch
+import torch.nn as nn
+
+
+class RMSNorm(nn.Module):
+    """Root-mean-square normalisation over the last axis, with an optional learned scale."""
+
+    def __init__(
+        self,
+        dim: int,
+        elementwise_affine=True,
+        eps: float = 1e-6,
+        device=None,
+        dtype=None,
+    ):
+        """
+        Initialize the RMSNorm normalization layer.
+
+        Args:
+            dim (int): The size of the normalised (last) axis.
+            elementwise_affine (bool, optional): Create the learnable ``weight`` when true.
+            eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
+
+        Attributes:
+            eps (float): A small value added to the denominator for numerical stability.
+            weight (nn.Parameter): Learnable scaling parameter, initialised to ones;
+                only present when ``elementwise_affine`` is true.
+        """
+        super().__init__()
+        self.eps = eps
+        if elementwise_affine:
+            self.weight = nn.Parameter(torch.ones(dim, device=device, dtype=dtype))
+
+    def _norm(self, x):
+        """Return ``x`` divided by the root mean square of its last axis (plus ``eps``)."""
+        mean_square = x.pow(2).mean(-1, keepdim=True)
+        return x * torch.rsqrt(mean_square + self.eps)
+
+    def forward(self, x):
+        """Normalise ``x`` in float32, cast back to ``x``'s type, then apply ``weight`` if present."""
+        normed = self._norm(x.float()).type_as(x)
+        if not hasattr(self, "weight"):
+            return normed
+        return normed * self.weight
+
+    def apply_(self, x):
+        """In-place counterpart of ``forward``: overwrite ``x`` with its normalised value.
+
+        Unlike ``forward`` there is no float32 round trip; the statistics are
+        computed in ``x``'s own dtype. Returns ``x``.
+        """
+        inv_rms = x.pow(2).mean(-1, keepdim=True)
+        inv_rms.add_(self.eps).rsqrt_()
+        x.mul_(inv_rms)
+        del inv_rms
+        if hasattr(self, "weight"):
+            x.mul_(self.weight)
+        return x
+
+
+def get_norm_layer(norm_layer):
+    """
+    Look up a normalization layer class by name.
+
+    Args:
+        norm_layer (str): "layer" for ``nn.LayerNorm`` or "rms" for ``RMSNorm``.
+
+    Returns:
+        norm_layer (nn.Module): The normalization layer class (not an instance).
+
+    Raises:
+        NotImplementedError: for any other name.
+    """
+    for name, layer_cls in (("layer", nn.LayerNorm), ("rms", RMSNorm)):
+        if norm_layer == name:
+            return layer_cls
+    raise NotImplementedError(f"Norm layer {norm_layer} is not implemented")
diff --git a/hyvideo/modules/original models.py b/hyvideo/modules/original models.py
new file mode 100644
index 0000000000000000000000000000000000000000..646a42d03a35300cf3b3d57aa17b72278f20b3f2
--- /dev/null
+++ b/hyvideo/modules/original models.py
@@ -0,0 +1,760 @@
+from typing import Any, List, Tuple, Optional, Union, Dict
+from einops import rearrange
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from diffusers.models import ModelMixin
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+
+from .activation_layers import get_activation_layer
+from .norm_layers import get_norm_layer
+from .embed_layers import TimestepEmbedder, PatchEmbed, TextProjection
+from .attenion import attention, parallel_attention, get_cu_seqlens
+from .posemb_layers import apply_rotary_emb
+from .mlp_layers import MLP, MLPEmbedder, FinalLayer
+from .modulate_layers import ModulateDiT, modulate, apply_gate
+from .token_refiner import SingleTokenRefiner
+
+
+class MMDoubleStreamBlock(nn.Module):
+ """
+ A multimodal dit block with separate modulation for
+ text and image/video, see more details (SD3): https://arxiv.org/abs/2403.03206
+ (Flux.1): https://github.com/black-forest-labs/flux
+ """
+
+ def __init__(
+ self,
+ hidden_size: int,
+ heads_num: int,
+ mlp_width_ratio: float,
+ mlp_act_type: str = "gelu_tanh",
+ qk_norm: bool = True,
+ qk_norm_type: str = "rms",
+ qkv_bias: bool = False,
+ dtype: Optional[torch.dtype] = None,
+ device: Optional[torch.device] = None,
+ ):
+ """Create mirrored image and text sub-layers: modulation, norms, qkv/proj and MLP.
+
+ ``device`` and ``dtype`` are forwarded to every sub-module via ``factory_kwargs``.
+ When ``qk_norm`` is false the q/k norm layers are ``nn.Identity``.
+ """
+ factory_kwargs = {"device": device, "dtype": dtype}
+ super().__init__()
+
+ self.deterministic = False
+ self.heads_num = heads_num
+ head_dim = hidden_size // heads_num
+ mlp_hidden_dim = int(hidden_size * mlp_width_ratio)
+
+ # factor=6: forward() splits this output into two (shift, scale, gate) triples.
+ self.img_mod = ModulateDiT(
+ hidden_size,
+ factor=6,
+ act_layer=get_activation_layer("silu"),
+ **factory_kwargs,
+ )
+ self.img_norm1 = nn.LayerNorm(
+ hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs
+ )
+
+ self.img_attn_qkv = nn.Linear(
+ hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs
+ )
+ qk_norm_layer = get_norm_layer(qk_norm_type)
+ self.img_attn_q_norm = (
+ qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+ if qk_norm
+ else nn.Identity()
+ )
+ self.img_attn_k_norm = (
+ qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+ if qk_norm
+ else nn.Identity()
+ )
+ self.img_attn_proj = nn.Linear(
+ hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs
+ )
+
+ self.img_norm2 = nn.LayerNorm(
+ hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs
+ )
+ self.img_mlp = MLP(
+ hidden_size,
+ mlp_hidden_dim,
+ act_layer=get_activation_layer(mlp_act_type),
+ bias=True,
+ **factory_kwargs,
+ )
+
+ # Text stream: same layout as the image stream above.
+ self.txt_mod = ModulateDiT(
+ hidden_size,
+ factor=6,
+ act_layer=get_activation_layer("silu"),
+ **factory_kwargs,
+ )
+ self.txt_norm1 = nn.LayerNorm(
+ hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs
+ )
+
+ self.txt_attn_qkv = nn.Linear(
+ hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs
+ )
+ self.txt_attn_q_norm = (
+ qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+ if qk_norm
+ else nn.Identity()
+ )
+ self.txt_attn_k_norm = (
+ qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+ if qk_norm
+ else nn.Identity()
+ )
+ self.txt_attn_proj = nn.Linear(
+ hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs
+ )
+
+ self.txt_norm2 = nn.LayerNorm(
+ hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs
+ )
+ self.txt_mlp = MLP(
+ hidden_size,
+ mlp_hidden_dim,
+ act_layer=get_activation_layer(mlp_act_type),
+ bias=True,
+ **factory_kwargs,
+ )
+ # forward() uses parallel_attention instead of attention when this is set.
+ self.hybrid_seq_parallel_attn = None
+
+ def enable_deterministic(self):
+ """Set the ``deterministic`` flag of this block to True."""
+ self.deterministic = True
+
+ def disable_deterministic(self):
+ """Set the ``deterministic`` flag of this block to False."""
+ self.deterministic = False
+
+ def forward(
+ self,
+ img: torch.Tensor,
+ txt: torch.Tensor,
+ vec: torch.Tensor,
+ cu_seqlens_q: Optional[torch.Tensor] = None,
+ cu_seqlens_kv: Optional[torch.Tensor] = None,
+ max_seqlen_q: Optional[int] = None,
+ max_seqlen_kv: Optional[int] = None,
+ freqs_cis: tuple = None,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ """Run joint attention over image and text tokens, then a gated MLP per stream.
+
+ Image and text are modulated and projected separately, concatenated along
+ axis 1 for attention, then split again. Returns the updated ``(img, txt)``.
+
+ NOTE(review): ``cu_seqlens_q`` defaults to ``None`` but its ``shape`` is read in an
+ assert below, so callers presumably always pass it.
+ """
+ # One modulation projection per stream, split into two (shift, scale, gate)
+ # triples: the first for the attention branch, the second for the MLP branch.
+ (
+ img_mod1_shift,
+ img_mod1_scale,
+ img_mod1_gate,
+ img_mod2_shift,
+ img_mod2_scale,
+ img_mod2_gate,
+ ) = self.img_mod(vec).chunk(6, dim=-1)
+ (
+ txt_mod1_shift,
+ txt_mod1_scale,
+ txt_mod1_gate,
+ txt_mod2_shift,
+ txt_mod2_scale,
+ txt_mod2_gate,
+ ) = self.txt_mod(vec).chunk(6, dim=-1)
+
+ # Prepare image for attention.
+ img_modulated = self.img_norm1(img)
+ img_modulated = modulate(
+ img_modulated, shift=img_mod1_shift, scale=img_mod1_scale
+ )
+ img_qkv = self.img_attn_qkv(img_modulated)
+ img_q, img_k, img_v = rearrange(
+ img_qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num
+ )
+ # Apply QK-Norm if needed
+ img_q = self.img_attn_q_norm(img_q).to(img_v)
+ img_k = self.img_attn_k_norm(img_k).to(img_v)
+
+ # Apply RoPE if needed.
+ # Rotary embedding is applied to the image tokens only.
+ if freqs_cis is not None:
+ img_qq, img_kk = apply_rotary_emb(img_q, img_k, freqs_cis, head_first=False)
+ # NOTE(review): the first label in the message reads "img_kk" but prints img_qq.shape.
+ assert (
+ img_qq.shape == img_q.shape and img_kk.shape == img_k.shape
+ ), f"img_kk: {img_qq.shape}, img_q: {img_q.shape}, img_kk: {img_kk.shape}, img_k: {img_k.shape}"
+ img_q, img_k = img_qq, img_kk
+
+ # Prepare txt for attention.
+ txt_modulated = self.txt_norm1(txt)
+ txt_modulated = modulate(
+ txt_modulated, shift=txt_mod1_shift, scale=txt_mod1_scale
+ )
+ txt_qkv = self.txt_attn_qkv(txt_modulated)
+ txt_q, txt_k, txt_v = rearrange(
+ txt_qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num
+ )
+ # Apply QK-Norm if needed.
+ txt_q = self.txt_attn_q_norm(txt_q).to(txt_v)
+ txt_k = self.txt_attn_k_norm(txt_k).to(txt_v)
+
+ # Run actual attention.
+ # Image tokens come first, text tokens second, along the sequence axis.
+ q = torch.cat((img_q, txt_q), dim=1)
+ k = torch.cat((img_k, txt_k), dim=1)
+ v = torch.cat((img_v, txt_v), dim=1)
+ assert (
+ cu_seqlens_q.shape[0] == 2 * img.shape[0] + 1
+ ), f"cu_seqlens_q.shape:{cu_seqlens_q.shape}, img.shape[0]:{img.shape[0]}"
+
+ # attention computation start
+ if not self.hybrid_seq_parallel_attn:
+ attn = attention(
+ q,
+ k,
+ v,
+ cu_seqlens_q=cu_seqlens_q,
+ cu_seqlens_kv=cu_seqlens_kv,
+ max_seqlen_q=max_seqlen_q,
+ max_seqlen_kv=max_seqlen_kv,
+ batch_size=img_k.shape[0],
+ )
+ else:
+ attn = parallel_attention(
+ self.hybrid_seq_parallel_attn,
+ q,
+ k,
+ v,
+ img_q_len=img_q.shape[1],
+ img_kv_len=img_k.shape[1],
+ cu_seqlens_q=cu_seqlens_q,
+ cu_seqlens_kv=cu_seqlens_kv
+ )
+
+ # attention computation end
+
+ # Split the joint attention output back at the image sequence length.
+ img_attn, txt_attn = attn[:, : img.shape[1]], attn[:, img.shape[1] :]
+
+ # Calculate the img blocks.
+ img = img + apply_gate(self.img_attn_proj(img_attn), gate=img_mod1_gate)
+ img = img + apply_gate(
+ self.img_mlp(
+ modulate(
+ self.img_norm2(img), shift=img_mod2_shift, scale=img_mod2_scale
+ )
+ ),
+ gate=img_mod2_gate,
+ )
+
+ # Calculate the txt blocks.
+ txt = txt + apply_gate(self.txt_attn_proj(txt_attn), gate=txt_mod1_gate)
+ txt = txt + apply_gate(
+ self.txt_mlp(
+ modulate(
+ self.txt_norm2(txt), shift=txt_mod2_shift, scale=txt_mod2_scale
+ )
+ ),
+ gate=txt_mod2_gate,
+ )
+
+ return img, txt
+
+
+class MMSingleStreamBlock(nn.Module):
+    """
+    Single-stream DiT block with parallel linear layers (https://arxiv.org/abs/2302.05442):
+    one fused projection yields QKV and the MLP input; the attention output and the
+    activated MLP stream are concatenated, projected back, gated and added to the input.
+    See also SD3 (https://arxiv.org/abs/2403.03206) and Flux.1 (https://github.com/black-forest-labs/flux).
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        heads_num: int,
+        mlp_width_ratio: float = 4.0,
+        mlp_act_type: str = "gelu_tanh",
+        qk_norm: bool = True,
+        qk_norm_type: str = "rms",
+        qk_scale: Optional[float] = None,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+
+        self.deterministic = False
+        self.hidden_size = hidden_size
+        self.heads_num = heads_num
+        head_dim = hidden_size // heads_num
+        mlp_hidden_dim = int(hidden_size * mlp_width_ratio)
+        self.mlp_hidden_dim = mlp_hidden_dim
+        self.scale = qk_scale or head_dim ** -0.5
+
+        # Fused input projection: qkv (3 * hidden_size) and mlp_in (mlp_hidden_dim).
+        self.linear1 = nn.Linear(
+            hidden_size, hidden_size * 3 + mlp_hidden_dim, **factory_kwargs
+        )
+        # Fused output projection over cat(attention output, activated mlp stream).
+        self.linear2 = nn.Linear(
+            hidden_size + mlp_hidden_dim, hidden_size, **factory_kwargs
+        )
+
+        qk_norm_layer = get_norm_layer(qk_norm_type)
+        self.q_norm = (
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+            if qk_norm
+            else nn.Identity()
+        )
+        self.k_norm = (
+            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
+            if qk_norm
+            else nn.Identity()
+        )
+
+        self.pre_norm = nn.LayerNorm(
+            hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs
+        )
+
+        self.mlp_act = get_activation_layer(mlp_act_type)()
+        self.modulation = ModulateDiT(
+            hidden_size,
+            factor=3,
+            act_layer=get_activation_layer("silu"),
+            **factory_kwargs,
+        )
+        self.hybrid_seq_parallel_attn = None
+
+    def enable_deterministic(self):
+        self.deterministic = True
+
+    def disable_deterministic(self):
+        self.deterministic = False
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        vec: torch.Tensor,
+        txt_len: int,
+        cu_seqlens_q: Optional[torch.Tensor] = None,
+        cu_seqlens_kv: Optional[torch.Tensor] = None,
+        max_seqlen_q: Optional[int] = None,
+        max_seqlen_kv: Optional[int] = None,
+        freqs_cis: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+    ) -> torch.Tensor:
+        mod_shift, mod_scale, mod_gate = self.modulation(vec).chunk(3, dim=-1)
+        x_mod = modulate(self.pre_norm(x), shift=mod_shift, scale=mod_scale)
+        qkv, mlp = torch.split(
+            self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1
+        )
+
+        q, k, v = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num)
+
+        # Apply QK-Norm if needed.
+        q = self.q_norm(q).to(v)
+        k = self.k_norm(k).to(v)
+
+        # Apply RoPE to the leading (image) tokens only; the trailing txt_len tokens are left as is.
+        if freqs_cis is not None:
+            img_q, txt_q = q[:, :-txt_len, :, :], q[:, -txt_len:, :, :]
+            img_k, txt_k = k[:, :-txt_len, :, :], k[:, -txt_len:, :, :]
+            img_qq, img_kk = apply_rotary_emb(img_q, img_k, freqs_cis, head_first=False)
+            assert (
+                img_qq.shape == img_q.shape and img_kk.shape == img_k.shape
+            ), f"img_qq: {img_qq.shape}, img_q: {img_q.shape}, img_kk: {img_kk.shape}, img_k: {img_k.shape}"
+            img_q, img_k = img_qq, img_kk
+            q = torch.cat((img_q, txt_q), dim=1)
+            k = torch.cat((img_k, txt_k), dim=1)
+
+        # Compute attention.
+        assert (
+            cu_seqlens_q.shape[0] == 2 * x.shape[0] + 1
+        ), f"cu_seqlens_q.shape:{cu_seqlens_q.shape}, x.shape[0]:{x.shape[0]}"
+
+        # Image lengths are derived from q/k so the parallel path also works when RoPE was skipped.
+        if not self.hybrid_seq_parallel_attn:
+            attn = attention(
+                q,
+                k,
+                v,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_kv=cu_seqlens_kv,
+                max_seqlen_q=max_seqlen_q,
+                max_seqlen_kv=max_seqlen_kv,
+                batch_size=x.shape[0],
+            )
+        else:
+            attn = parallel_attention(
+                self.hybrid_seq_parallel_attn,
+                q,
+                k,
+                v,
+                img_q_len=q.shape[1] - txt_len,
+                img_kv_len=k.shape[1] - txt_len,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_kv=cu_seqlens_kv
+            )
+        # attention computation end
+
+        # Compute activation in mlp stream, cat again and run second linear layer.
+        output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
+        return x + apply_gate(output, gate=mod_gate)
+
+
+class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
+    """
+    HunyuanVideo Transformer backbone
+
+    Inherited from ModelMixin and ConfigMixin for compatibility with diffusers' sampler StableDiffusionPipeline.
+
+    Reference:
+    [1] Flux.1: https://github.com/black-forest-labs/flux
+    [2] MMDiT: http://arxiv.org/abs/2403.03206
+
+    Parameters
+    ----------
+    args: argparse.Namespace
+        Object providing text_states_dim and text_states_dim_2 (e.g. the parsed arguments).
+    patch_size: list
+        The size of the patch.
+    in_channels: int
+        The number of input channels.
+    out_channels: int
+        The number of output channels.
+    hidden_size: int
+        The hidden size of the transformer backbone.
+    heads_num: int
+        The number of attention heads.
+    mlp_width_ratio: float
+        The ratio of the hidden size of the MLP in the transformer block.
+    mlp_act_type: str
+        The activation function of the MLP in the transformer block.
+    mm_double_blocks_depth: int
+        The number of transformer blocks in the double blocks.
+    mm_single_blocks_depth: int
+        The number of transformer blocks in the single blocks.
+    rope_dim_list: list
+        The dimension of the rotary embedding for t, h, w.
+    qkv_bias: bool
+        Whether to use bias in the qkv linear layer.
+    qk_norm: bool
+        Whether to use qk norm.
+    qk_norm_type: str
+        The type of qk norm.
+    guidance_embed: bool
+        Whether to use guidance embedding for distillation.
+    text_projection: str
+        The type of the text projection, default is single_refiner.
+    use_attention_mask: bool
+        Whether to use attention mask for text encoder.
+    dtype: torch.dtype
+        The dtype of the model.
+    device: torch.device
+        The device of the model.
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        args: Any,
+        patch_size: list = [1, 2, 2],
+        in_channels: int = 4, # Should be VAE.config.latent_channels.
+        out_channels: Optional[int] = None,
+        hidden_size: int = 3072,
+        heads_num: int = 24,
+        mlp_width_ratio: float = 4.0,
+        mlp_act_type: str = "gelu_tanh",
+        mm_double_blocks_depth: int = 20,
+        mm_single_blocks_depth: int = 40,
+        rope_dim_list: List[int] = [16, 56, 56],
+        qkv_bias: bool = True,
+        qk_norm: bool = True,
+        qk_norm_type: str = "rms",
+        guidance_embed: bool = False, # For modulation.
+        text_projection: str = "single_refiner",
+        use_attention_mask: bool = True,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+
+        self.patch_size = patch_size
+        self.in_channels = in_channels
+        self.out_channels = in_channels if out_channels is None else out_channels
+        self.unpatchify_channels = self.out_channels
+        self.guidance_embed = guidance_embed
+        self.rope_dim_list = rope_dim_list
+
+        # Text projection: "linear" or "single_refiner" (the default); anything else raises below.
+        # Alternative: TokenRefiner. See more details (LI-DiT): http://arxiv.org/abs/2406.11831
+        self.use_attention_mask = use_attention_mask
+        self.text_projection = text_projection
+
+        self.text_states_dim = args.text_states_dim
+        self.text_states_dim_2 = args.text_states_dim_2
+
+        if hidden_size % heads_num != 0:
+            raise ValueError(
+                f"Hidden size {hidden_size} must be divisible by heads_num {heads_num}"
+            )
+        pe_dim = hidden_size // heads_num
+        if sum(rope_dim_list) != pe_dim:
+            raise ValueError(
+                f"Got {rope_dim_list} but expected positional dim {pe_dim}"
+            )
+        self.hidden_size = hidden_size
+        self.heads_num = heads_num
+
+        # image projection
+        self.img_in = PatchEmbed(
+            self.patch_size, self.in_channels, self.hidden_size, **factory_kwargs
+        )
+
+        # text projection
+        if self.text_projection == "linear":
+            self.txt_in = TextProjection(
+                self.text_states_dim,
+                self.hidden_size,
+                get_activation_layer("silu"),
+                **factory_kwargs,
+            )
+        elif self.text_projection == "single_refiner":
+            self.txt_in = SingleTokenRefiner(
+                self.text_states_dim, hidden_size, heads_num, depth=2, **factory_kwargs
+            )
+        else:
+            raise NotImplementedError(
+                f"Unsupported text_projection: {self.text_projection}"
+            )
+
+        # time modulation
+        self.time_in = TimestepEmbedder(
+            self.hidden_size, get_activation_layer("silu"), **factory_kwargs
+        )
+
+        # text modulation
+        self.vector_in = MLPEmbedder(
+            self.text_states_dim_2, self.hidden_size, **factory_kwargs
+        )
+
+        # guidance modulation
+        self.guidance_in = (
+            TimestepEmbedder(
+                self.hidden_size, get_activation_layer("silu"), **factory_kwargs
+            )
+            if guidance_embed
+            else None
+        )
+
+        # double blocks
+        self.double_blocks = nn.ModuleList(
+            [
+                MMDoubleStreamBlock(
+                    self.hidden_size,
+                    self.heads_num,
+                    mlp_width_ratio=mlp_width_ratio,
+                    mlp_act_type=mlp_act_type,
+                    qk_norm=qk_norm,
+                    qk_norm_type=qk_norm_type,
+                    qkv_bias=qkv_bias,
+                    **factory_kwargs,
+                )
+                for _ in range(mm_double_blocks_depth)
+            ]
+        )
+
+        # single blocks
+        self.single_blocks = nn.ModuleList(
+            [
+                MMSingleStreamBlock(
+                    self.hidden_size,
+                    self.heads_num,
+                    mlp_width_ratio=mlp_width_ratio,
+                    mlp_act_type=mlp_act_type,
+                    qk_norm=qk_norm,
+                    qk_norm_type=qk_norm_type,
+                    **factory_kwargs,
+                )
+                for _ in range(mm_single_blocks_depth)
+            ]
+        )
+
+        self.final_layer = FinalLayer(
+            self.hidden_size,
+            self.patch_size,
+            self.out_channels,
+            get_activation_layer("silu"),
+            **factory_kwargs,
+        )
+
+    def enable_deterministic(self):
+        for block in self.double_blocks:
+            block.enable_deterministic()
+        for block in self.single_blocks:
+            block.enable_deterministic()
+
+    def disable_deterministic(self):
+        for block in self.double_blocks:
+            block.disable_deterministic()
+        for block in self.single_blocks:
+            block.disable_deterministic()
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        t: torch.Tensor, # Should be in range(0, 1000).
+        text_states: Optional[torch.Tensor] = None,
+        text_mask: Optional[torch.Tensor] = None, # Used by the token refiner (if use_attention_mask) and get_cu_seqlens.
+        text_states_2: Optional[torch.Tensor] = None, # Text embedding for modulation.
+        freqs_cos: Optional[torch.Tensor] = None,
+        freqs_sin: Optional[torch.Tensor] = None,
+        guidance: Optional[torch.Tensor] = None, # Guidance for modulation, should be cfg_scale x 1000.
+        return_dict: bool = True,
+    ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
+        out = {}
+        img = x
+        txt = text_states
+        _, _, ot, oh, ow = x.shape
+        tt, th, tw = (
+            ot // self.patch_size[0],
+            oh // self.patch_size[1],
+            ow // self.patch_size[2],
+        )
+
+        # Prepare modulation vectors.
+        vec = self.time_in(t)
+
+        # text modulation
+        vec = vec + self.vector_in(text_states_2)
+
+        # guidance modulation
+        if self.guidance_embed:
+            if guidance is None:
+                raise ValueError(
+                    "Didn't get guidance strength for guidance distilled model."
+                )
+
+            # our timestep_embedding is merged into guidance_in(TimestepEmbedder)
+            vec = vec + self.guidance_in(guidance)
+
+        # Embed image and text.
+        img = self.img_in(img)
+        if self.text_projection == "linear":
+            txt = self.txt_in(txt)
+        elif self.text_projection == "single_refiner":
+            txt = self.txt_in(txt, t, text_mask if self.use_attention_mask else None)
+        else:
+            raise NotImplementedError(
+                f"Unsupported text_projection: {self.text_projection}"
+            )
+
+        txt_seq_len = txt.shape[1]
+        img_seq_len = img.shape[1]
+
+        # Compute cu_seqlens and max_seqlen for flash attention
+        cu_seqlens_q = get_cu_seqlens(text_mask, img_seq_len)
+        cu_seqlens_kv = cu_seqlens_q
+        max_seqlen_q = img_seq_len + txt_seq_len
+        max_seqlen_kv = max_seqlen_q
+
+        freqs_cis = (freqs_cos, freqs_sin) if freqs_cos is not None else None
+        # --------------------- Pass through DiT blocks ------------------------
+        for block in self.double_blocks:
+            double_block_args = [
+                img,
+                txt,
+                vec,
+                cu_seqlens_q,
+                cu_seqlens_kv,
+                max_seqlen_q,
+                max_seqlen_kv,
+                freqs_cis,
+            ]
+
+            img, txt = block(*double_block_args)
+
+        # Merge txt and img to pass through single stream blocks.
+        x = torch.cat((img, txt), 1)
+        if len(self.single_blocks) > 0:
+            for block in self.single_blocks:
+                single_block_args = [
+                    x,
+                    vec,
+                    txt_seq_len,
+                    cu_seqlens_q,
+                    cu_seqlens_kv,
+                    max_seqlen_q,
+                    max_seqlen_kv,
+                    freqs_cis,
+                ]
+                # freqs_cis (not a fresh tuple) keeps a missing RoPE as None, as for the double blocks.
+                x = block(*single_block_args)
+
+        img = x[:, :img_seq_len, ...]
+
+        # ---------------------------- Final layer ------------------------------
+        img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels)
+
+        img = self.unpatchify(img, tt, th, tw)
+        if return_dict:
+            out["x"] = img
+            return out
+        return img
+
+    def unpatchify(self, x, t, h, w):
+        """
+        x: (N, t * h * w, C * pt * ph * pw), with (pt, ph, pw) = patch_size
+        imgs: (N, C, t * pt, h * ph, w * pw)
+        """
+        c = self.unpatchify_channels
+        pt, ph, pw = self.patch_size
+        assert t * h * w == x.shape[1]
+
+        x = x.reshape(shape=(x.shape[0], t, h, w, c, pt, ph, pw))
+        x = torch.einsum("nthwcopq->nctohpwq", x)
+        imgs = x.reshape(shape=(x.shape[0], c, t * pt, h * ph, w * pw))
+
+        return imgs
+
+    def params_count(self):
+        counts = {
+            "double": sum(
+                [
+                    sum(p.numel() for p in block.img_attn_qkv.parameters())
+                    + sum(p.numel() for p in block.img_attn_proj.parameters())
+                    + sum(p.numel() for p in block.img_mlp.parameters())
+                    + sum(p.numel() for p in block.txt_attn_qkv.parameters())
+                    + sum(p.numel() for p in block.txt_attn_proj.parameters())
+                    + sum(p.numel() for p in block.txt_mlp.parameters())
+                    for block in self.double_blocks
+                ]
+            ),
+            "single": sum(
+                [
+                    sum(p.numel() for p in block.linear1.parameters())
+                    + sum(p.numel() for p in block.linear2.parameters())
+                    for block in self.single_blocks
+                ]
+            ),
+            "total": sum(p.numel() for p in self.parameters()),
+        }
+        counts["attn+mlp"] = counts["double"] + counts["single"]
+        return counts
+
+
+#################################################################################
+#  HunyuanVideo Configs (inner keys match HYVideoDiffusionTransformer.__init__) #
+#################################################################################
+
+HUNYUAN_VIDEO_CONFIG = {
+    "HYVideo-T/2": {
+        "mm_double_blocks_depth": 20,
+        "mm_single_blocks_depth": 40,
+        "rope_dim_list": [16, 56, 56],
+        "hidden_size": 3072,
+        "heads_num": 24,
+        "mlp_width_ratio": 4,
+    },
+    "HYVideo-T/2-cfgdistill": {
+        "mm_double_blocks_depth": 20,
+        "mm_single_blocks_depth": 40,
+        "rope_dim_list": [16, 56, 56],
+        "hidden_size": 3072,
+        "heads_num": 24,
+        "mlp_width_ratio": 4,
+        "guidance_embed": True,
+    },
+}
diff --git a/hyvideo/modules/placement.py b/hyvideo/modules/placement.py
new file mode 100644
index 0000000000000000000000000000000000000000..47a2405586f442bb2e72c1b586b63c699da0a06e
--- /dev/null
+++ b/hyvideo/modules/placement.py
@@ -0,0 +1,389 @@
+import torch
+import triton
+import triton.language as tl
+
+def hunyuan_token_reorder_to_token_major(tensor, fix_len, reorder_len, reorder_num_frame, frame_size):
+    """Reorder the first reorder_len tokens of dim 2 from frame major to token major, in place; returns tensor."""
+    assert reorder_len == reorder_num_frame * frame_size
+    assert tensor.shape[2] == fix_len + reorder_len
+    # Slice with :reorder_len (same span as :-fix_len) because ":-0" would select nothing when fix_len == 0.
+    tensor[:, :, :reorder_len, :] = tensor[:, :, :reorder_len, :].reshape(tensor.shape[0], tensor.shape[1], reorder_num_frame, frame_size, tensor.shape[3]) \
+        .transpose(2, 3).reshape(tensor.shape[0], tensor.shape[1], reorder_len, tensor.shape[3])
+    return tensor
+
+def hunyuan_token_reorder_to_frame_major(tensor, fix_len, reorder_len, reorder_num_frame, frame_size):
+    """Reorder the first reorder_len tokens of dim 2 from token major to frame major, in place; returns tensor."""
+    assert reorder_len == reorder_num_frame * frame_size
+    assert tensor.shape[2] == fix_len + reorder_len
+    # Slice with :reorder_len (same span as :-fix_len) because ":-0" would select nothing when fix_len == 0.
+    tensor[:, :, :reorder_len, :] = tensor[:, :, :reorder_len, :].reshape(tensor.shape[0], tensor.shape[1], frame_size, reorder_num_frame, tensor.shape[3]) \
+        .transpose(2, 3).reshape(tensor.shape[0], tensor.shape[1], reorder_len, tensor.shape[3])
+    return tensor
+
+
+@triton.jit
+def hunyuan_sparse_head_placement_kernel(
+    query_ptr, key_ptr, value_ptr, # [cfg, num_heads, seq_len, head_dim] seq_len = context_length + num_frame * frame_size
+    query_out_ptr, key_out_ptr, value_out_ptr, # [cfg, num_heads, seq_len, head_dim]
+    best_mask_idx_ptr, # [cfg, num_heads]
+    query_stride_b, query_stride_h, query_stride_s, query_stride_d,
+    mask_idx_stride_b, mask_idx_stride_h,
+    seq_len: tl.constexpr,
+    head_dim: tl.constexpr,
+    context_length: tl.constexpr,
+    num_frame: tl.constexpr,
+    frame_size: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr
+):
+    # Copy one BLOCK_SIZE-token slice of query, key and value to the outputs (all addressed with query's strides).
+    # range: [b, h, block_id * block_size: block_id * block_size + block_size, :]
+    cfg = tl.program_id(0)
+    head = tl.program_id(1)
+    block_id = tl.program_id(2)
+
+    start_id = block_id * BLOCK_SIZE
+    # Tokens past seq_len in the last block are excluded by offset_mask below,
+    # so no explicit end index is needed.
+
+    # Load best mask idx (0 is spatial, 1 is temporal)
+    is_temporal = tl.load(best_mask_idx_ptr + cfg * mask_idx_stride_b + head * mask_idx_stride_h)
+
+    offset_token = tl.arange(0, BLOCK_SIZE) + start_id
+    offset_mask = offset_token < seq_len
+    offset_d = tl.arange(0, head_dim)
+
+    if is_temporal:
+        frame_id = offset_token // frame_size
+        patch_id = offset_token - frame_id * frame_size
+        offset_store_token = tl.where(offset_token >= seq_len - context_length, offset_token, patch_id * num_frame + frame_id)
+
+        offset_load = (cfg * query_stride_b + head * query_stride_h + offset_token[:,None] * query_stride_s) + offset_d[None,:] * query_stride_d
+        offset_query = query_ptr + offset_load
+        offset_key = key_ptr + offset_load
+        offset_value = value_ptr + offset_load
+
+        offset_store = (cfg * query_stride_b + head * query_stride_h + offset_store_token[:,None] * query_stride_s) + offset_d[None,:] * query_stride_d
+        offset_query_out = query_out_ptr + offset_store
+        offset_key_out = key_out_ptr + offset_store
+        offset_value_out = value_out_ptr + offset_store
+
+        # Temporal head: (frame, patch) tokens land at patch * num_frame + frame; the last context_length tokens keep their index.
+        query = tl.load(offset_query, mask=offset_mask[:,None])
+        tl.store(offset_query_out, query, mask=offset_mask[:,None])
+        key = tl.load(offset_key, mask=offset_mask[:,None])
+        tl.store(offset_key_out, key, mask=offset_mask[:,None])
+        value = tl.load(offset_value, mask=offset_mask[:,None])
+        tl.store(offset_value_out, value, mask=offset_mask[:,None])
+
+
+    else:
+        offset_load = (cfg * query_stride_b + head * query_stride_h + offset_token[:,None] * query_stride_s) + offset_d[None,:] * query_stride_d
+        offset_query = query_ptr + offset_load
+        offset_key = key_ptr + offset_load
+        offset_value = value_ptr + offset_load
+
+        offset_store = offset_load
+        offset_query_out = query_out_ptr + offset_store
+        offset_key_out = key_out_ptr + offset_store
+        offset_value_out = value_out_ptr + offset_store
+
+        # Spatial head: plain copy to the same token index.
+        query = tl.load(offset_query, mask=offset_mask[:,None])
+        tl.store(offset_query_out, query, mask=offset_mask[:,None])
+        key = tl.load(offset_key, mask=offset_mask[:,None])
+        tl.store(offset_key_out, key, mask=offset_mask[:,None])
+        value = tl.load(offset_value, mask=offset_mask[:,None])
+        tl.store(offset_value_out, value, mask=offset_mask[:,None])
+
+
+def hunyuan_sparse_head_placement(query, key, value, query_out, key_out, value_out, best_mask_idx, context_length, num_frame, frame_size):
+    cfg, num_heads, seq_len, head_dim = query.shape
+    BLOCK_SIZE = 128
+    assert seq_len == context_length + num_frame * frame_size
+    # One program per (cfg, head, BLOCK_SIZE-token block); the ceil-division covers the partial tail block.
+    grid = (cfg, num_heads, (seq_len + BLOCK_SIZE - 1) // BLOCK_SIZE)
+    # Only query's strides are passed; the kernel addresses all six tensors with them, so they must share query's layout.
+    hunyuan_sparse_head_placement_kernel[grid](
+        query, key, value,
+        query_out, key_out, value_out,
+        best_mask_idx,
+        query.stride(0), query.stride(1), query.stride(2), query.stride(3),
+        best_mask_idx.stride(0), best_mask_idx.stride(1),
+        seq_len, head_dim, context_length, num_frame, frame_size,
+        BLOCK_SIZE
+    )
+
+
+def ref_hunyuan_sparse_head_placement(query, key, value, best_mask_idx, context_length, num_frame, frame_size):
+    cfg, num_heads, seq_len, head_dim = query.shape
+    assert seq_len == context_length + num_frame * frame_size
+    # PyTorch reference: returns new tensors; the inputs are only read (boolean indexing below yields copies).
+    query_out = query.clone()
+    key_out = key.clone()
+    value_out = value.clone()
+
+    # Spatial heads (best_mask_idx == 0): copied unchanged (the clones above already hold these values).
+    query_out[best_mask_idx == 0], key_out[best_mask_idx == 0], value_out[best_mask_idx == 0] = \
+        query[best_mask_idx == 0], key[best_mask_idx == 0], value[best_mask_idx == 0]
+
+    # Temporal heads (best_mask_idx == 1): video tokens reordered from frame major to token major.
+    query_out[best_mask_idx == 1], key_out[best_mask_idx == 1], value_out[best_mask_idx == 1] = \
+        hunyuan_token_reorder_to_token_major(query[best_mask_idx == 1].unsqueeze(0), context_length, num_frame * frame_size, num_frame, frame_size).squeeze(0), \
+        hunyuan_token_reorder_to_token_major(key[best_mask_idx == 1].unsqueeze(0), context_length, num_frame * frame_size, num_frame, frame_size).squeeze(0), \
+        hunyuan_token_reorder_to_token_major(value[best_mask_idx == 1].unsqueeze(0), context_length, num_frame * frame_size, num_frame, frame_size).squeeze(0)
+
+    return query_out, key_out, value_out
+
+
+def test_hunyuan_sparse_head_placement():
+    """Compare the Triton q/k/v placement against the PyTorch reference on random bfloat16 CUDA tensors."""
+    context_length = 226
+    num_frame = 11
+    frame_size = 4080
+
+    cfg = 2
+    num_heads = 48
+
+    seq_len = context_length + num_frame * frame_size
+    head_dim = 64
+
+    dtype = torch.bfloat16
+    device = torch.device("cuda")
+
+    query = torch.randn(cfg, num_heads, seq_len, head_dim, dtype=dtype, device=device)
+    key = torch.randn(cfg, num_heads, seq_len, head_dim, dtype=dtype, device=device)
+    value = torch.randn(cfg, num_heads, seq_len, head_dim, dtype=dtype, device=device)
+
+    best_mask_idx = torch.randint(0, 2, (cfg, num_heads), device=device)
+
+    query_out = torch.empty_like(query)
+    key_out = torch.empty_like(key)
+    value_out = torch.empty_like(value)
+
+    hunyuan_sparse_head_placement(query, key, value, query_out, key_out, value_out, best_mask_idx, context_length, num_frame, frame_size)
+    ref_query_out, ref_key_out, ref_value_out = ref_hunyuan_sparse_head_placement(query, key, value, best_mask_idx, context_length, num_frame, frame_size)
+
+    torch.testing.assert_close(query_out, ref_query_out)
+    torch.testing.assert_close(key_out, ref_key_out)
+    torch.testing.assert_close(value_out, ref_value_out)
+
+
+def benchmark_hunyuan_sparse_head_placement():
+    import time
+    # Times the Triton placement and the PyTorch reference on CUDA; prints ms per iteration and GB/s for each.
+    context_length = 226
+    num_frame = 11
+    frame_size = 4080
+
+    cfg = 2
+    num_heads = 48
+
+    seq_len = context_length + num_frame * frame_size
+    head_dim = 64
+
+    dtype = torch.bfloat16
+    device = torch.device("cuda")
+
+    query = torch.randn(cfg, num_heads, seq_len, head_dim, dtype=dtype, device=device)
+    key = torch.randn(cfg, num_heads, seq_len, head_dim, dtype=dtype, device=device)
+    value = torch.randn(cfg, num_heads, seq_len, head_dim, dtype=dtype, device=device)
+    best_mask_idx = torch.randint(0, 2, (cfg, num_heads), device=device)
+
+    query_out = torch.empty_like(query)
+    key_out = torch.empty_like(key)
+    value_out = torch.empty_like(value)
+
+    warmup = 10
+    all_iter = 1000
+
+    # warmup (Triton path only; the reference loop further down is timed without warmup)
+    for _ in range(warmup):
+        hunyuan_sparse_head_placement(query, key, value, query_out, key_out, value_out, best_mask_idx, context_length, num_frame, frame_size)
+
+    torch.cuda.synchronize()
+    start = time.time()
+    for _ in range(all_iter):
+        hunyuan_sparse_head_placement(query, key, value, query_out, key_out, value_out, best_mask_idx, context_length, num_frame, frame_size)
+    torch.cuda.synchronize()
+    end = time.time()
+
+    print(f"Triton Elapsed Time: {(end - start) / all_iter * 1e3:.2f} ms")
+    print(f"Triton Total Bandwidth: {query.nelement() * query.element_size() * 3 * 2 * all_iter / (end - start) / 1e9:.2f} GB/s")
+
+    torch.cuda.synchronize()
+    start = time.time()
+    for _ in range(all_iter):
+        ref_hunyuan_sparse_head_placement(query, key, value, best_mask_idx, context_length, num_frame, frame_size)
+    torch.cuda.synchronize()
+    end = time.time()
+
+    print(f"Reference Elapsed Time: {(end - start) / all_iter * 1e3:.2f} ms")
+    print(f"Reference Total Bandwidth: {query.nelement() * query.element_size() * 3 * 2 * all_iter / (end - start) / 1e9:.2f} GB/s")
+
+
+@triton.jit
+def hunyuan_hidden_states_placement_kernel(
+    hidden_states_ptr, # [cfg, num_heads, seq_len, head_dim] seq_len = context_length + num_frame * frame_size
+    hidden_states_out_ptr, # [cfg, num_heads, seq_len, head_dim]
+    best_mask_idx_ptr, # [cfg, num_heads]
+    hidden_states_stride_b, hidden_states_stride_h, hidden_states_stride_s, hidden_states_stride_d,
+    mask_idx_stride_b, mask_idx_stride_h,
+    seq_len: tl.constexpr,
+    head_dim: tl.constexpr,
+    context_length: tl.constexpr,
+    num_frame: tl.constexpr,
+    frame_size: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr
+):
+    # Copy one BLOCK_SIZE-token slice of hidden_states to the output (addressed with hidden_states' strides).
+    # range: [b, h, block_id * block_size: block_id * block_size + block_size, :]
+    cfg = tl.program_id(0)
+    head = tl.program_id(1)
+    block_id = tl.program_id(2)
+
+    start_id = block_id * BLOCK_SIZE
+    # Tokens past seq_len in the last block are excluded by offset_mask below,
+    # so no explicit end index is needed.
+
+    # Load best mask idx (0 is spatial, 1 is temporal)
+    is_temporal = tl.load(best_mask_idx_ptr + cfg * mask_idx_stride_b + head * mask_idx_stride_h)
+
+    offset_token = tl.arange(0, BLOCK_SIZE) + start_id
+    offset_mask = offset_token < seq_len
+    offset_d = tl.arange(0, head_dim)
+
+    if is_temporal:
+        patch_id = offset_token // num_frame
+        frame_id = offset_token - patch_id * num_frame
+        offset_store_token = tl.where(offset_token >= seq_len - context_length, offset_token, frame_id * frame_size + patch_id)
+
+        offset_load = (cfg * hidden_states_stride_b + head * hidden_states_stride_h + offset_token[:,None] * hidden_states_stride_s) + offset_d[None,:] * hidden_states_stride_d
+        offset_hidden_states = hidden_states_ptr + offset_load
+
+        offset_store = (cfg * hidden_states_stride_b + head * hidden_states_stride_h + offset_store_token[:,None] * hidden_states_stride_s) + offset_d[None,:] * hidden_states_stride_d
+        offset_hidden_states_out = hidden_states_out_ptr + offset_store
+
+        # Temporal head: (patch, frame) tokens land at frame * frame_size + patch; the last context_length tokens keep their index.
+        hidden_states = tl.load(offset_hidden_states, mask=offset_mask[:,None])
+        tl.store(offset_hidden_states_out, hidden_states, mask=offset_mask[:,None])
+    else:
+        offset_load = (cfg * hidden_states_stride_b + head * hidden_states_stride_h + offset_token[:,None] * hidden_states_stride_s) + offset_d[None,:] * hidden_states_stride_d
+        offset_hidden_states = hidden_states_ptr + offset_load
+
+        offset_store = offset_load
+        offset_hidden_states_out = hidden_states_out_ptr + offset_store
+
+        # Spatial head: plain copy to the same token index.
+        hidden_states = tl.load(offset_hidden_states, mask=offset_mask[:,None])
+        tl.store(offset_hidden_states_out, hidden_states, mask=offset_mask[:,None])
+
+
+def hunyuan_hidden_states_placement(hidden_states, hidden_states_out, best_mask_idx, context_length, num_frame, frame_size):
+    cfg, num_heads, seq_len, head_dim = hidden_states.shape
+    BLOCK_SIZE = 128
+    assert seq_len == context_length + num_frame * frame_size
+    # One program per (cfg, head, BLOCK_SIZE-token block); the ceil-division covers the partial tail block.
+    grid = (cfg, num_heads, (seq_len + BLOCK_SIZE - 1) // BLOCK_SIZE)
+    # Only hidden_states' strides are passed; the kernel uses them for the output as well,
+    # so hidden_states_out must share hidden_states' layout.
+    hunyuan_hidden_states_placement_kernel[grid](
+        hidden_states,
+        hidden_states_out,
+        best_mask_idx,
+        hidden_states.stride(0), hidden_states.stride(1), hidden_states.stride(2), hidden_states.stride(3),
+        best_mask_idx.stride(0), best_mask_idx.stride(1),
+        seq_len, head_dim, context_length, num_frame, frame_size,
+        BLOCK_SIZE
+    )
+    # Written in place by the kernel; returned for convenience.
+    return hidden_states_out
+
+def ref_hunyuan_hidden_states_placement(hidden_states, output_hidden_states, best_mask_idx, context_length, num_frame, frame_size):
+    cfg, num_heads, seq_len, head_dim = hidden_states.shape
+    assert seq_len == context_length + num_frame * frame_size
+    # PyTorch reference: fills output_hidden_states in place and returns nothing.
+    # Spatial heads (best_mask_idx == 0): copied unchanged.
+    output_hidden_states[best_mask_idx == 0] = hidden_states[best_mask_idx == 0]
+    # Temporal heads (best_mask_idx == 1): video tokens reordered from token major back to frame major.
+    output_hidden_states[best_mask_idx == 1] = hunyuan_token_reorder_to_frame_major(hidden_states[best_mask_idx == 1].unsqueeze(0), context_length, num_frame * frame_size, num_frame, frame_size).squeeze(0)
+
+def test_hunyuan_hidden_states_placement():
+    """Compare the Triton hidden-states placement against the PyTorch reference on random bfloat16 CUDA tensors."""
+    context_length = 226
+    num_frame = 11
+    frame_size = 4080
+
+    cfg = 2
+    num_heads = 48
+
+    seq_len = context_length + num_frame * frame_size
+    head_dim = 64
+
+    dtype = torch.bfloat16
+    device = torch.device("cuda")
+
+    hidden_states = torch.randn(cfg, num_heads, seq_len, head_dim, dtype=dtype, device=device)
+    best_mask_idx = torch.randint(0, 2, (cfg, num_heads), device=device)
+
+    hidden_states_out1 = torch.empty_like(hidden_states)
+    hidden_states_out2 = torch.empty_like(hidden_states)
+
+    hunyuan_hidden_states_placement(hidden_states, hidden_states_out1, best_mask_idx, context_length, num_frame, frame_size)
+    ref_hunyuan_hidden_states_placement(hidden_states, hidden_states_out2, best_mask_idx, context_length, num_frame, frame_size)
+
+    torch.testing.assert_close(hidden_states_out1, hidden_states_out2)
+
+def benchmark_hunyuan_hidden_states_placement():
+    import time
+    # Times the Triton placement and the PyTorch reference on CUDA; prints ms per iteration and GB/s for each.
+    context_length = 226
+    num_frame = 11
+    frame_size = 4080
+
+    cfg = 2
+    num_heads = 48
+
+    seq_len = context_length + num_frame * frame_size
+    head_dim = 64
+
+    dtype = torch.bfloat16
+    device = torch.device("cuda")
+
+    hidden_states = torch.randn(cfg, num_heads, seq_len, head_dim, dtype=dtype, device=device)
+    best_mask_idx = torch.randint(0, 2, (cfg, num_heads), device=device)
+
+    hidden_states_out = torch.empty_like(hidden_states)
+
+    warmup = 10
+    all_iter = 1000
+
+    # warmup (Triton path only; the reference loop is timed without warmup and includes a clone() per iteration)
+    for _ in range(warmup):
+        hunyuan_hidden_states_placement(hidden_states, hidden_states_out, best_mask_idx, context_length, num_frame, frame_size)
+
+    torch.cuda.synchronize()
+    start = time.time()
+    for _ in range(all_iter):
+        hunyuan_hidden_states_placement(hidden_states, hidden_states_out, best_mask_idx, context_length, num_frame, frame_size)
+    torch.cuda.synchronize()
+    end = time.time()
+
+    print(f"Triton Elapsed Time: {(end - start) / all_iter * 1e3:.2f} ms")
+    print(f"Triton Total Bandwidth: {hidden_states.nelement() * hidden_states.element_size() * 2 * all_iter / (end - start) / 1e9:.2f} GB/s")
+
+    torch.cuda.synchronize()
+    start = time.time()
+    for _ in range(all_iter):
+        ref_hunyuan_hidden_states_placement(hidden_states, hidden_states.clone(), best_mask_idx, context_length, num_frame, frame_size)
+    torch.cuda.synchronize()
+    end = time.time()
+
+    print(f"Reference Elapsed Time: {(end - start) / all_iter * 1e3:.2f} ms")
+    print(f"Reference Total Bandwidth: {hidden_states.nelement() * hidden_states.element_size() * 2 * all_iter / (end - start) / 1e9:.2f} GB/s")
+
+
+if __name__ == "__main__":
+ test_hunyuan_sparse_head_placement()
+ benchmark_hunyuan_sparse_head_placement()
+ test_hunyuan_hidden_states_placement()
+ benchmark_hunyuan_hidden_states_placement()
diff --git a/hyvideo/modules/posemb_layers.py b/hyvideo/modules/posemb_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..3428568c3cf0a3551d143ba640d1dee0f4ac4071
--- /dev/null
+++ b/hyvideo/modules/posemb_layers.py
@@ -0,0 +1,475 @@
+import torch
+from typing import Union, Tuple, List, Optional
+import numpy as np
+
+
+###### Thanks to the RifleX project (https://github.com/thu-ml/RIFLEx/) for this alternative pos embed for long videos
+#
def get_1d_rotary_pos_embed_riflex(
    dim: int,
    pos: Union[np.ndarray, torch.Tensor, int],
    theta: float = 10000.0,
    use_real=False,
    k: Optional[int] = None,
    L_test: Optional[int] = None,
):
    """
    RIFLEx: precompute 1-D rotary position embeddings, optionally lowering one frequency.

    The standard RoPE frequencies ``1 / theta ** (2i / dim)`` are computed for ``dim // 2``
    components. When ``k`` is given, the ``k``-th component (1-based) is replaced by
    ``0.9 * 2 * pi / L_test`` before the frequencies are multiplied by the positions.

    Args:
        dim (`int`): Dimension of the frequency tensor. Must be even.
        pos (`np.ndarray`, `torch.Tensor` or `int`): Position indices [S]. An int ``n`` is
            expanded to ``torch.arange(n)``.
        theta (`float`, *optional*, defaults to 10000.0): Base of the frequency schedule.
        use_real (`bool`, *optional*):
            If True, return the cosine and sine parts separately. Otherwise, return complex numbers.
        k (`int`, *optional*, defaults to None): 1-based index of the intrinsic frequency in RoPE.
            When None, no frequency is modified.
        L_test (`int`, *optional*, defaults to None): the number of frames for inference.
            Required whenever ``k`` is given.
    Returns:
        If ``use_real`` is True: a ``(cos, sin)`` tuple of float32 tensors, each [S, D], where
        every frequency is repeated twice along the last axis.
        Otherwise: a complex tensor of unit-modulus exponentials. [S, D/2]
    Raises:
        AssertionError: if ``dim`` is odd.
        ValueError: if ``k`` is given without ``L_test``, or ``k`` is outside ``1..dim // 2``.
    """
    assert dim % 2 == 0

    if k is not None:
        # Fail early with a clear message instead of a TypeError from dividing by None,
        # or a silent write to the wrong component through negative indexing.
        if L_test is None:
            raise ValueError("L_test must be provided when k is set")
        if not 1 <= k <= dim // 2:
            raise ValueError(f"k must be in [1, {dim // 2}], but got {k}")

    if isinstance(pos, int):
        pos = torch.arange(pos)
    if isinstance(pos, np.ndarray):
        pos = torch.from_numpy(pos)  # type: ignore  # [S]
    if not pos.is_floating_point():
        # Integer positions would be promoted to float32 by the product below anyway;
        # casting explicitly avoids relying on torch.outer's dtype promotion.
        pos = pos.float()

    freqs = 1.0 / (
        theta ** (torch.arange(0, dim, 2, device=pos.device)[: (dim // 2)].float() / dim)
    )  # [D/2]

    # === Riflex modification start ===
    # Reduce the intrinsic frequency to stay within a single period after extrapolation (see Eq. (8)).
    # Empirical observations show that a few videos may exhibit repetition in the tail frames.
    # To be conservative, we multiply by 0.9 to keep the extrapolated length below 90% of a single period.
    if k is not None:
        freqs[k - 1] = 0.9 * 2 * torch.pi / L_test
    # === Riflex modification end ===

    freqs = torch.outer(pos, freqs)  # type: ignore  # [S, D/2]
    if use_real:
        freqs_cos = freqs.cos().repeat_interleave(2, dim=1).float()  # [S, D]
        freqs_sin = freqs.sin().repeat_interleave(2, dim=1).float()  # [S, D]
        return freqs_cos, freqs_sin

    # lumina
    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)  # [S, D/2]
    return freqs_cis
+
def identify_k(b: float, d: int, N: int):
    """
    Locate the intrinsic frequency component of a RoPE-based pre-trained diffusion transformer.

    Args:
        b (`float`): The base frequency for RoPE.
        d (`int`): Dimension of the frequency tensor
        N (`int`): the first observed repetition frame in latent space
    Returns:
        k (`int`): the 1-based index of the intrinsic frequency component
        N_k (`int`): the period of that component in latent space
    Example:
        In HunyuanVideo, b=256 and d=16, the repetition occurs approximately 8s (N=48 in latent space).
            k, N_k = identify_k(b=256, d=16, N=48)
        In this case, the intrinsic frequency index k is 4, and the period N_k is 50.
    """

    def period_of(component):
        # Rounded period of one RoPE frequency, according to Eq.(4); component is zero-based.
        theta_j = 1.0 / (b ** (2 * component / d))
        return round(2 * torch.pi / theta_j)

    periods = [period_of(component) for component in range(d // 2)]

    # The intrinsic frequency is the one whose period is closest to N (see Eq.(7));
    # min() keeps the first candidate on ties.
    best = min(range(len(periods)), key=lambda idx: abs(periods[idx] - N))
    return best + 1, periods[best]
+
def _to_tuple(x, dim=2):
    """Return an int repeated `dim` times as a tuple, or `x` itself when it already has length `dim`.

    Raises:
        ValueError: if `x` is not an int and its length differs from `dim`.
    """
    if isinstance(x, int):
        return (x,) * dim
    if len(x) != dim:
        raise ValueError(f"Expected length {dim} or int, but got {x}")
    return x
+
+
def get_meshgrid_nd(start, *args, dim=2):
    """
    Build an n-D meshgrid from start, stop and num.

    Args:
        start (int or tuple): If len(args) == 0, start is num; If len(args) == 1, start is start, args[0] is stop,
            step is 1; If len(args) == 2, start is start, args[0] is stop, args[1] is num. For n-dim, start/stop/num
            should be int or n-tuple. If n-tuple is provided, the meshgrid will be stacked following the dim order in
            n-tuples.
        *args: See above.
        dim (int): Dimension of the meshgrid. Defaults to 2.

    Returns:
        grid (torch.Tensor): float32 tensor of shape [dim, ...]

    Raises:
        ValueError: if more than two extra positional arguments are given.
    """
    n_extra = len(args)
    if n_extra > 2:
        raise ValueError(f"len(args) should be 0, 1 or 2, but got {len(args)}")

    if n_extra == 0:
        # start is grid_size
        num = _to_tuple(start, dim=dim)
        start = (0,) * dim
        stop = num
    elif n_extra == 1:
        # start is start, args[0] is stop, step is 1
        start = _to_tuple(start, dim=dim)
        stop = _to_tuple(args[0], dim=dim)
        num = [hi - lo for lo, hi in zip(start, stop)]
    else:
        # start is start, args[0] is stop, args[1] is num
        start = _to_tuple(start, dim=dim)  # Left-Top       eg: 12,0
        stop = _to_tuple(args[0], dim=dim)  # Right-Bottom   eg: 20,32
        num = _to_tuple(args[1], dim=dim)  # Target Size    eg: 32,124

    # PyTorch implement of np.linspace(start[i], stop[i], num[i], endpoint=False)
    axis_grid = [
        torch.linspace(lo, hi, count + 1, dtype=torch.float32)[:count]
        for lo, hi, count in zip(start, stop, num)
    ]
    # One coordinate tensor per axis, stacked along a new leading axis: [dim, W, H, D]
    return torch.stack(torch.meshgrid(*axis_grid, indexing="ij"), dim=0)
+
+
+#################################################################################
+# Rotary Positional Embedding Functions #
+#################################################################################
+# https://github.com/meta-llama/llama/blob/be327c427cc5e89cc1d3ab3d3fec4484df771245/llama/model.py#L80
+
+
def reshape_for_broadcast(
    freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
    x: torch.Tensor,
    head_first=False,
):
    """
    View the frequency tensor(s) so they broadcast against the target tensor 'x'.

    Every axis of the returned view has size 1, except the sequence axis and the last
    axis, which keep the sizes they have in 'x'. The sequence axis is the second-to-last
    axis when head_first is True and axis 1 otherwise.

    Notes:
        When using FlashMHAModified, head_first should be False.
        When using Attention, head_first should be True.

    Args:
        freqs_cis (Union[torch.Tensor, Tuple[torch.Tensor]]): Frequency tensor to be reshaped,
            either a single tensor (complex space) or a (cos, sin) tuple in real space.
        x (torch.Tensor): Target tensor for broadcasting compatibility.
        head_first (bool): head dimension first (except batch dim) or not.

    Returns:
        The reshaped tensor, or a (cos, sin) tuple of reshaped tensors when a tuple was given.

    Raises:
        AssertionError: If the frequency tensor doesn't match the expected shape.
        AssertionError: If the target tensor 'x' doesn't have the expected number of dimensions.
    """
    ndim = x.ndim
    assert 0 <= 1 < ndim

    is_pair = isinstance(freqs_cis, tuple)
    # For a (cos, sin) pair only the first element is shape-checked.
    reference = freqs_cis[0] if is_pair else freqs_cis

    seq_axis = ndim - 2 if head_first else 1
    last_axis = ndim - 1
    assert reference.shape == (
        x.shape[seq_axis],
        x.shape[-1],
    ), f"freqs_cis shape {reference.shape} does not match x shape {x.shape}"

    shape = [
        size if axis == seq_axis or axis == last_axis else 1
        for axis, size in enumerate(x.shape)
    ]
    if is_pair:
        return freqs_cis[0].view(*shape), freqs_cis[1].view(*shape)
    return freqs_cis.view(*shape)
+
+
def rotate_half(x):
    """
    Rotate each adjacent pair of the last axis by 90 degrees: (a, b) -> (-b, a).

    The last axis is read as interleaved (real, imag) pairs. The input is cast to
    float32 and the result has the same shape as `x`.

    Args:
        x (torch.Tensor): Tensor whose last axis has even length, e.g. [B, S, H, D].

    Returns:
        torch.Tensor: float32 tensor with the same shape as `x`.
    """
    x_real, x_imag = (
        x.float().reshape(*x.shape[:-1], -1, 2).unbind(-1)
    )  # [..., D//2]
    # Merge the trailing (D//2, 2) axes back into D. flatten(-2) equals the former
    # flatten(3) for 4-D inputs and also keeps the shape right for any other rank.
    return torch.stack([-x_imag, x_real], dim=-1).flatten(-2)
+
+
def apply_rotary_emb(
    qklist,
    freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
    head_first: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Apply rotary embeddings to a query/key pair using the given frequency tensor.

    The query and key are taken from 'qklist', which is emptied so that this function
    holds the only references it needs. With a (cos, sin) tuple the rotation is computed
    in float32 with in-place arithmetic and cast back to the query dtype; with a complex
    tensor the inputs are viewed as complex numbers and multiplied by 'freqs_cis'.

    Args:
        qklist (list): Two-element list [xq, xk] with the query and key tensors, each
            [B, S, H, D]. The list is cleared by this call.
        freqs_cis (torch.Tensor or tuple): Precomputed frequency tensor for complex exponential,
            or a (cos, sin) tuple in real space.
        head_first (bool): head dimension first (except batch dim) or not.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.

    """
    query, key = qklist
    qklist.clear()

    if not isinstance(freqs_cis, tuple):
        # view_as_complex will pack [..., D/2, 2](real) to [..., D/2](complex)
        query_c = torch.view_as_complex(
            query.float().reshape(*query.shape[:-1], -1, 2)
        )  # [B, S, H, D//2]
        rotation = reshape_for_broadcast(freqs_cis, query_c, head_first).to(
            query.device
        )  # [S, D//2] --> [1, S, 1, D//2]
        # (real, imag) * (cos, sin) = (real * cos - imag * sin, imag * cos + real * sin)
        # view_as_real will expand [..., D/2](complex) to [..., D/2, 2](real)
        query_out = torch.view_as_real(query_c * rotation).flatten(3).type_as(query)
        key_c = torch.view_as_complex(
            key.float().reshape(*key.shape[:-1], -1, 2)
        )  # [B, S, H, D//2]
        key_out = torch.view_as_real(key_c * rotation).flatten(3).type_as(key)
        return query_out, key_out

    cos, sin = reshape_for_broadcast(freqs_cis, query, head_first)  # [S, D]
    cos, sin = cos.to(query.device), sin.to(query.device)
    out_dtype = query.dtype

    # out = x * cos + rotate_half(x) * sin, i.e.
    #   real * cos - imag * sin
    #   imag * cos + real * sin
    # The statement order below is kept on purpose: each input reference is dropped
    # before its rotated copy is built, and the products are accumulated in place.
    query_out = query.to(torch.float)
    query = None
    rotated = rotate_half(query_out)
    query_out *= cos
    rotated *= sin
    query_out += rotated
    del rotated
    query_out = query_out.to(out_dtype)

    key_out = key.to(torch.float)
    key = None
    rotated = rotate_half(key_out)
    key_out *= cos
    rotated *= sin
    key_out += rotated
    del rotated
    # The key is cast to the query's dtype as well.
    key_out = key_out.to(out_dtype)

    return query_out, key_out
+
def get_nd_rotary_pos_embed_new(rope_dim_list, start, *args, theta=10000., use_real=False,
                                theta_rescale_factor: Union[float, List[float]]=1.0,
                                interpolation_factor: Union[float, List[float]]=1.0,
                                concat_dict=None
                                ):
    """
    n-d rotary position embedding with an optional extra slice prepended to the grid.

    Args:
        rope_dim_list (list of int): Dimension of each rope. len(rope_dim_list) should equal to n.
        start (int | tuple of int | list of int): Forwarded with *args to get_meshgrid_nd
            (num, or start/stop, or start/stop/num).
        *args: See above.
        theta (float): Scaling factor for frequency computation. Defaults to 10000.0.
        use_real (bool): If True, return real part and imaginary part separately. Otherwise, return complex numbers.
        theta_rescale_factor (float or list of float): Rescale factor for theta, one per axis or a
            single value shared by all axes. Defaults to 1.0.
        interpolation_factor (float or list of float): Position scale, one per axis or a single
            value shared by all axes. Defaults to 1.0.
        concat_dict (dict, optional): When non-empty, must hold 'mode' and 'bias'. For mode
            'timecat' or 'timecat-w' a copy of the first slice along grid axis 1 is prepended,
            with its axis-0 coordinate set to 'bias'. 'timecat-w' additionally shifts the
            axis-2 coordinate of that slice by start[-1], so it needs a 3-axis grid and a
            subscriptable `start`. Any other mode leaves the grid unchanged.
            None (the default) and {} both disable this step.

    Returns:
        (cos, sin) tuple when use_real is True, otherwise a single tensor; the per-axis
        embeddings are concatenated along dim 1.

    Raises:
        AssertionError: if a factor list does not have one entry per rope axis.
    """
    n_axes = len(rope_dim_list)

    def per_axis(factor, message):
        # Expand a scalar or single-element list to one value per rope axis.
        if isinstance(factor, (int, float)):
            return [factor] * n_axes
        if isinstance(factor, list) and len(factor) == 1:
            return [factor[0]] * n_axes
        assert len(factor) == n_axes, message
        return factor

    grid = get_meshgrid_nd(start, *args, dim=n_axes)   # [3, W, H, D] / [2, W, H]

    # A None default replaces the former shared mutable `{}` default.
    if concat_dict:
        mode = concat_dict['mode']
        if mode in ('timecat', 'timecat-w'):
            bias = grid[:, :1].clone()
            bias[0] = concat_dict['bias'] * torch.ones_like(bias[0])
            if mode == 'timecat-w':
                bias[2] += start[-1]  ## ref https://github.com/Yuanshi9815/OminiControl/blob/main/src/generate.py#L178
            grid = torch.cat([bias, grid], dim=1)

    theta_rescale_factor = per_axis(
        theta_rescale_factor, "len(theta_rescale_factor) should equal to len(rope_dim_list)"
    )
    interpolation_factor = per_axis(
        interpolation_factor, "len(interpolation_factor) should equal to len(rope_dim_list)"
    )

    # use 1/ndim of dimensions to encode grid_axis
    embs = [
        get_1d_rotary_pos_embed(rope_dim_list[i], grid[i].reshape(-1), theta, use_real=use_real,
                                theta_rescale_factor=theta_rescale_factor[i],
                                interpolation_factor=interpolation_factor[i])   # 2 x [WHD, rope_dim_list[i]]
        for i in range(n_axes)
    ]

    if use_real:
        cos = torch.cat([emb[0] for emb in embs], dim=1)    # (WHD, D/2)
        sin = torch.cat([emb[1] for emb in embs], dim=1)    # (WHD, D/2)
        return cos, sin
    return torch.cat(embs, dim=1)    # (WHD, D/2)
+
def get_nd_rotary_pos_embed(
    rope_dim_list,
    start,
    *args,
    theta=10000.0,
    use_real=False,
    theta_rescale_factor: Union[float, List[float]] = 1.0,
    interpolation_factor: Union[float, List[float]] = 1.0,
    k = 4,
    L_test = 66,
    enable_riflex = True
):
    """
    This is a n-d version of precompute_freqs_cis, which is a RoPE for tokens with n-d structure.

    Args:
        rope_dim_list (list of int): Dimension of each rope. len(rope_dim_list) should equal to n.
            sum(rope_dim_list) should equal to head_dim of attention layer.
        start (int | tuple of int | list of int): If len(args) == 0, start is num; If len(args) == 1, start is start,
            args[0] is stop, step is 1; If len(args) == 2, start is start, args[0] is stop, args[1] is num.
        *args: See above.
        theta (float): Scaling factor for frequency computation. Defaults to 10000.0.
        use_real (bool): If True, return real part and imaginary part separately. Otherwise, return complex numbers.
            Some libraries such as TensorRT does not support complex64 data type. So it is useful to provide a real
            part and an imaginary part separately.
        theta_rescale_factor (float or list of float): Rescale factor for theta. Defaults to 1.0.
        interpolation_factor (float or list of float): Position scale. Defaults to 1.0.
        k (int): 1-based index of the intrinsic frequency passed to the RIFLEx embedding. Defaults to 4.
        L_test (int): Number of frames for inference, passed to the RIFLEx embedding. Defaults to 66.
        enable_riflex (bool): If True, axis 0 uses get_1d_rotary_pos_embed_riflex; on that axis
            theta_rescale_factor and interpolation_factor are not applied. Defaults to True.

    Returns:
        (cos, sin) tuple when use_real is True, otherwise a single complex tensor; the
        per-axis embeddings are concatenated along dim 1.

    Raises:
        AssertionError: if a factor list does not have one entry per rope axis.
    """
    n_axes = len(rope_dim_list)

    grid = get_meshgrid_nd(
        start, *args, dim=n_axes
    )  # [3, W, H, D] / [2, W, H]

    if isinstance(theta_rescale_factor, (int, float)):
        theta_rescale_factor = [theta_rescale_factor] * n_axes
    elif isinstance(theta_rescale_factor, list) and len(theta_rescale_factor) == 1:
        theta_rescale_factor = [theta_rescale_factor[0]] * n_axes
    assert len(theta_rescale_factor) == len(
        rope_dim_list
    ), "len(theta_rescale_factor) should equal to len(rope_dim_list)"

    if isinstance(interpolation_factor, (int, float)):
        interpolation_factor = [interpolation_factor] * n_axes
    elif isinstance(interpolation_factor, list) and len(interpolation_factor) == 1:
        interpolation_factor = [interpolation_factor[0]] * n_axes
    assert len(interpolation_factor) == len(
        rope_dim_list
    ), "len(interpolation_factor) should equal to len(rope_dim_list)"

    # use 1/ndim of dimensions to encode grid_axis
    embs = []
    for i in range(n_axes):
        positions = grid[i].reshape(-1)
        # `use_real` is forwarded to the 1-D helpers. It used to be hard-coded to True,
        # which made the use_real=False path fail in torch.cat on (cos, sin) tuples.
        if i == 0 and enable_riflex:
            # === RIFLEx modification: apply RIFLEx for time dimension ===
            emb = get_1d_rotary_pos_embed_riflex(
                rope_dim_list[i], positions, theta, use_real=use_real, k=k, L_test=L_test
            )
        else:
            emb = get_1d_rotary_pos_embed(
                rope_dim_list[i],
                positions,
                theta,
                use_real=use_real,
                theta_rescale_factor=theta_rescale_factor[i],
                interpolation_factor=interpolation_factor[i],
            )  # 2 x [WHD, rope_dim_list[i]]
        embs.append(emb)

    if use_real:
        cos = torch.cat([emb[0] for emb in embs], dim=1)  # (WHD, D/2)
        sin = torch.cat([emb[1] for emb in embs], dim=1)  # (WHD, D/2)
        return cos, sin
    return torch.cat(embs, dim=1)  # (WHD, D/2)
+
+
def get_1d_rotary_pos_embed(
    dim: int,
    pos: Union[torch.FloatTensor, int],
    theta: float = 10000.0,
    use_real: bool = False,
    theta_rescale_factor: float = 1.0,
    interpolation_factor: float = 1.0,
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
    """
    Precompute the frequency tensor for complex exponential (cis) with given dimensions.
    (Note: `cis` means `cos + i * sin`, where i is the imaginary unit.)

    The frequencies ``1 / theta ** (2i / dim)`` are multiplied by the (optionally scaled)
    positions to give one angle per position and frequency.

    Args:
        dim (int): Dimension of the frequency tensor.
        pos (int or torch.FloatTensor): Position indices for the frequency tensor. [S] or scalar.
            An int ``n`` is expanded to ``torch.arange(n).float()``.
        theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0.
        use_real (bool, optional): If True, return real part and imaginary part separately.
            Otherwise, return complex numbers.
        theta_rescale_factor (float, optional): Rescale factor for theta. Defaults to 1.0.
            NOTE: a value other than 1.0 divides by ``dim - 2``, so it cannot be combined with dim == 2.
        interpolation_factor (float, optional): Multiplier applied to the positions. Defaults to 1.0.

    Returns:
        freqs_cis: Precomputed frequency tensor with complex exponential. [S, D/2]
        freqs_cos, freqs_sin: Precomputed frequency tensor with real and imaginary parts separately. [S, D]
    """
    if isinstance(pos, int):
        pos = torch.arange(pos).float()

    # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
    # has some connection to NTK literature
    if theta_rescale_factor != 1.0:
        theta *= theta_rescale_factor ** (dim / (dim - 2))

    # Allocate the frequencies on the positions' device (as the RIFLEx variant does),
    # so that a non-CPU `pos` does not fail in torch.outer.
    freqs = 1.0 / (
        theta ** (torch.arange(0, dim, 2, device=pos.device)[: (dim // 2)].float() / dim)
    )  # [D/2]
    freqs = torch.outer(pos * interpolation_factor, freqs)  # [S, D/2]
    if use_real:
        freqs_cos = freqs.cos().repeat_interleave(2, dim=1)  # [S, D]
        freqs_sin = freqs.sin().repeat_interleave(2, dim=1)  # [S, D]
        return freqs_cos, freqs_sin

    freqs_cis = torch.polar(
        torch.ones_like(freqs), freqs
    )  # complex64     # [S, D/2]
    return freqs_cis
diff --git a/hyvideo/modules/token_refiner.py b/hyvideo/modules/token_refiner.py
new file mode 100644
index 0000000000000000000000000000000000000000..c173032f8070bf9f98a80c40f3f722f82ba2f891
--- /dev/null
+++ b/hyvideo/modules/token_refiner.py
@@ -0,0 +1,237 @@
+from typing import Optional
+
+from einops import rearrange
+import torch
+import torch.nn as nn
+
+from .activation_layers import get_activation_layer
+from .attenion import attention
+from .norm_layers import get_norm_layer
+from .embed_layers import TimestepEmbedder, TextProjection
+from .attenion import attention
+from .mlp_layers import MLP
+from .modulate_layers import modulate, apply_gate
+
+
class IndividualTokenRefinerBlock(nn.Module):
    """
    One refiner layer: gated self-attention followed by a gated MLP, each with a residual connection.

    The two gates are produced from the conditioning vector by `adaLN_modulation`, whose
    linear layer is zero-initialized.
    """

    def __init__(
        self,
        hidden_size,
        heads_num,
        mlp_width_ratio: float = 4.0,
        mlp_drop_rate: float = 0.0,
        act_type: str = "silu",
        qk_norm: bool = False,
        qk_norm_type: str = "layer",
        qkv_bias: bool = True,
        dtype: Optional[torch.dtype] = None,
        device: Optional[torch.device] = None,
    ):
        """
        Args:
            hidden_size: Width of the token features.
            heads_num: Number of attention heads; the per-head size is hidden_size // heads_num.
            mlp_width_ratio: MLP hidden width as a multiple of hidden_size.
            mlp_drop_rate: Passed to the MLP as `drop`.
            act_type: Name resolved by get_activation_layer; used in the MLP and the modulation.
            qk_norm: If True, normalize q and k per head; otherwise use nn.Identity.
            qk_norm_type: Name resolved by get_norm_layer for the q/k norms.
            qkv_bias: Whether the qkv and output projections have a bias.
            dtype: Parameter dtype forwarded to every sub-layer.
            device: Parameter device forwarded to every sub-layer.
        """
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.heads_num = heads_num
        head_dim = hidden_size // heads_num
        mlp_hidden_dim = int(hidden_size * mlp_width_ratio)

        self.norm1 = nn.LayerNorm(
            hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs
        )
        self.self_attn_qkv = nn.Linear(
            hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs
        )
        qk_norm_layer = get_norm_layer(qk_norm_type)
        self.self_attn_q_norm = (
            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
            if qk_norm
            else nn.Identity()
        )
        self.self_attn_k_norm = (
            qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
            if qk_norm
            else nn.Identity()
        )
        self.self_attn_proj = nn.Linear(
            hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs
        )

        self.norm2 = nn.LayerNorm(
            hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs
        )
        act_layer = get_activation_layer(act_type)
        self.mlp = MLP(
            in_channels=hidden_size,
            hidden_channels=mlp_hidden_dim,
            act_layer=act_layer,
            drop=mlp_drop_rate,
            **factory_kwargs,
        )

        self.adaLN_modulation = nn.Sequential(
            act_layer(),
            nn.Linear(hidden_size, 2 * hidden_size, bias=True, **factory_kwargs),
        )
        # Zero-initialize the modulation
        nn.init.zeros_(self.adaLN_modulation[1].weight)
        nn.init.zeros_(self.adaLN_modulation[1].bias)

    def forward(
        self,
        x: torch.Tensor,
        c: torch.Tensor,  # timestep_aware_representations + context_aware_representations
        attn_mask: Optional[torch.Tensor] = None,
    ):
        """
        Args:
            x: Token features, [B, L, hidden_size] (the layout the rearrange below expects).
            c: Conditioning vector; its modulation output is split in two along dim 1.
            attn_mask: Optional mask forwarded unchanged to `attention`.

        Returns:
            The updated token features.
        """
        gate_msa, gate_mlp = self.adaLN_modulation(c).chunk(2, dim=1)

        norm_x = self.norm1(x)
        qkv = self.self_attn_qkv(norm_x)
        q, k, v = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num)
        # Apply QK-Norm if needed, then match v's dtype and device
        q = self.self_attn_q_norm(q).to(v)
        k = self.self_attn_k_norm(k).to(v)
        qkv_list = [q, k, v]
        # Drop the local q/k names so that only the list refers to them.
        # NOTE(review): presumably `attention` empties the list to release the tensors early - confirm.
        del q, k
        # Self-Attention
        attn = attention(qkv_list, mode="torch", attn_mask=attn_mask)

        x = x + apply_gate(self.self_attn_proj(attn), gate_msa)

        # FFN Layer
        x = x + apply_gate(self.mlp(self.norm2(x)), gate_mlp)

        return x
+
+
class IndividualTokenRefiner(nn.Module):
    """A stack of `depth` IndividualTokenRefinerBlock layers that share one attention mask."""

    def __init__(
        self,
        hidden_size,
        heads_num,
        depth,
        mlp_width_ratio: float = 4.0,
        mlp_drop_rate: float = 0.0,
        act_type: str = "silu",
        qk_norm: bool = False,
        qk_norm_type: str = "layer",
        qkv_bias: bool = True,
        dtype: Optional[torch.dtype] = None,
        device: Optional[torch.device] = None,
    ):
        super().__init__()
        # Every block is built from the same settings.
        block_kwargs = dict(
            hidden_size=hidden_size,
            heads_num=heads_num,
            mlp_width_ratio=mlp_width_ratio,
            mlp_drop_rate=mlp_drop_rate,
            act_type=act_type,
            qk_norm=qk_norm,
            qk_norm_type=qk_norm_type,
            qkv_bias=qkv_bias,
            device=device,
            dtype=dtype,
        )
        layers = []
        for _ in range(depth):
            layers.append(IndividualTokenRefinerBlock(**block_kwargs))
        self.blocks = nn.ModuleList(layers)

    def forward(
        self,
        x: torch.Tensor,
        c: torch.LongTensor,
        mask: Optional[torch.Tensor] = None,
    ):
        """
        Run `x` through every block with conditioning `c`.

        When `mask` ([batch_size, seq_len]) is given, a pairwise boolean mask of shape
        batch_size x 1 x seq_len x seq_len is built and passed to each block; otherwise
        the blocks receive None.
        """
        pair_mask = None
        if mask is not None:
            batch_size, seq_len = mask.shape[0], mask.shape[1]
            mask = mask.to(x.device)
            # Entry (i, j) is the AND of mask[i] and mask[j]; the size-1 axis broadcasts over heads_num.
            key_side = mask.view(batch_size, 1, 1, seq_len)
            query_side = mask.view(batch_size, 1, seq_len, 1)
            pair_mask = (key_side & query_side).bool()
            # avoids self-attention weight being NaN for padding tokens
            pair_mask[:, :, :, 0] = True

        for layer in self.blocks:
            x = layer(x, c, pair_mask)
        return x
+
+
class SingleTokenRefiner(nn.Module):
    """
    A single token refiner block for llm text embedding refine.

    The conditioning vector is the sum of a timestep embedding and a projection of the
    (optionally mask-weighted) mean of the input tokens.
    """

    def __init__(
        self,
        in_channels,
        hidden_size,
        heads_num,
        depth,
        mlp_width_ratio: float = 4.0,
        mlp_drop_rate: float = 0.0,
        act_type: str = "silu",
        qk_norm: bool = False,
        qk_norm_type: str = "layer",
        qkv_bias: bool = True,
        attn_mode: str = "torch",
        dtype: Optional[torch.dtype] = None,
        device: Optional[torch.device] = None,
    ):
        super().__init__()
        self.attn_mode = attn_mode
        assert self.attn_mode == "torch", "Only support 'torch' mode for token refiner."

        factory_kwargs = {"device": device, "dtype": dtype}
        activation = get_activation_layer(act_type)

        # Projects the raw token features to the refiner width.
        self.input_embedder = nn.Linear(
            in_channels, hidden_size, bias=True, **factory_kwargs
        )
        # Build timestep embedding layer
        self.t_embedder = TimestepEmbedder(hidden_size, activation, **factory_kwargs)
        # Build context embedding layer
        self.c_embedder = TextProjection(
            in_channels, hidden_size, activation, **factory_kwargs
        )

        self.individual_token_refiner = IndividualTokenRefiner(
            hidden_size=hidden_size,
            heads_num=heads_num,
            depth=depth,
            mlp_width_ratio=mlp_width_ratio,
            mlp_drop_rate=mlp_drop_rate,
            act_type=act_type,
            qk_norm=qk_norm,
            qk_norm_type=qk_norm_type,
            qkv_bias=qkv_bias,
            **factory_kwargs,
        )

    def forward(
        self,
        x: torch.Tensor,
        t: torch.LongTensor,
        mask: Optional[torch.LongTensor] = None,
    ):
        """
        Args:
            x: Input tokens, [b, s1, in_channels].
            t: Timesteps passed to the timestep embedder.
            mask: Optional [b, s1] mask; when given, the context vector is the mask-weighted
                mean over tokens instead of the plain mean, and the mask is also passed to
                the refiner stack.

        Returns:
            The refined tokens produced by the refiner stack.
        """
        timestep_emb = self.t_embedder(t)

        if mask is not None:
            weights = mask.float().unsqueeze(-1)  # [b, s1, 1]
            pooled = (x * weights).sum(dim=1) / weights.sum(dim=1)
        else:
            pooled = x.mean(dim=1)
        context_emb = self.c_embedder(pooled.to(x.dtype))
        c = timestep_emb + context_emb

        tokens = self.input_embedder(x)
        return self.individual_token_refiner(tokens, c, mask)
diff --git a/hyvideo/modules/utils.py b/hyvideo/modules/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..02a733e1b04c406193b7801bdcaf0c81f72b0e35
--- /dev/null
+++ b/hyvideo/modules/utils.py
@@ -0,0 +1,43 @@
+"""Mask Mod for Image2Video"""
+
+from math import floor
+import torch
+from torch import Tensor
+
+
+from functools import lru_cache
+from typing import Optional, List
+
+import torch
+from torch.nn.attention.flex_attention import (
+ create_block_mask,
+)
+
+
+@lru_cache
+def create_block_mask_cached(score_mod, B, H, M, N, device="cuda", _compile=False):
+ block_mask = create_block_mask(score_mod, B, H, M, N, device=device, _compile=_compile)
+ return block_mask
+
def generate_temporal_head_mask_mod(context_length: int = 226, prompt_length: int = 226, num_frames: int = 13, token_per_frame: int = 1350, mul: int = 2):
    """Build a mask function `(b, h, q_idx, kv_idx) -> bool tensor` for a temporal attention head.

    Indices below `num_frames * token_per_frame` are video tokens, the next `prompt_length`
    indices are text tokens, and everything at or beyond that is treated as padding.
    A (query, key) pair is allowed when

    * both are real (non-padding) and either they lie within a window of
      `mul * token_per_frame` rounded down to a multiple of 128, or at least one of them
      is a text token; or
    * both are padding.

    `context_length` is accepted but not used by this function.
    """
    video_length = num_frames * token_per_frame
    real_length = video_length + prompt_length
    # Window width, rounded down to a multiple of 128; it depends only on the arguments above.
    window = floor(mul * token_per_frame / 128) * 128

    def temporal_mask_mod(b, h, q_idx, kv_idx):
        both_real = (kv_idx < real_length) & (q_idx < real_length)
        both_padding = (kv_idx >= real_length) & (q_idx >= real_length)

        within_window = torch.abs(q_idx - kv_idx) < window
        key_is_text = (video_length <= kv_idx) & (kv_idx < real_length)
        query_is_text = (video_length <= q_idx) & (q_idx < real_length)

        allowed = within_window | key_is_text | query_is_text
        return (both_real & allowed) | both_padding

    return temporal_mask_mod
diff --git a/hyvideo/prompt_rewrite.py b/hyvideo/prompt_rewrite.py
new file mode 100644
index 0000000000000000000000000000000000000000..974c452a57926b0fc2c50a0e3ce4d86be4b1765a
--- /dev/null
+++ b/hyvideo/prompt_rewrite.py
@@ -0,0 +1,51 @@
+normal_mode_prompt = """Normal mode - Video Recaption Task:
+
+You are a large language model specialized in rewriting video descriptions. Your task is to modify the input description.
+
+0. Preserve ALL information, including style words and technical terms.
+
+1. If the input is in Chinese, translate the entire description to English.
+
+2. If the input is just one or two words describing an object or person, provide a brief, simple description focusing on basic visual characteristics. Limit the description to 1-2 short sentences.
+
+3. If the input does not include style, lighting, atmosphere, you can make reasonable associations.
+
+4. Output ALL must be in English.
+
+Given Input:
+input: "{input}"
+"""
+
+
+master_mode_prompt = """Master mode - Video Recaption Task:
+
+You are a large language model specialized in rewriting video descriptions. Your task is to modify the input description.
+
+0. Preserve ALL information, including style words and technical terms.
+
+1. If the input is in Chinese, translate the entire description to English.
+
+2. If the input is just one or two words describing an object or person, provide a brief, simple description focusing on basic visual characteristics. Limit the description to 1-2 short sentences.
+
+3. If the input does not include style, lighting, atmosphere, you can make reasonable associations.
+
+4. Output ALL must be in English.
+
+Given Input:
+input: "{input}"
+"""
+
def get_rewrite_prompt(ori_prompt, mode="Normal"):
    """Fill the rewrite template for `mode` with the user's original prompt.

    Args:
        ori_prompt: The prompt text substituted for `{input}` in the template.
        mode: "Normal" or "Master"; selects normal_mode_prompt or master_mode_prompt.

    Returns:
        The formatted instruction text.

    Raises:
        ValueError: if `mode` is neither "Normal" nor "Master".
    """
    if mode == "Normal":
        template = normal_mode_prompt
    elif mode == "Master":
        template = master_mode_prompt
    else:
        # The previous message named "Normal" twice and never mentioned "Master".
        raise ValueError(f"Only supports Normal and Master modes, got {mode!r}")
    return template.format(input=ori_prompt)
+
+ori_prompt = "一只小狗在草地上奔跑。"
+normal_prompt = get_rewrite_prompt(ori_prompt, mode="Normal")
+master_prompt = get_rewrite_prompt(ori_prompt, mode="Master")
+
+# Then you can use the normal_prompt or master_prompt to access the hunyuan-large rewrite model to get the final prompt.
\ No newline at end of file
diff --git a/hyvideo/text_encoder/__init__.py b/hyvideo/text_encoder/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..46af6e3fb15489b046a1b5415f0e676bb0196abb
--- /dev/null
+++ b/hyvideo/text_encoder/__init__.py
@@ -0,0 +1,552 @@
+from dataclasses import dataclass
+from typing import Optional, Tuple
+from copy import deepcopy
+import torch
+import torch.nn as nn
+from transformers import (
+ CLIPTextModel,
+ CLIPTokenizer,
+ AutoTokenizer,
+ AutoModel,
+ LlavaForConditionalGeneration,
+ CLIPImageProcessor,
+)
+from transformers.utils import ModelOutput
+
+from ..constants import TEXT_ENCODER_PATH, TOKENIZER_PATH
+from ..constants import PRECISION_TO_TYPE
+
+
+def use_default(value, default):
+    """Return ``default`` when ``value`` is None, otherwise ``value`` itself."""
+    if value is None:
+        return default
+    return value
+
+
+def load_text_encoder(
+    text_encoder_type,
+    text_encoder_precision=None,
+    text_encoder_path=None,
+    device=None,
+):
+    """Load a frozen text encoder of the requested type.
+
+    Args:
+        text_encoder_type (str): One of "clipL", "llm" or "llm-i2v".
+        text_encoder_precision (str, optional): Key into ``PRECISION_TO_TYPE``;
+            when given, the model is cast to that dtype.
+        text_encoder_path (str, optional): Checkpoint location. When None it is
+            looked up in ``TEXT_ENCODER_PATH`` by ``text_encoder_type``.
+        device (optional): When given, the model is moved to this device.
+
+    Returns:
+        tuple: ``(text_encoder, text_encoder_path)``.
+
+    Raises:
+        ValueError: If ``text_encoder_type`` is not supported.
+    """
+    checkpoint = text_encoder_path
+    if checkpoint is None:
+        checkpoint = TEXT_ENCODER_PATH[text_encoder_type]
+
+    # from_pretrained will ensure that the model is in eval mode.
+    if text_encoder_type == "clipL":
+        encoder = CLIPTextModel.from_pretrained(checkpoint)
+        encoder.final_layer_norm = encoder.text_model.final_layer_norm
+    elif text_encoder_type == "llm":
+        encoder = AutoModel.from_pretrained(checkpoint, low_cpu_mem_usage=True)
+        encoder.final_layer_norm = encoder.norm
+    elif text_encoder_type == "llm-i2v":
+        encoder = LlavaForConditionalGeneration.from_pretrained(
+            checkpoint, low_cpu_mem_usage=True
+        )
+    else:
+        raise ValueError(f"Unsupported text encoder type: {text_encoder_type}")
+
+    if text_encoder_precision is not None:
+        target_dtype = PRECISION_TO_TYPE[text_encoder_precision]
+        encoder = encoder.to(dtype=target_dtype)
+
+    # Inference only: no gradients are needed for the encoder weights.
+    encoder.requires_grad_(False)
+
+    if device is not None:
+        encoder = encoder.to(device)
+
+    return encoder, checkpoint
+
+
+def load_tokenizer(
+    tokenizer_type, tokenizer_path=None, padding_side="right"
+):
+    """Load the tokenizer (and, for "llm-i2v", an image processor).
+
+    Args:
+        tokenizer_type (str): One of "clipL", "llm" or "llm-i2v".
+        tokenizer_path (str, optional): Tokenizer location. When None it is
+            looked up in ``TOKENIZER_PATH`` by ``tokenizer_type``.
+        padding_side (str): Forwarded to ``AutoTokenizer.from_pretrained`` for
+            the "llm" and "llm-i2v" types.
+
+    Returns:
+        tuple: ``(tokenizer, tokenizer_path, processor)``; ``processor`` is None
+        unless ``tokenizer_type`` is "llm-i2v".
+
+    Raises:
+        ValueError: If ``tokenizer_type`` is not supported.
+    """
+    source = tokenizer_path
+    if source is None:
+        source = TOKENIZER_PATH[tokenizer_type]
+
+    image_processor = None
+    if tokenizer_type == "clipL":
+        tok = CLIPTokenizer.from_pretrained(source, max_length=77)
+    elif tokenizer_type in ("llm", "llm-i2v"):
+        tok = AutoTokenizer.from_pretrained(source, padding_side=padding_side)
+        if tokenizer_type == "llm-i2v":
+            # The i2v variant also needs an image processor from the same path.
+            image_processor = CLIPImageProcessor.from_pretrained(source)
+    else:
+        raise ValueError(f"Unsupported tokenizer type: {tokenizer_type}")
+
+    return tok, source, image_processor
+
+
+@dataclass
+class TextEncoderModelOutput(ModelOutput):
+    """
+    Container for the results returned by ``TextEncoder.encode``.
+
+    Args:
+        hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states selected by the encoder (the configured output key, or an
+            earlier layer when a skip layer is requested).
+        attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``.
+        hidden_states_list (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        text_outputs (`list`, *optional*):
+            List of decoded texts. No code in this module populates this field.
+    """
+
+    hidden_state: torch.FloatTensor = None
+    attention_mask: Optional[torch.LongTensor] = None
+    hidden_states_list: Optional[Tuple[torch.FloatTensor, ...]] = None
+    text_outputs: Optional[list] = None
+
+
+class TextEncoder(nn.Module):
+    def __init__(
+        self,
+        text_encoder_type: str,
+        max_length: int,
+        text_encoder_precision: Optional[str] = None,
+        text_encoder_path: Optional[str] = None,
+        tokenizer_type: Optional[str] = None,
+        tokenizer_path: Optional[str] = None,
+        output_key: Optional[str] = None,
+        use_attention_mask: bool = True,
+        i2v_mode: bool = False,
+        input_max_length: Optional[int] = None,
+        prompt_template: Optional[dict] = None,
+        prompt_template_video: Optional[dict] = None,
+        hidden_state_skip_layer: Optional[int] = None,
+        apply_final_norm: bool = False,
+        reproduce: bool = False,
+        device=None,
+        image_embed_interleave=2,
+    ):
+        """Load the text encoder model together with its tokenizer.
+
+        Args:
+            text_encoder_type (str): Encoder family. It must contain "t5",
+                "clip", "llm" or "glm"; this also picks the default output key.
+            max_length (int): Maximum token length used when tokenizing.
+            text_encoder_precision (str, optional): Precision name forwarded to
+                ``load_text_encoder`` for non-"llm" encoders.
+            text_encoder_path (str, optional): Location of the model weights.
+            tokenizer_type (str, optional): Defaults to ``text_encoder_type``.
+            tokenizer_path (str, optional): When None, ``load_tokenizer`` picks
+                its own default path.
+            output_key (str, optional): Overrides the default output key.
+            use_attention_mask (bool): Default mask behaviour for ``encode``.
+            i2v_mode (bool): Selects the image-conditioned branch of ``encode``.
+            input_max_length (int, optional): Defaults to ``max_length``.
+            prompt_template (dict, optional): Must hold a "template" entry that
+                contains a ``{}`` placeholder.
+            prompt_template_video (dict, optional): Same contract as
+                ``prompt_template``; requires ``use_attention_mask`` to be True.
+            hidden_state_skip_layer (int, optional): Default skip layer for
+                ``encode``.
+            apply_final_norm (bool): Whether ``encode`` applies the final layer
+                norm to skipped-layer hidden states.
+            reproduce (bool): Stored; ``encode`` derives its ``do_sample``
+                default from it.
+            device (optional): Only forwarded to ``load_text_encoder`` (i.e. for
+                encoder types without "llm" in their name).
+            image_embed_interleave (int): The number of times to interleave the
+                image and text embeddings (used by ``encode`` as the stride when
+                sub-sampling image embeddings). Defaults to 2.
+
+        Raises:
+            AssertionError: If a prompt template is malformed, or a video
+                template is given while ``use_attention_mask`` is not True.
+            ValueError: If ``text_encoder_type`` is not supported.
+        """
+        super().__init__()
+        self.text_encoder_type = text_encoder_type
+        self.max_length = max_length
+        self.precision = text_encoder_precision
+        self.model_path = text_encoder_path
+        self.tokenizer_type = (
+            tokenizer_type if tokenizer_type is not None else text_encoder_type
+        )
+        # None is kept as-is (no fallback to text_encoder_path); load_tokenizer
+        # then resolves its own default path.
+        self.tokenizer_path = tokenizer_path
+        self.use_attention_mask = use_attention_mask
+        if prompt_template_video is not None:
+            assert (
+                use_attention_mask is True
+            ), "Attention mask is True required when training videos."
+        self.input_max_length = (
+            input_max_length if input_max_length is not None else max_length
+        )
+        self.prompt_template = prompt_template
+        self.prompt_template_video = prompt_template_video
+        self.hidden_state_skip_layer = hidden_state_skip_layer
+        self.apply_final_norm = apply_final_norm
+        self.i2v_mode = i2v_mode
+        self.reproduce = reproduce
+        self.image_embed_interleave = image_embed_interleave
+
+        self.use_template = self.prompt_template is not None
+        if self.use_template:
+            assert (
+                isinstance(self.prompt_template, dict)
+                and "template" in self.prompt_template
+            ), f"`prompt_template` must be a dictionary with a key 'template', got {self.prompt_template}"
+            assert "{}" in str(self.prompt_template["template"]), (
+                "`prompt_template['template']` must contain a placeholder `{}` for the input text, "
+                f"got {self.prompt_template['template']}"
+            )
+
+        self.use_video_template = self.prompt_template_video is not None
+        if self.use_video_template:
+            assert (
+                isinstance(self.prompt_template_video, dict)
+                and "template" in self.prompt_template_video
+            ), f"`prompt_template_video` must be a dictionary with a key 'template', got {self.prompt_template_video}"
+            assert "{}" in str(self.prompt_template_video["template"]), (
+                "`prompt_template_video['template']` must contain a placeholder `{}` for the input text, "
+                f"got {self.prompt_template_video['template']}"
+            )
+
+        if "t5" in text_encoder_type:
+            self.output_key = output_key or "last_hidden_state"
+        elif "clip" in text_encoder_type:
+            self.output_key = output_key or "pooler_output"
+        elif "llm" in text_encoder_type or "glm" in text_encoder_type:
+            self.output_key = output_key or "last_hidden_state"
+        else:
+            raise ValueError(f"Unsupported text encoder type: {text_encoder_type}")
+
+        if "llm" in text_encoder_type:
+            from mmgp import offload
+
+            # Non-i2v "llm" encoders are loaded with a fixed config file and the
+            # "model" prefix; i2v encoders use the checkpoint's own config.
+            forced_config_path = (
+                None
+                if "i2v" in text_encoder_type
+                else "ckpts/llava-llama-3-8b/config.json"
+            )
+            self.model = offload.fast_load_transformers_model(
+                self.model_path,
+                forcedConfigPath=forced_config_path,
+                modelPrefix="model" if forced_config_path is not None else None,
+            )
+            if forced_config_path is not None:
+                self.model.final_layer_norm = self.model.norm
+        else:
+            self.model, self.model_path = load_text_encoder(
+                text_encoder_type=self.text_encoder_type,
+                text_encoder_precision=self.precision,
+                text_encoder_path=self.model_path,
+                device=device,
+            )
+
+        self.dtype = self.model.dtype
+        self.device = self.model.device
+
+        self.tokenizer, self.tokenizer_path, self.processor = load_tokenizer(
+            tokenizer_type=self.tokenizer_type,
+            tokenizer_path=self.tokenizer_path,
+            padding_side="right",
+        )
+
+    def __repr__(self):
+        """Return a short "<type> (<precision> - <model path>)" description."""
+        return "{} ({} - {})".format(
+            self.text_encoder_type, self.precision, self.model_path
+        )
+
+    @staticmethod
+    def apply_text_to_template(text, template, prevent_empty_text=True):
+        """
+        Insert the text into a string template.
+
+        Args:
+            text (str): Input text.
+            template (str): Template string; ``text`` is passed to
+                ``template.format`` as its single positional argument.
+            prevent_empty_text (bool): Accepted for API compatibility; this
+                implementation does not read it. Defaults to True.
+
+        Returns:
+            str: The formatted template.
+
+        Raises:
+            TypeError: If ``template`` is not a string.
+        """
+        if not isinstance(template, str):
+            raise TypeError(f"Unsupported template type: {type(template)}")
+        # Will send string to tokenizer. Used for llm
+        return template.format(text)
+
+    def text2tokens(self, text, data_type="image", name=None):
+        """
+        Tokenize the input text.
+
+        Args:
+            text (str or list): Input text. A list passed by the caller is never
+                modified in place.
+            data_type (str): "image" or "video"; selects which prompt template
+                is applied when templates are enabled.
+            name (str, optional): For the "llm-i2v" encoder only, appends
+                "\\nThe <name> looks like" to every prompt.
+
+        Returns:
+            The tokenizer output, padded/truncated to ``self.max_length`` and
+            returned as "pt" tensors.
+
+        Raises:
+            ValueError: If ``data_type`` is not supported.
+            TypeError: If templates are enabled and ``text`` is neither a
+                string nor a list/tuple.
+            NotImplementedError: If ``name`` is given for "llm-i2v" and ``text``
+                is neither a string nor a list.
+        """
+        tokenize_input_type = "str"
+        if self.use_template:
+            if data_type == "image":
+                prompt_template = self.prompt_template["template"]
+            elif data_type == "video":
+                prompt_template = self.prompt_template_video["template"]
+            else:
+                raise ValueError(f"Unsupported data type: {data_type}")
+            if isinstance(text, (list, tuple)):
+                text = [
+                    self.apply_text_to_template(one_text, prompt_template)
+                    for one_text in text
+                ]
+                if isinstance(text[0], list):
+                    tokenize_input_type = "list"
+            elif isinstance(text, str):
+                text = self.apply_text_to_template(text, prompt_template)
+                if isinstance(text, list):
+                    tokenize_input_type = "list"
+            else:
+                raise TypeError(f"Unsupported text type: {type(text)}")
+
+        if self.text_encoder_type == "llm-i2v" and name is not None:  # llava-llama-3-8b
+            suffix = '\nThe %s looks like' % name
+            if isinstance(text, list):
+                # Build a new list: without a template `text` is still the
+                # caller's own list, which must not be modified in place.
+                text = [one_text + suffix for one_text in text]
+            elif isinstance(text, str):
+                text = text + suffix
+            else:
+                raise NotImplementedError
+
+        kwargs = dict(
+            truncation=True,
+            max_length=self.max_length,
+            padding="max_length",
+            return_tensors="pt",
+        )
+        if tokenize_input_type == "str":
+            return self.tokenizer(
+                text,
+                return_length=False,
+                return_overflowing_tokens=False,
+                return_attention_mask=True,
+                **kwargs,
+            )
+        elif tokenize_input_type == "list":
+            return self.tokenizer.apply_chat_template(
+                text,
+                add_generation_prompt=True,
+                tokenize=True,
+                return_dict=True,
+                **kwargs,
+            )
+        else:
+            raise ValueError(f"Unsupported tokenize_input_type: {tokenize_input_type}")
+
+    def encode(
+        self,
+        batch_encoding,
+        use_attention_mask=None,
+        output_hidden_states=False,
+        do_sample=None,
+        hidden_state_skip_layer=None,
+        return_texts=False,
+        data_type="image",
+        semantic_images=None,
+        device=None,
+    ):
+        """
+        Run the text encoder on a tokenized batch and return the selected hidden states.
+
+        Args:
+            batch_encoding (dict): Batch encoding from tokenizer. Reads "input_ids", "attention_mask"
+                (when masks are used) and, outside i2v mode, the optional "pixel_value_llava" entry.
+            use_attention_mask (bool): Whether to use attention mask. If None, use self.use_attention_mask.
+                Defaults to None.
+            output_hidden_states (bool): Whether to output hidden states. If False, return the value of
+                self.output_key. If True, return the entire output. If set self.hidden_state_skip_layer,
+                output_hidden_states will be set True. Defaults to False.
+            do_sample (bool): Whether to sample from the model. Used for Decoder-Only LLMs. Defaults to None.
+                When self.reproduce is False, do_sample is set to True by default. The resolved value is
+                not read anywhere else in this method.
+            hidden_state_skip_layer (int): Number of hidden states to hidden_state_skip_layer. 0 means the last layer.
+                If None, self.output_key will be used. Defaults to None.
+            return_texts (bool): Whether to return the decoded texts. Defaults to False. Not read by this method.
+            data_type (str): "image" or "video"; selects the prompt template used for cropping. In i2v mode
+                with a template, only "video" is supported.
+            semantic_images (PIL.Image): The reference images for i2v models; passed to self.processor
+                when self.i2v_mode is True.
+            device: Device the attention mask (and, in i2v mode, the inputs) are moved to.
+                Defaults to self.model.device.
+
+        Returns:
+            TextEncoderModelOutput: hidden states and attention mask, plus all hidden states when
+            output_hidden_states is True.
+
+        Note:
+            The image/text interleave factor is not a parameter here; it is read from
+            self.image_embed_interleave.
+        """
+        device = self.model.device if device is None else device
+        use_attention_mask = use_default(use_attention_mask, self.use_attention_mask)
+        hidden_state_skip_layer = use_default(
+            hidden_state_skip_layer, self.hidden_state_skip_layer
+        )
+        do_sample = use_default(do_sample, not self.reproduce)
+        if not self.i2v_mode:
+            attention_mask = (
+                batch_encoding["attention_mask"].to(device)
+                if use_attention_mask
+                else None
+            )
+
+            # Forward pixel values only when the batch carries them.
+            if 'pixel_value_llava' in batch_encoding:
+                outputs = self.model(
+                    input_ids=batch_encoding["input_ids"].to(self.model.device),
+                    attention_mask=attention_mask,
+                    pixel_values=batch_encoding["pixel_value_llava"].to(self.model.device),
+                    output_hidden_states=output_hidden_states or hidden_state_skip_layer is not None)
+            else:
+                outputs = self.model(
+                    input_ids=batch_encoding["input_ids"].to(self.model.device),
+                    attention_mask=attention_mask,
+                    output_hidden_states=output_hidden_states or hidden_state_skip_layer is not None,)
+
+            if hidden_state_skip_layer is not None:
+                last_hidden_state = outputs.hidden_states[
+                    -(hidden_state_skip_layer + 1)
+                ]
+                # Real last hidden state already has layer norm applied. So here we only apply it
+                # for intermediate layers.
+                if hidden_state_skip_layer > 0 and self.apply_final_norm:
+                    last_hidden_state = self.model.final_layer_norm(last_hidden_state)
+            else:
+                last_hidden_state = outputs[self.output_key]
+
+            # Remove hidden states of instruction tokens, only keep prompt tokens.
+            if self.use_template:
+                if data_type == "image":
+                    crop_start = self.prompt_template.get("crop_start", -1)
+                elif data_type == "video":
+                    crop_start = self.prompt_template_video.get("crop_start", -1)
+                else:
+                    raise ValueError(f"Unsupported data type: {data_type}")
+                if crop_start > 0:
+                    last_hidden_state = last_hidden_state[:, crop_start:]
+                    attention_mask = (
+                        attention_mask[:, crop_start:] if use_attention_mask else None
+                    )
+
+            if output_hidden_states:
+                return TextEncoderModelOutput(
+                    last_hidden_state, attention_mask, outputs.hidden_states
+                )
+            return TextEncoderModelOutput(last_hidden_state, attention_mask)
+        else:
+            # i2v mode: the reference images are preprocessed here and fed to the model as pixel values.
+            image_outputs = self.processor(semantic_images, return_tensors="pt")[
+                "pixel_values"
+            ].to(device)
+            attention_mask = (
+                batch_encoding["attention_mask"].to(device)
+                if use_attention_mask
+                else None
+            )
+            outputs = self.model(
+                input_ids=batch_encoding["input_ids"].to(device),
+                attention_mask=attention_mask,
+                output_hidden_states=output_hidden_states
+                or hidden_state_skip_layer is not None,
+                pixel_values=image_outputs,
+            )
+            if hidden_state_skip_layer is not None:
+                last_hidden_state = outputs.hidden_states[
+                    -(hidden_state_skip_layer + 1)
+                ]
+                # Real last hidden state already has layer norm applied. So here we only apply it
+                # for intermediate layers.
+                if hidden_state_skip_layer > 0 and self.apply_final_norm:
+                    last_hidden_state = self.model.final_layer_norm(last_hidden_state)
+            else:
+                last_hidden_state = outputs[self.output_key]
+            if self.use_template:
+                if data_type == "video":
+                    crop_start = self.prompt_template_video.get("crop_start", -1)
+                    # Hidden-state offsets are shifted by (image_emb_len - 1) relative to token offsets.
+                    # NOTE(review): presumably because the image placeholder token expands to
+                    # image_emb_len embeddings -- confirm against the model.
+                    text_crop_start = (
+                        crop_start
+                        - 1
+                        + self.prompt_template_video.get("image_emb_len", 576)
+                    )
+                    image_crop_start = self.prompt_template_video.get(
+                        "image_emb_start", 5
+                    )
+                    image_crop_end = self.prompt_template_video.get(
+                        "image_emb_end", 581
+                    )
+                    # Locate every token equal to double_return_token_id (default 271).
+                    batch_indices, last_double_return_token_indices = torch.where(
+                        batch_encoding["input_ids"]
+                        == self.prompt_template_video.get("double_return_token_id", 271)
+                    )
+                    if last_double_return_token_indices.shape[0] == 3:
+                        # in case the prompt is too long
+                        last_double_return_token_indices = torch.cat(
+                            (
+                                last_double_return_token_indices,
+                                torch.tensor([batch_encoding["input_ids"].shape[-1]]),
+                            )
+                        )
+                        batch_indices = torch.cat((batch_indices, torch.tensor([0])))
+                    # Keep only the last match per batch row.
+                    last_double_return_token_indices = (
+                        last_double_return_token_indices.reshape(
+                            batch_encoding["input_ids"].shape[0], -1
+                        )[:, -1]
+                    )
+                    batch_indices = batch_indices.reshape(
+                        batch_encoding["input_ids"].shape[0], -1
+                    )[:, -1]
+                    # The 4 positions ending at the last match are cut out of the text part below.
+                    assistant_crop_start = (
+                        last_double_return_token_indices
+                        - 1
+                        + self.prompt_template_video.get("image_emb_len", 576)
+                        - 4
+                    )
+                    assistant_crop_end = (
+                        last_double_return_token_indices
+                        - 1
+                        + self.prompt_template_video.get("image_emb_len", 576)
+                    )
+                    attention_mask_assistant_crop_start = (
+                        last_double_return_token_indices - 4
+                    )
+                    attention_mask_assistant_crop_end = last_double_return_token_indices
+                else:
+                    raise ValueError(f"Unsupported data type: {data_type}")
+                text_last_hidden_state = []
+
+                text_attention_mask = []
+                image_last_hidden_state = []
+                image_attention_mask = []
+                # Split each sample into its text part (minus the cut-out span) and its image part.
+                for i in range(batch_encoding["input_ids"].shape[0]):
+                    text_last_hidden_state.append(
+                        torch.cat(
+                            [
+                                last_hidden_state[
+                                    i, text_crop_start : assistant_crop_start[i].item()
+                                ],
+                                last_hidden_state[i, assistant_crop_end[i].item() :],
+                            ]
+                        )
+                    )
+                    text_attention_mask.append(
+                        torch.cat(
+                            [
+                                attention_mask[
+                                    i,
+                                    crop_start : attention_mask_assistant_crop_start[
+                                        i
+                                    ].item(),
+                                ],
+                                attention_mask[
+                                    i, attention_mask_assistant_crop_end[i].item() :
+                                ],
+                            ]
+                        )
+                        if use_attention_mask
+                        else None
+                    )
+                    image_last_hidden_state.append(
+                        last_hidden_state[i, image_crop_start:image_crop_end]
+                    )
+                    # Image embeddings get an all-ones mask.
+                    image_attention_mask.append(
+                        torch.ones(image_last_hidden_state[-1].shape[0])
+                        .to(last_hidden_state.device)
+                        .to(attention_mask.dtype)
+                        if use_attention_mask
+                        else None
+                    )
+
+                # NOTE(review): with use_attention_mask False the mask lists hold None and
+                # torch.stack would fail -- confirm this path is never used without masks.
+                text_last_hidden_state = torch.stack(text_last_hidden_state)
+                text_attention_mask = torch.stack(text_attention_mask)
+                image_last_hidden_state = torch.stack(image_last_hidden_state)
+                image_attention_mask = torch.stack(image_attention_mask)
+
+                # Keep every image_embed_interleave-th image embedding (only for factors 1..5).
+                if semantic_images is not None and 0 < self.image_embed_interleave < 6:
+                    image_last_hidden_state = image_last_hidden_state[
+                        :, ::self.image_embed_interleave, :
+                    ]
+                    image_attention_mask = image_attention_mask[
+                        :, ::self.image_embed_interleave
+                    ]
+
+                assert (
+                    text_last_hidden_state.shape[0] == text_attention_mask.shape[0]
+                    and image_last_hidden_state.shape[0]
+                    == image_attention_mask.shape[0]
+                )
+
+                # Final layout: image embeddings first, then text embeddings.
+                last_hidden_state = torch.cat(
+                    [image_last_hidden_state, text_last_hidden_state], dim=1
+                )
+                attention_mask = torch.cat(
+                    [image_attention_mask, text_attention_mask], dim=1
+                )
+            if output_hidden_states:
+                return TextEncoderModelOutput(
+                    last_hidden_state,
+                    attention_mask,
+                    hidden_states_list=outputs.hidden_states,
+                )
+            return TextEncoderModelOutput(last_hidden_state, attention_mask)
+
+    def forward(
+        self,
+        text,
+        use_attention_mask=None,
+        output_hidden_states=False,
+        do_sample=False,
+        hidden_state_skip_layer=None,
+        return_texts=False,
+    ):
+        """Tokenize ``text`` with the default settings and encode the result.
+
+        All keyword arguments are forwarded unchanged to ``encode``.
+        """
+        encode_options = dict(
+            use_attention_mask=use_attention_mask,
+            output_hidden_states=output_hidden_states,
+            do_sample=do_sample,
+            hidden_state_skip_layer=hidden_state_skip_layer,
+            return_texts=return_texts,
+        )
+        tokens = self.text2tokens(text)
+        return self.encode(tokens, **encode_options)
diff --git a/hyvideo/utils/__init__.py b/hyvideo/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/hyvideo/utils/data_utils.py b/hyvideo/utils/data_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7960c365104495a71ee923b87d7a58837f9a476
--- /dev/null
+++ b/hyvideo/utils/data_utils.py
@@ -0,0 +1,90 @@
+import numpy as np
+import math
+from PIL import Image
+import torch
+import copy
+import string
+import random
+
+
+def align_to(value, alignment):
+    """Round a height or width up to the nearest multiple of ``alignment``.
+
+    Args:
+        value (int): height or width
+        alignment (int): target alignment factor
+
+    Returns:
+        int: the smallest multiple of ``alignment`` that is >= ``value``
+    """
+    n_units = math.ceil(value / alignment)
+    return int(n_units * alignment)
+
+
+def black_image(width, height):
+    """Create an all-black RGB image.
+
+    Args:
+        width (int): image width
+        height (int): image height
+
+    Returns:
+        The image returned by ``Image.new`` for mode "RGB", filled with (0, 0, 0).
+    """
+    return Image.new("RGB", (width, height), (0, 0, 0))
+
+
+def get_closest_ratio(height: float, width: float, ratios: list, buckets: list):
+    """Pick the bucket whose aspect ratio is closest to ``height / width``.
+
+    Args:
+        height (float): video height
+        width (float): video width
+        ratios (list): candidate aspect ratios (height / width); a plain list
+            or a numpy array, aligned index-by-index with ``buckets``
+        buckets (list): buckets generate by `generate_crop_size_list`
+
+    Returns:
+        tuple: ``(bucket, ratio)`` for the closest candidate; on ties the first
+        candidate wins.
+    """
+    aspect_ratio = float(height) / float(width)
+    # Converting first lets plain Python lists work: `list - float` used to
+    # raise TypeError even though the signature advertises a list.
+    ratio_array = np.asarray(ratios, dtype=float)
+    closest_ratio_id = int(np.abs(ratio_array - aspect_ratio).argmin())
+    # One lookup serves both return values, so bucket and ratio always match.
+    return buckets[closest_ratio_id], float(ratio_array[closest_ratio_id])
+
+
+def generate_crop_size_list(base_size=256, patch_size=32, max_ratio=4.0):
+    """Enumerate (width, height) crop sizes that fit a fixed patch budget.
+
+    Args:
+        base_size (int, optional): the base size for generate bucket. Defaults to 256.
+        patch_size (int, optional): the stride to generate bucket. Defaults to 32.
+        max_ratio (float, optional): the max ratio between the longer and the
+            shorter side (in patches). Defaults to 4.0.
+
+    Returns:
+        list: ``(width, height)`` tuples in pixels, from widest to tallest
+    """
+    patch_budget = round((base_size / patch_size) ** 2)
+    assert max_ratio >= 1.0
+    sizes = []
+    w_patches, h_patches = patch_budget, 1
+    while w_patches > 0:
+        long_side = max(w_patches, h_patches)
+        short_side = min(w_patches, h_patches)
+        if long_side / short_side <= max_ratio:
+            sizes.append((w_patches * patch_size, h_patches * patch_size))
+        # Grow the height while the grid still fits the budget, else shrink the width.
+        fits_with_taller_grid = (h_patches + 1) * w_patches <= patch_budget
+        if fits_with_taller_grid:
+            h_patches += 1
+        else:
+            w_patches -= 1
+    return sizes
+
+
+def align_floor_to(value, alignment):
+    """Round a height or width down to the nearest multiple of ``alignment``.
+
+    Args:
+        value (int): height or width
+        alignment (int): target alignment factor
+
+    Returns:
+        int: the largest multiple of ``alignment`` that is <= ``value``
+    """
+    n_units = math.floor(value / alignment)
+    return int(n_units * alignment)
diff --git a/hyvideo/utils/file_utils.py b/hyvideo/utils/file_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ba36514534c65ddd3d95b26bc71076bfde7e53f
--- /dev/null
+++ b/hyvideo/utils/file_utils.py
@@ -0,0 +1,70 @@
+import os
+from pathlib import Path
+from einops import rearrange
+
+import torch
+import torchvision
+import numpy as np
+import imageio
+
+# File suffixes of source and configuration files (scripts and YAML).
+CODE_SUFFIXES = {
+    ".py",  # Python codes
+    ".sh",  # Shell scripts
+    ".yaml",
+    ".yml",  # Configuration files
+}
+
+
+def safe_dir(path):
+    """
+    Make sure a directory exists, creating it and any missing parents.
+
+    Args:
+        path (str or Path): Path to the directory.
+
+    Returns:
+        path (Path): Path object of the directory.
+    """
+    directory = Path(path)
+    directory.mkdir(parents=True, exist_ok=True)
+    return directory
+
+
+def safe_file(path):
+    """
+    Make sure the parent directory of a file exists; the file itself is not created.
+
+    Args:
+        path (str or Path): Path to the file.
+
+    Returns:
+        path (Path): Path object of the file.
+    """
+    file_path = Path(path)
+    file_path.parent.mkdir(parents=True, exist_ok=True)
+    return file_path
+
+def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=1, fps=24):
+    """save videos by video tensor
+    copy from https://github.com/guoyww/AnimateDiff/blob/e92bd5671ba62c0d774a32951453e328018b7c5b/animatediff/utils/util.py#L61
+
+    Args:
+        videos (torch.Tensor): video tensor predicted by the model, laid out as
+            (batch, channels, time, height, width)
+        path (str): path to save video
+        rescale (bool, optional): rescale the video tensor from [-1, 1] to [0, 1]. Defaults to False.
+        n_rows (int, optional): forwarded to ``make_grid`` as ``nrow``. Defaults to 1.
+        fps (int, optional): video save fps. Defaults to 24.
+    """
+    videos = rearrange(videos, "b c t h w -> t b c h w")
+    outputs = []
+    for x in videos:
+        x = torchvision.utils.make_grid(x, nrow=n_rows)
+        # (c, h, w) -> (h, w, c)
+        x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)
+        if rescale:
+            x = (x + 1.0) / 2.0  # -1,1 -> 0,1
+        x = torch.clamp(x, 0, 1)
+        # detach()/cpu() are no-ops for plain CPU tensors and let tensors that
+        # live on another device or track gradients be converted too.
+        x = (x * 255).detach().cpu().numpy().astype(np.uint8)
+        outputs.append(x)
+
+    # A bare file name has an empty dirname, and os.makedirs("") raises.
+    parent_dir = os.path.dirname(path)
+    if parent_dir:
+        os.makedirs(parent_dir, exist_ok=True)
+    imageio.mimsave(path, outputs, fps=fps)
diff --git a/hyvideo/utils/helpers.py b/hyvideo/utils/helpers.py
new file mode 100644
index 0000000000000000000000000000000000000000..72ab8cb1feba4ce7782f1ea841fd42c71be7b0d1
--- /dev/null
+++ b/hyvideo/utils/helpers.py
@@ -0,0 +1,40 @@
+import collections.abc
+
+from itertools import repeat
+
+
+def _ntuple(n):
+    """Return a converter that turns its argument into a tuple.
+
+    Scalars and strings are repeated ``n`` times. A non-string iterable with
+    one element has that element repeated ``n`` times; any other iterable is
+    returned as a tuple unchanged, whatever its length.
+    """
+
+    def parse(x):
+        is_collection = isinstance(x, collections.abc.Iterable) and not isinstance(
+            x, str
+        )
+        if not is_collection:
+            return tuple(repeat(x, n))
+        items = tuple(x)
+        if len(items) == 1:
+            return tuple(repeat(items[0], n))
+        return items
+
+    return parse
+
+
+to_1tuple = _ntuple(1)
+to_2tuple = _ntuple(2)
+to_3tuple = _ntuple(3)
+to_4tuple = _ntuple(4)
+
+
+def as_tuple(x):
+    """Convert ``x`` to a tuple.
+
+    Non-string iterables are converted element-wise; None, ints, floats and
+    strings are wrapped in a one-element tuple.
+
+    Raises:
+        ValueError: For any other type.
+    """
+    is_collection = isinstance(x, collections.abc.Iterable) and not isinstance(x, str)
+    if is_collection:
+        return tuple(x)
+    if not (x is None or isinstance(x, (int, float, str))):
+        raise ValueError(f"Unknown type {type(x)}")
+    return (x,)
+
+
+def as_list_of_2tuple(x):
+    """Group the values of ``x`` into consecutive pairs.
+
+    ``x`` is first normalized with ``as_tuple``. A single value is paired with
+    itself; otherwise the number of values must be even.
+
+    Returns:
+        list: ``[(x0, x1), (x2, x3), ...]``
+    """
+    values = as_tuple(x)
+    if len(values) == 1:
+        values = (values[0], values[0])
+    assert len(values) % 2 == 0, f"Expect even length, got {len(values)}."
+    return [(values[i], values[i + 1]) for i in range(0, len(values), 2)]
diff --git a/hyvideo/utils/preprocess_text_encoder_tokenizer_utils.py b/hyvideo/utils/preprocess_text_encoder_tokenizer_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..2908eb29fe1cc9741b4000ace4cc01e91fc9037c
--- /dev/null
+++ b/hyvideo/utils/preprocess_text_encoder_tokenizer_utils.py
@@ -0,0 +1,46 @@
+import argparse
+import torch
+from transformers import (
+ AutoProcessor,
+ LlavaForConditionalGeneration,
+)
+
+
+def preprocess_text_encoder_tokenizer(args):
+    """Extract the language model and tokenizer from a LLaVA checkpoint.
+
+    Loads the processor and the float16 model from ``args.input_dir`` and saves
+    the model's ``language_model`` and the processor's ``tokenizer`` into
+    ``args.output_dir``.
+    """
+    source_dir = args.input_dir
+    processor = AutoProcessor.from_pretrained(source_dir)
+    # NOTE(review): the model is moved to device 0 before saving, so this needs
+    # an accelerator -- confirm whether that move is actually required.
+    model = LlavaForConditionalGeneration.from_pretrained(
+        source_dir,
+        torch_dtype=torch.float16,
+        low_cpu_mem_usage=True,
+    ).to(0)
+
+    model.language_model.save_pretrained(f"{args.output_dir}")
+    processor.tokenizer.save_pretrained(f"{args.output_dir}")
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the paths, default the output directory
+    # to the parent of the input directory, then run the extraction.
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--input_dir",
+        type=str,
+        required=True,
+        help="The path to the llava-llama-3-8b-v1_1-transformers.",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="",
+        help="The output path of the llava-llama-3-8b-text-encoder-tokenizer."
+        "if '', the parent dir of output will be the same as input dir.",
+    )
+    args = parser.parse_args()
+
+    if not args.output_dir:
+        # Drop the last "/"-separated component of the input path.
+        path_parts = args.input_dir.split("/")
+        args.output_dir = "/".join(path_parts[:-1])
+
+    preprocess_text_encoder_tokenizer(args)
diff --git a/hyvideo/vae/__init__.py b/hyvideo/vae/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c0032dccca096b13ae396be7964d01188fb6e8
--- /dev/null
+++ b/hyvideo/vae/__init__.py
@@ -0,0 +1,76 @@
+from pathlib import Path
+
+import torch
+
+from .autoencoder_kl_causal_3d import AutoencoderKLCausal3D
+from ..constants import VAE_PATH, PRECISION_TO_TYPE
+
+def load_vae(
+    vae_type: str = "884-16c-hy",
+    vae_precision: str = None,
+    sample_size: tuple = None,
+    vae_path: str = None,
+    vae_config_path: str = None,
+    logger=None,
+    device=None,
+):
+    """Build the causal 3D VAE from its config and load the weights.
+
+    Args:
+        vae_type (str): the type of the 3D VAE model. Defaults to "884-16c-hy".
+        vae_precision (str, optional): the precision to load vae. Defaults to None.
+        sample_size (tuple, optional): the tiling size. Defaults to None.
+        vae_path (str, optional): the path to vae. When None it is looked up in
+            ``VAE_PATH`` by ``vae_type``.
+        vae_config_path (str, optional): config file passed to
+            ``AutoencoderKLCausal3D.load_config``.
+        logger (optional): logger used for progress messages. Defaults to None.
+        device (optional): device to load vae. Defaults to None.
+
+    Returns:
+        tuple: ``(vae, vae_path, spatial_compression_ratio, time_compression_ratio)``
+    """
+    if vae_path is None:
+        vae_path = VAE_PATH[vae_type]
+
+    verbose = logger is not None
+    if verbose:
+        logger.info(f"Loading 3D VAE model ({vae_type}) from: {vae_path}")
+
+    config = AutoencoderKLCausal3D.load_config(vae_config_path)
+    # sample_size is only forwarded when the caller supplied a truthy value.
+    config_overrides = {"sample_size": sample_size} if sample_size else {}
+    vae = AutoencoderKLCausal3D.from_config(config, **config_overrides)
+
+    vae_ckpt = Path(vae_path)
+    assert vae_ckpt.exists(), f"VAE checkpoint not found: {vae_ckpt}"
+
+    from mmgp import offload
+
+    offload.load_model_data(vae, vae_path)
+
+    spatial_ratio = vae.config.spatial_compression_ratio
+    temporal_ratio = vae.config.time_compression_ratio
+
+    if vae_precision is not None:
+        vae = vae.to(dtype=PRECISION_TO_TYPE[vae_precision])
+
+    vae.requires_grad_(False)
+
+    if verbose:
+        logger.info(f"VAE to dtype: {vae.dtype}")
+
+    if device is not None:
+        vae = vae.to(device)
+
+    vae.eval()
+
+    return vae, vae_path, spatial_ratio, temporal_ratio
diff --git a/hyvideo/vae/autoencoder_kl_causal_3d.py b/hyvideo/vae/autoencoder_kl_causal_3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ea42fe86addc272c307e1c387cf48c2bffa68f3
--- /dev/null
+++ b/hyvideo/vae/autoencoder_kl_causal_3d.py
@@ -0,0 +1,927 @@
+import os
+import math
+from typing import Dict, Optional, Tuple, Union
+from dataclasses import dataclass
+from torch import distributed as dist
+import loguru
+import torch
+import torch.nn as nn
+import torch.distributed
+
+RECOMMENDED_DTYPE = torch.float16
+
+def mpi_comm():
+    import mpi4py.MPI as _mpi  # imported lazily: mpi4py is only needed when this is called
+    return _mpi.COMM_WORLD
+
+from torch import distributed as dist
+def mpi_rank():
+    return dist.get_rank()  # despite the name, this is the torch.distributed rank; no MPI call is made
+
+def mpi_world_size():
+    return dist.get_world_size()  # torch.distributed world size; no MPI call is made
+
+
+class TorchIGather:
+    """Asynchronously gather tensors onto rank 0 through `torch.distributed`.
+
+    One process group is created per rank prefix `(0, ..., k)` so that a
+    gather can involve only the first `n_rank` ranks.
+    """
+
+    def __init__(self):
+        if not torch.distributed.is_initialized():
+            # The torch process group does not exist yet, so rank and world
+            # size must come from MPI (`dist.get_rank()` would raise here).
+            comm = mpi_comm()
+            rank, world_size = comm.Get_rank(), comm.Get_size()
+            os.environ['RANK'] = str(rank)
+            os.environ['WORLD_SIZE'] = str(world_size)
+            os.environ['MASTER_ADDR'] = '127.0.0.1'
+            os.environ['MASTER_PORT'] = str(29500)
+            torch.cuda.set_device(rank)
+            torch.distributed.init_process_group('nccl')
+
+        self.handles = []
+        self.buffers = []
+        self.world_size = dist.get_world_size()
+        self.rank = dist.get_rank()
+        # groups_ids[k] == (0, ..., k); self.group maps the last rank k to its process group.
+        self.groups_ids = [tuple(range(k + 1)) for k in range(self.world_size)]
+        self.group = {ids[-1]: dist.new_group(ids) for ids in self.groups_ids}
+
+    def gather(self, tensor, n_rank=None):
+        """Start an async gather of `tensor` from the first `n_rank` ranks (default: all ranks)."""
+        if n_rank is None:  # default process group: every rank takes part
+            group, n_rank = None, self.world_size
+        else:
+            group = self.group[n_rank - 1]
+        tensor = tensor.to(RECOMMENDED_DTYPE)
+        # Only rank 0 allocates receive buffers; the other ranks pass None.
+        buffer = [torch.empty_like(tensor) for _ in range(n_rank)] if self.rank == 0 else None
+        self.buffers.append(buffer)
+        self.handles.append(torch.distributed.gather(tensor, buffer, async_op=True, group=group))
+
+    def wait(self):
+        """Block until every pending gather has completed."""
+        for handle in self.handles:
+            if handle is not None:  # no handle is produced on ranks outside the group
+                handle.wait()
+
+    def clear(self):
+        """Forget all recorded buffers and handles."""
+        self.buffers = []
+        self.handles = []
+
+
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+try:
+ # This diffusers is modified and packed in the mirror.
+ from diffusers.loaders import FromOriginalVAEMixin
+except ImportError:
+ # Use this to be compatible with the original diffusers.
+ from diffusers.loaders.single_file_model import FromOriginalModelMixin as FromOriginalVAEMixin
+from diffusers.utils.accelerate_utils import apply_forward_hook
+from diffusers.models.attention_processor import (
+ ADDED_KV_ATTENTION_PROCESSORS,
+ CROSS_ATTENTION_PROCESSORS,
+ Attention,
+ AttentionProcessor,
+ AttnAddedKVProcessor,
+ AttnProcessor,
+)
+from diffusers.models.modeling_outputs import AutoencoderKLOutput
+from diffusers.models.modeling_utils import ModelMixin
+from .vae import DecoderCausal3D, BaseOutput, DecoderOutput, DiagonalGaussianDistribution, EncoderCausal3D
+
+# """
+# use trt need install polygraphy and onnx-graphsurgeon
+# python3 -m pip install --upgrade polygraphy>=0.47.0 onnx-graphsurgeon --extra-index-url https://pypi.ngc.nvidia.com
+# """
+# try:
+# from polygraphy.backend.trt import ( TrtRunner, EngineFromBytes)
+# from polygraphy.backend.common import BytesFromPath
+# except:
+# print("TrtRunner or EngineFromBytes is not available, you can not use trt engine")
+
+@dataclass
+class DecoderOutput2(BaseOutput):  # value returned by `AutoencoderKLCausal3D.forward` when `return_dict` is True
+    sample: torch.FloatTensor  # decoded output
+    posterior: Optional[DiagonalGaussianDistribution] = None  # filled in only when `return_posterior` is True
+
+
+MODEL_OUTPUT_PATH = os.environ.get('MODEL_OUTPUT_PATH')
+MODEL_BASE = os.environ.get('MODEL_BASE')
+
+
+class AutoencoderKLCausal3D(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
+ r"""
+ A VAE model with KL loss for encoding images into latents and decoding latent representations into images.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
+ for all models (such as downloading or saving).
+
+ Parameters:
+ in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
+ out_channels (int, *optional*, defaults to 3): Number of channels in the output.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlockCausal3D",)`):
+            Tuple of downsample block types.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlockCausal3D",)`):
+            Tuple of upsample block types.
+ block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
+ Tuple of block output channels.
+ act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+ latent_channels (`int`, *optional*, defaults to 4): Number of channels in the latent space.
+ sample_size (`int`, *optional*, defaults to `32`): Sample input size.
+ scaling_factor (`float`, *optional*, defaults to 0.18215):
+ The component-wise standard deviation of the trained latent space computed using the first batch of the
+ training set. This is used to scale the latent space to have unit variance when training the diffusion
+ model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
+ diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
+ / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
+ Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
+ force_upcast (`bool`, *optional*, default to `True`):
+ If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE
+            can be fine-tuned / trained to a lower range without losing too much precision in which case
+ `force_upcast` can be set to `False` - see: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix
+ """
+
+    def get_VAE_tile_size(self, vae_config, device_mem_capacity, mixed_precision):
+        """Return the tiling parameters for a VAE profile.
+
+        Args:
+            vae_config: 0 to pick a profile from `device_mem_capacity`; otherwise the profile
+                itself (1, 2, or any other value for the smallest tiles).
+            device_mem_capacity: memory budget, compared against 24000 and 12000.
+            mixed_precision: when truthy, the budget is divided by 1.5 first.
+
+        Returns:
+            dict with the `tile_sample_*` / `tile_latent_*` sizes and `tile_overlap_factor`.
+        """
+        budget = device_mem_capacity / 1.5 if mixed_precision else device_mem_capacity
+        profile = vae_config
+        if vae_config == 0:
+            # automatic choice: the largest tiles the budget allows
+            profile = 1 if budget >= 24000 else 2 if budget >= 12000 else 3
+
+        # sample-space tile sizes: temporal extent, then spatial extent
+        tile_t = 32 if profile == 1 else 16
+        tile_hw = 256 if profile == 1 or profile == 2 else 192
+
+        # latent-space sizes are derived from the sample-space ones
+        spatial_factor = 2 ** (len(self.config.block_out_channels) - 1)
+        return {
+            "tile_sample_min_tsize": tile_t,
+            "tile_latent_min_tsize": tile_t // self.time_compression_ratio,
+            "tile_sample_min_size": tile_hw,
+            "tile_latent_min_size": int(tile_hw / spatial_factor),
+            "tile_overlap_factor": 0.25,
+        }
+ _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        down_block_types: Tuple[str] = ("DownEncoderBlockCausal3D",),
+        up_block_types: Tuple[str] = ("UpDecoderBlockCausal3D",),
+        block_out_channels: Tuple[int] = (64,),
+        layers_per_block: int = 1,
+        act_fn: str = "silu",
+        latent_channels: int = 4,
+        norm_num_groups: int = 32,
+        sample_size: int = 32,
+        sample_tsize: int = 64,
+        scaling_factor: float = 0.18215,
+        force_upcast: bool = True,
+        spatial_compression_ratio: int = 8,
+        time_compression_ratio: int = 4,
+        disable_causal_conv: bool = False,
+        mid_block_add_attention: bool = True,
+        mid_block_causal_attn: bool = False,
+        use_trt_engine: bool = False,
+        nccl_gather: bool = True,
+        engine_path: str = f"{MODEL_BASE}/HYVAE_decoder+conv_256x256xT_fp16_H20.engine",
+    ):
+        """Build the causal 3D encoder / decoder and the default tiling settings.
+
+        `use_trt_engine` is accepted but overridden to False below, so the TensorRT and
+        multi-rank (`parallel_decode`) paths are disabled for every instance.
+        """
+        super().__init__()
+
+        self.disable_causal_conv = disable_causal_conv
+        self.time_compression_ratio = time_compression_ratio
+
+        self.encoder = EncoderCausal3D(
+            in_channels=in_channels,
+            out_channels=latent_channels,
+            down_block_types=down_block_types,
+            block_out_channels=block_out_channels,
+            layers_per_block=layers_per_block,
+            act_fn=act_fn,
+            norm_num_groups=norm_num_groups,
+            double_z=True,
+            time_compression_ratio=time_compression_ratio,
+            spatial_compression_ratio=spatial_compression_ratio,
+            disable_causal=disable_causal_conv,
+            mid_block_add_attention=mid_block_add_attention,
+            mid_block_causal_attn=mid_block_causal_attn,
+        )
+
+        self.decoder = DecoderCausal3D(
+            in_channels=latent_channels,
+            out_channels=out_channels,
+            up_block_types=up_block_types,
+            block_out_channels=block_out_channels,
+            layers_per_block=layers_per_block,
+            norm_num_groups=norm_num_groups,
+            act_fn=act_fn,
+            time_compression_ratio=time_compression_ratio,
+            spatial_compression_ratio=spatial_compression_ratio,
+            disable_causal=disable_causal_conv,
+            mid_block_add_attention=mid_block_add_attention,
+            mid_block_causal_attn=mid_block_causal_attn,
+        )
+
+        self.quant_conv = nn.Conv3d(2 * latent_channels, 2 * latent_channels, kernel_size=1)
+        self.post_quant_conv = nn.Conv3d(latent_channels, latent_channels, kernel_size=1)
+
+        self.use_slicing = False
+        self.use_spatial_tiling = False
+        self.use_temporal_tiling = False
+
+        # only relevant if vae tiling is enabled
+        self.tile_sample_min_tsize = sample_tsize
+        self.tile_latent_min_tsize = sample_tsize // time_compression_ratio
+        # `sample_size` may be a list/tuple in the config; keep the tile size a scalar so
+        # that the `shape > tile_sample_min_size` comparisons in encode/decode stay valid.
+        sample_size = self.config.sample_size
+        if isinstance(sample_size, (list, tuple)):
+            sample_size = sample_size[0]
+        self.tile_sample_min_size = sample_size
+        self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1)))
+        self.tile_overlap_factor = 0.25
+
+        # ============= parallelism related code ===================
+        use_trt_engine = False  # forced off regardless of the argument passed in
+        self.parallel_decode = use_trt_engine
+        self.nccl_gather = nccl_gather
+        # only relevant if parallel_decode is enabled
+        self.gather_to_rank0 = self.parallel_decode
+        self.engine_path = engine_path
+        self.use_trt_decoder = use_trt_engine
+
+    @property
+    def igather(self):
+        """Lazily created `TorchIGather`; the same instance is returned on every later access."""
+        assert self.nccl_gather and self.gather_to_rank0
+        if not hasattr(self, '_igather'):
+            # first access: build the gatherer and keep it on the instance
+            self._igather = TorchIGather()
+        return self._igather
+
+    @property
+    def use_padding(self):
+        """Whether undersized tiles have to be padded to the full tile size before decoding."""
+        if self.use_trt_decoder:
+            return self.use_trt_decoder
+        # dist.gather requires every process to contribute tiles of the same shape.
+        return self.nccl_gather and self.gather_to_rank0
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, (EncoderCausal3D, DecoderCausal3D)):  # other module types are left untouched
+            module.gradient_checkpointing = value
+
+    def enable_temporal_tiling(self, use_tiling: bool = True):
+        self.use_temporal_tiling = use_tiling  # flag checked by `encode` and `_decode`
+
+    def disable_temporal_tiling(self):
+        self.enable_temporal_tiling(False)  # same setter, with the flag cleared
+
+    def enable_spatial_tiling(self, use_tiling: bool = True):
+        self.use_spatial_tiling = use_tiling  # flag checked by `encode`, `_decode` and the temporal tiled paths
+
+    def disable_spatial_tiling(self):
+        self.enable_spatial_tiling(False)  # same setter, with the flag cleared
+
+    def enable_tiling(self, use_tiling: bool = True):
+        r"""
+        Turn spatial and temporal tiling on together (or off, when `use_tiling` is False). With tiling
+        enabled, inputs larger than the configured tile sizes are encoded / decoded tile by tile and the
+        tiles are blended, which saves memory and allows larger inputs to be processed.
+        """
+        self.enable_spatial_tiling(use_tiling)
+        self.enable_temporal_tiling(use_tiling)
+
+    def disable_tiling(self):
+        r"""
+        Turn both spatial and temporal tiling off. If tiling was previously enabled, encoding and
+        decoding go back to processing the whole input in one step.
+        """
+        self.disable_spatial_tiling()
+        self.disable_temporal_tiling()
+
+    def enable_slicing(self):
+        r"""
+        Enable sliced processing. With this option, `encode` and `decode` handle a batch of more than one
+        sample one sample at a time, which saves some memory and allows larger batch sizes.
+        """
+        self.use_slicing = True
+
+    def disable_slicing(self):
+        r"""
+        Disable sliced processing. If `enable_slicing` was previously called, `encode` and `decode` go
+        back to handling the whole batch in one step.
+        """
+        self.use_slicing = False
+
+
+    def load_trt_decoder(self):
+        self.use_trt_decoder = True
+        self.engine = EngineFromBytes(BytesFromPath(self.engine_path))
+        # NOTE(review): EngineFromBytes / BytesFromPath / TrtRunner are only imported in commented-out code in this file.
+        self.trt_decoder_runner = TrtRunner(self.engine)
+        self.activate_trt_decoder()
+
+    def disable_trt_decoder(self):
+        self.use_trt_decoder = False
+        del self.engine  # drops only the engine; `trt_decoder_runner` is left as it is
+
+    def activate_trt_decoder(self):
+        self.trt_decoder_runner.activate()  # the runner is created by `load_trt_decoder`
+
+    def deactivate_trt_decoder(self):
+        self.trt_decoder_runner.deactivate()  # the runner is created by `load_trt_decoder`
+
+    @property
+    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Collect the attention processor of every sub-module that exposes `get_processor`.
+
+        Returns:
+            `dict` of attention processors, keyed by `"<dotted module path>.processor"`, in
+            depth-first registration order.
+        """
+        found = {}
+
+        # Explicit stack instead of recursion; children are pushed in reverse so that they
+        # are popped (visited) in registration order, parents before their children.
+        pending = list(self.named_children())[::-1]
+        while pending:
+            path, module = pending.pop()
+            if hasattr(module, "get_processor"):
+                found[f"{path}.processor"] = module.get_processor(return_deprecated_lora=True)
+            pending.extend(
+                (f"{path}.{child_name}", child) for child_name, child in list(module.named_children())[::-1]
+            )
+
+        return found
+
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(
+        self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
+    ):
+        r"""
+        Install the attention processor(s) used by the attention layers of the model.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                Either one processor instance that is handed to **every** layer exposing
+                `set_processor`, or a dict keyed by `"<dotted module path>.processor"` with
+                exactly one entry per attention layer. Entries are popped from the dict as
+                they are assigned, so the dict passed in is consumed.
+            _remove_lora (`bool`, defaults to `False`):
+                Forwarded unchanged to each layer's `set_processor`.
+
+        Raises:
+            ValueError: if a dict is given whose size differs from the number of layers.
+        """
+        per_layer = isinstance(processor, dict)
+        expected = len(self.attn_processors.keys())
+        if per_layer and len(processor) != expected:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {expected}. Please make sure to pass {expected} processor classes."
+            )
+
+        pending = list(self.named_children())[::-1]
+        while pending:
+            path, module = pending.pop()
+            if hasattr(module, "set_processor"):
+                chosen = processor.pop(f"{path}.processor") if per_layer else processor
+                module.set_processor(chosen, _remove_lora=_remove_lora)
+            pending.extend(
+                (f"{path}.{child_name}", child) for child_name, child in list(module.named_children())[::-1]
+            )
+
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
+    def set_default_attn_processor(self):
+        """
+        Replaces custom attention processors with the matching default attention implementation.
+        """
+        in_use = [proc.__class__ for proc in self.attn_processors.values()]
+        if all(cls in ADDED_KV_ATTENTION_PROCESSORS for cls in in_use):
+            default = AttnAddedKVProcessor()
+        elif all(cls in CROSS_ATTENTION_PROCESSORS for cls in in_use):
+            default = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+        self.set_attn_processor(default, _remove_lora=True)
+
+    @apply_forward_hook
+    def encode(
+        self, x: torch.FloatTensor, return_dict: bool = True
+    ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
+        """
+        Encode a batch into a latent posterior distribution.
+
+        Inputs longer than the temporal tile size, or larger than the spatial tile size, are
+        handed to the tiled encoders when the corresponding tiling flag is enabled.
+
+        Args:
+            x (`torch.FloatTensor`): 5-dimensional input batch.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
+
+        Returns:
+            An [`~models.autoencoder_kl.AutoencoderKLOutput`] holding the posterior if `return_dict` is
+            True, otherwise the tuple `(posterior,)`.
+        """
+        assert len(x.shape) == 5, "The input tensor should have 5 dimensions"
+
+        if self.use_temporal_tiling and x.shape[2] > self.tile_sample_min_tsize:
+            return self.temporal_tiled_encode(x, return_dict=return_dict)
+
+        if self.use_spatial_tiling and (
+            x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size
+        ):
+            return self.spatial_tiled_encode(x, return_dict=return_dict)
+
+        if self.use_slicing and x.shape[0] > 1:
+            # one sample per encoder call, results joined back along the batch dim
+            h = torch.cat([self.encoder(sample) for sample in x.split(1)])
+        else:
+            h = self.encoder(x)
+
+        posterior = DiagonalGaussianDistribution(self.quant_conv(h))
+        return AutoencoderKLOutput(latent_dist=posterior) if return_dict else (posterior,)
+
+    def _decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
+        """Decode one latent batch, handing over to the tiled decoders when tiling applies."""
+        assert len(z.shape) == 5, "The input tensor should have 5 dimensions"
+
+        if self.use_temporal_tiling and z.shape[2] > self.tile_latent_min_tsize:
+            return self.temporal_tiled_decode(z, return_dict=return_dict)
+        if self.use_spatial_tiling and (
+            z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size
+        ):
+            return self.spatial_tiled_decode(z, return_dict=return_dict)
+
+        if not self.use_trt_decoder:
+            dec = self.decoder(self.post_quant_conv(z))
+        else:
+            # For unknown reason, `copy_outputs_to_host` must be set to True
+            trt_input = {"input": z.to(RECOMMENDED_DTYPE).contiguous()}
+            trt_output = self.trt_decoder_runner.infer(trt_input, copy_outputs_to_host=True)
+            dec = trt_output["output"].to(device=z.device, dtype=z.dtype)
+
+        return DecoderOutput(sample=dec) if return_dict else (dec,)
+
+    @apply_forward_hook
+    def decode(
+        self, z: torch.FloatTensor, return_dict: bool = True, generator=None
+    ) -> Union[DecoderOutput, torch.FloatTensor]:
+        """
+        Decode a batch of latents.
+
+        When `parallel_decode` is set and `z` is not in `RECOMMENDED_DTYPE`, a warning is logged and
+        `z` is cast to that dtype. With slicing enabled, a batch of more than one latent is decoded
+        one latent at a time.
+
+        Args:
+            z (`torch.FloatTensor`): Input batch of latent vectors.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
+            generator: Accepted but not used by this method.
+
+        Returns:
+            [`~models.vae.DecoderOutput`] or `tuple`:
+                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple`.
+        """
+        if self.parallel_decode and z.dtype != RECOMMENDED_DTYPE:
+            loguru.logger.warning(
+                f'For better performance, using {RECOMMENDED_DTYPE} for both latent features and model parameters is recommended.'
+                f'Current latent dtype {z.dtype}. '
+                f'Please note that the input latent will be cast to {RECOMMENDED_DTYPE} internally when decoding.'
+            )
+            z = z.to(RECOMMENDED_DTYPE)
+
+        if self.use_slicing and z.shape[0] > 1:
+            # one latent per `_decode` call, results joined back along the batch dim
+            decoded = torch.cat([self._decode(latent).sample for latent in z.split(1)])
+        else:
+            decoded = self._decode(z).sample
+
+        if return_dict:
+            return DecoderOutput(sample=decoded)
+        return (decoded,)
+
+    def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        """Cross-fade the bottom rows of `a` into the top rows of `b`; `b` is modified and returned.
+
+        At most `blend_extent` rows (dim -2) are blended, with the weight of `b` rising
+        linearly from 0 towards 1. Inputs are expected to be 5-dimensional.
+        """
+        n = min(a.shape[-2], b.shape[-2], blend_extent)
+        if n == 0:
+            return b
+
+        ramp = (torch.arange(n, device=a.device, dtype=a.dtype) / n).view(1, 1, 1, n, 1)
+        tail_of_a = a[..., -n:, :]
+        head_of_b = b[..., :n, :]
+        b[..., :n, :] = tail_of_a * (1 - ramp) + head_of_b * ramp
+        return b
+
+    def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        """Cross-fade the right edge of `a` into the left edge of `b`; `b` is modified and returned.
+
+        At most `blend_extent` columns (last dim) are blended, with the weight of `b` rising
+        linearly from 0 towards 1. Inputs are expected to be 5-dimensional.
+        """
+        width = min(a.shape[-1], b.shape[-1], blend_extent)
+        if width == 0:
+            return b
+
+        fade_in = torch.arange(width, device=a.device, dtype=a.dtype) / width
+        fade_in = fade_in.view(1, 1, 1, 1, width)
+        mixed = a[..., -width:] * (1 - fade_in) + b[..., :width] * fade_in
+        b[..., :width] = mixed
+        return b
+    def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        """Cross-fade the last frames of `a` into the first frames of `b`; `b` is modified and returned.
+
+        Blending runs along dim -3 over at most `blend_extent` frames, with the weight of `b`
+        rising linearly from 0 towards 1. Inputs are expected to be 5-dimensional.
+        """
+        frames = min(a.shape[-3], b.shape[-3], blend_extent)
+        if frames == 0:
+            return b
+
+        w_b = (torch.arange(frames, device=a.device, dtype=a.dtype) / frames).view(1, 1, frames, 1, 1)
+        w_a = 1 - w_b
+
+        b[..., :frames, :, :] = a[..., -frames:, :, :] * w_a + b[..., :frames, :, :] * w_b
+        return b
+
+    def spatial_tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True, return_moments: bool = False) -> AutoencoderKLOutput:
+        r"""Encode a batch of images using overlapping spatial tiles.
+
+        The input is cut into tiles along its last two dims; every tile goes through the same
+        encoder and `quant_conv`. Neighbouring tiles overlap and are cross-faded in latent space
+        to hide the seams, so the result can differ slightly from non-tiled encoding while
+        the size of each encoder call stays bounded by the tile size.
+
+        Args:
+            x (`torch.FloatTensor`): Input batch of images.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
+            return_moments (`bool`, *optional*, defaults to `False`):
+                When True, return the blended moments tensor itself (takes precedence over `return_dict`).
+
+        Returns:
+            The moments tensor if `return_moments` is True; otherwise an
+            [`~models.autoencoder_kl.AutoencoderKLOutput`] when `return_dict` is True, else `(posterior,)`.
+        """
+        tile_px = self.tile_sample_min_size
+        stride = int(tile_px * (1 - self.tile_overlap_factor))
+        blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)
+        keep = self.tile_latent_min_size - blend_extent
+
+        # Encode every tile independently; grid[r][c] holds the moments of the tile at row r, column c.
+        grid = [
+            [
+                self.quant_conv(self.encoder(x[:, :, :, top : top + tile_px, left : left + tile_px]))
+                for left in range(0, x.shape[-1], stride)
+            ]
+            for top in range(0, x.shape[-2], stride)
+        ]
+
+        blended_rows = []
+        for r, tiles in enumerate(grid):
+            cropped = []
+            for c, tile in enumerate(tiles):
+                # Fade in the tile above and the tile to the left (blending is in place),
+                # then keep only the part that the next tiles do not cover.
+                if r > 0:
+                    tile = self.blend_v(grid[r - 1][c], tile, blend_extent)
+                if c > 0:
+                    tile = self.blend_h(tiles[c - 1], tile, blend_extent)
+                cropped.append(tile[:, :, :, :keep, :keep])
+            blended_rows.append(torch.cat(cropped, dim=-1))
+
+        moments = torch.cat(blended_rows, dim=-2)
+        if return_moments:
+            return moments
+
+        posterior = DiagonalGaussianDistribution(moments)
+        if return_dict:
+            return AutoencoderKLOutput(latent_dist=posterior)
+
+        return (posterior,)
+
+
+    def spatial_tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
+        r"""
+        Decode a batch of latents tile by tile and blend the decoded tiles into one output.
+
+        Args:
+            z (`torch.FloatTensor`): Input batch of latent vectors.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.vae.DecoderOutput`] or `tuple`:
+                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple`. In the
+                parallel path with `gather_to_rank0`, ranks other than 0 return `DecoderOutput(sample=None)`.
+        """
+        overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor))
+        blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor)
+        row_limit = self.tile_sample_min_size - blend_extent
+
+        # Split z into overlapping tiles and decode them separately.
+        # The tiles have an overlap to avoid seams between tiles.
+        if self.parallel_decode:
+            # NOTE(review): `__init__` forces `parallel_decode` to False, so this branch is currently never taken.
+            rank = mpi_rank()
+            torch.cuda.set_device(rank) # set device for trt_runner
+            world_size = mpi_world_size()
+
+            tiles = []
+            afters_if_padding = []
+            for i in range(0, z.shape[-2], overlap_size):
+                for j in range(0, z.shape[-1], overlap_size):
+                    tile = z[:, :, :, i : i + self.tile_latent_min_size, j : j + self.tile_latent_min_size]
+
+                    if self.use_padding and (tile.shape[-2] < self.tile_latent_min_size or tile.shape[-1] < self.tile_latent_min_size):
+                        from torch.nn import functional as F
+                        # NOTE(review): the factor 8 is hard-coded; presumably the spatial compression ratio - confirm.
+                        after_h = tile.shape[-2] * 8
+                        after_w = tile.shape[-1] * 8
+                        padding = (0, self.tile_latent_min_size - tile.shape[-1], 0, self.tile_latent_min_size - tile.shape[-2], 0, 0)
+                        tile = F.pad(tile, padding, "replicate").to(device=tile.device, dtype=tile.dtype)
+                        afters_if_padding.append((after_h, after_w))
+                    else:
+                        afters_if_padding.append(None)
+
+                    tiles.append(tile)
+
+            # balance tasks: every rank takes a contiguous run of `ratio` tiles (the last rank takes the rest)
+            ratio = math.ceil(len(tiles) / world_size)
+            tiles_curr_rank = tiles[rank * ratio: None if rank == world_size - 1 else (rank + 1) * ratio]
+
+            decoded_results = []
+
+            # n_task[r]: number of tiles in the r-th run of `ratio` tiles, zero-padded to 8 entries
+            total = len(tiles)
+            n_task = ([ratio] * (total // ratio) + ([total % ratio] if total % ratio else []))
+            n_task = n_task + [0] * (8 - len(n_task))
+
+            def find(n):
+                # index of the first entry of `n_task` that is smaller than `n`
+                return next((i for i, task_n in enumerate(n_task) if task_n < n), len(n_task))
+
+            for i, tile in enumerate(tiles_curr_rank):
+                if self.use_trt_decoder:
+                    # For unknown reason, `copy_outputs_to_host` must be set to True
+                    decoded = self.trt_decoder_runner.infer(
+                        {"input": tile.to(RECOMMENDED_DTYPE).contiguous()},
+                        copy_outputs_to_host=True
+                    )["output"].to(device=z.device, dtype=z.dtype)
+                else:
+                    # Bind `decoded` on this path too: the gather below reads it for every tile.
+                    decoded = self.decoder(self.post_quant_conv(tile))
+                decoded_results.append(decoded)
+
+                if self.nccl_gather and self.gather_to_rank0:
+                    self.igather.gather(decoded, n_rank=find(i + 1))
+
+            if not self.nccl_gather:
+                if self.gather_to_rank0:
+                    decoded_results = mpi_comm().gather(decoded_results, root=0)
+                    if rank != 0:
+                        return DecoderOutput(sample=None)
+                else:
+                    decoded_results = mpi_comm().allgather(decoded_results)
+
+                decoded_results = sum(decoded_results, [])
+            else:
+                # [Kevin]:
+                # We expect all tiles obtained from the same rank have the same shape.
+                # Shapes among ranks can differ due to the imbalance of task assignment.
+                if self.gather_to_rank0:
+                    if rank == 0:
+                        self.igather.wait()
+                        gather_results = self.igather.buffers
+                        self.igather.clear()
+                else:
+                    raise NotImplementedError('The old `allgather` implementation is deprecated for nccl plan.')
+
+                if rank != 0 and self.gather_to_rank0:
+                    return DecoderOutput(sample=None)
+
+                # Flatten rank-major: for each rank index, its entry from every gather buffer that has one.
+                decoded_results = [col[i] for i in range(max([len(k) for k in gather_results])) for col in gather_results if i < len(col)]
+
+            # Crop the padding region in pixel level
+            if self.use_padding:
+                new_decoded_results = []
+                for after, dec in zip(afters_if_padding, decoded_results):
+                    if after is not None:
+                        after_h, after_w = after
+                        new_decoded_results.append(dec[:, :, :, :after_h, :after_w])
+                    else:
+                        new_decoded_results.append(dec)
+                decoded_results = new_decoded_results
+
+            rows = []
+            decoded_results_iter = iter(decoded_results)
+            for i in range(0, z.shape[-2], overlap_size):
+                row = []
+                for j in range(0, z.shape[-1], overlap_size):
+                    row.append(next(decoded_results_iter).to(rank))
+                rows.append(row)
+        else:
+            rows = []
+            for i in range(0, z.shape[-2], overlap_size):
+                row = []
+                for j in range(0, z.shape[-1], overlap_size):
+                    tile = z[:, :, :, i : i + self.tile_latent_min_size, j : j + self.tile_latent_min_size]
+                    tile = self.post_quant_conv(tile)
+                    decoded = self.decoder(tile)
+                    row.append(decoded)
+                rows.append(row)
+
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                # blend the above tile and the left tile
+                # to the current tile and add the current tile to the result row
+                if i > 0:
+                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
+                if j > 0:
+                    tile = self.blend_h(row[j - 1], tile, blend_extent)
+                result_row.append(tile[:, :, :, :row_limit, :row_limit])
+            result_rows.append(torch.cat(result_row, dim=-1))
+
+        dec = torch.cat(result_rows, dim=-2)
+        if not return_dict:
+            return (dec,)
+
+        return DecoderOutput(sample=dec)
+
+    def temporal_tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderKLOutput:
+        """Encode a long input as overlapping temporal chunks blended along the time axis.
+
+        Each chunk spans up to `tile_sample_min_tsize + 1` frames; chunks after the first drop
+        their first latent frame before blending. Spatial tiling is applied per chunk when
+        it is enabled and the chunk exceeds the spatial tile size.
+        """
+        assert not self.disable_causal_conv, "Temporal tiling is only compatible with causal convolutions."
+
+        _, _, num_frames, _, _ = x.shape
+        stride = int(self.tile_sample_min_tsize * (1 - self.tile_overlap_factor))
+        blend_extent = int(self.tile_latent_min_tsize * self.tile_overlap_factor)
+        t_limit = self.tile_latent_min_tsize - blend_extent
+
+        chunks = []
+        for start in range(0, num_frames, stride):
+            clip = x[:, :, start : start + self.tile_sample_min_tsize + 1, :, :]
+            if self.use_spatial_tiling and (
+                clip.shape[-1] > self.tile_sample_min_size or clip.shape[-2] > self.tile_sample_min_size
+            ):
+                moments = self.spatial_tiled_encode(clip, return_moments=True)
+            else:
+                moments = self.quant_conv(self.encoder(clip))
+            # every chunk after the first drops its leading latent frame
+            chunks.append(moments if start == 0 else moments[:, :, 1:, :, :])
+
+        pieces = []
+        for idx, chunk in enumerate(chunks):
+            if idx == 0:
+                pieces.append(chunk[:, :, : t_limit + 1, :, :])
+            else:
+                pieces.append(self.blend_t(chunks[idx - 1], chunk, blend_extent)[:, :, :t_limit, :, :])
+
+        posterior = DiagonalGaussianDistribution(torch.cat(pieces, dim=2))
+        return AutoencoderKLOutput(latent_dist=posterior) if return_dict else (posterior,)
+
+    def temporal_tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
+        """Decode long latents as overlapping temporal chunks blended along the time axis.
+
+        Each chunk spans up to `tile_latent_min_tsize + 1` latent frames; chunks after the
+        first drop their first decoded frame before blending. Spatial tiling is applied per
+        chunk when it is enabled and the chunk exceeds the spatial latent tile size.
+        """
+        _, _, num_latent_frames, _, _ = z.shape
+        stride = int(self.tile_latent_min_tsize * (1 - self.tile_overlap_factor))
+        blend_extent = int(self.tile_sample_min_tsize * self.tile_overlap_factor)
+        t_limit = self.tile_sample_min_tsize - blend_extent
+
+        chunks = []
+        for start in range(0, num_latent_frames, stride):
+            latents = z[:, :, start : start + self.tile_latent_min_tsize + 1, :, :]
+            if self.use_spatial_tiling and (
+                latents.shape[-1] > self.tile_latent_min_size or latents.shape[-2] > self.tile_latent_min_size
+            ):
+                frames = self.spatial_tiled_decode(latents, return_dict=True).sample
+            else:
+                frames = self.decoder(self.post_quant_conv(latents))
+            chunks.append(frames if start == 0 else frames[:, :, 1:, :, :])
+
+        pieces = []
+        for idx, chunk in enumerate(chunks):
+            if idx == 0:
+                pieces.append(chunk[:, :, : t_limit + 1, :, :])
+            else:
+                pieces.append(self.blend_t(chunks[idx - 1], chunk, blend_extent)[:, :, :t_limit, :, :])
+
+        dec = torch.cat(pieces, dim=2)
+        return DecoderOutput(sample=dec) if return_dict else (dec,)
+
+    def forward(
+        self,
+        sample: torch.FloatTensor,
+        sample_posterior: bool = False,
+        return_dict: bool = True,
+        return_posterior: bool = False,
+        generator: Optional[torch.Generator] = None,
+    ) -> Union[DecoderOutput2, torch.FloatTensor]:
+        r"""
+        Encode `sample`, take a latent from the posterior and decode it again.
+
+        Args:
+            sample (`torch.FloatTensor`): Input sample.
+            sample_posterior (`bool`, *optional*, defaults to `False`):
+                Draw the latent with `posterior.sample(generator=generator)`; otherwise use
+                `posterior.mode()`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`DecoderOutput2`] instead of a plain tuple.
+            return_posterior (`bool`, *optional*, defaults to `False`):
+                Also return the posterior distribution next to the reconstruction.
+            generator (`torch.Generator`, *optional*): Only used when `sample_posterior` is True.
+
+        Returns: a `DecoderOutput2`, or a tuple `(dec,)` / `(dec, posterior)` when `return_dict` is False.
+        """
+        posterior = self.encode(sample).latent_dist
+        z = posterior.sample(generator=generator) if sample_posterior else posterior.mode()
+        dec = self.decode(z).sample
+
+        if return_dict:
+            if return_posterior:
+                return DecoderOutput2(sample=dec, posterior=posterior)
+            return DecoderOutput2(sample=dec)
+        return (dec, posterior) if return_posterior else (dec,)
+
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
+        key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        The processors in use before fusing are remembered in `self.original_attn_processors`
+        so that `unfuse_qkv_projections` can restore them.
+
+        This API is 🧪 experimental.
+        Raises:
+            ValueError: if the class name of any attention processor contains "Added".
+        """
+        self.original_attn_processors = None
+
+        processor_names = (str(proc.__class__.__name__) for proc in self.attn_processors.values())
+        if any("Added" in name for name in processor_names):
+            raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+        self.original_attn_processors = self.attn_processors
+        for module in self.modules():
+            if isinstance(module, Attention):
+                module.fuse_projections(fuse=True)
+
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+    def unfuse_qkv_projections(self):
+        """Disables the fused QKV projection if enabled.
+
+        Restores the processors saved by `fuse_qkv_projections`. Calling this before any
+        call to `fuse_qkv_projections` is a no-op.
+
+        This API is 🧪 experimental.
+        """
+        # The attribute only exists once `fuse_qkv_projections` has run.
+        saved_processors = getattr(self, "original_attn_processors", None)
+        if saved_processors is not None:
+            self.set_attn_processor(saved_processors)
diff --git a/hyvideo/vae/unet_causal_3d_blocks.py b/hyvideo/vae/unet_causal_3d_blocks.py
new file mode 100644
index 0000000000000000000000000000000000000000..021c7cf21b2da91ded87c1710451b1cf21b47c46
--- /dev/null
+++ b/hyvideo/vae/unet_causal_3d_blocks.py
@@ -0,0 +1,884 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+from einops import rearrange
+
+from diffusers.utils import is_torch_version, logging
+from diffusers.models.activations import get_activation
+from diffusers.models.attention_processor import SpatialNorm
+from diffusers.models.attention_processor import Attention
+from diffusers.models.normalization import AdaGroupNorm
+from diffusers.models.normalization import RMSNorm
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+def prepare_causal_attention_mask(n_frame: int, n_hw: int, dtype, device, batch_size: Optional[int] = None):
+    """Build an additive, frame-wise causal attention mask for a flattened video token sequence.
+
+    Tokens are laid out frame after frame, `n_hw` tokens per frame. Entry `[i, j]` is `0` when
+    key token `j` belongs to the same frame as query token `i` or to an earlier frame, and `-inf`
+    otherwise.
+
+    Args:
+        n_frame: number of frames.
+        n_hw: number of tokens per frame.
+        dtype: dtype of the returned mask.
+        device: device of the returned mask.
+        batch_size: when given, the mask is expanded (as a view, without copying) to
+            `(batch_size, seq_len, seq_len)`.
+
+    Returns:
+        A tensor of shape `(seq_len, seq_len)` with `seq_len = n_frame * n_hw`, or
+        `(batch_size, seq_len, seq_len)` when `batch_size` is not `None`.
+    """
+    seq_len = n_frame * n_hw
+    # Frame index of every token, e.g. n_frame=2, n_hw=2 -> [0, 0, 1, 1]. Built in one shot instead of
+    # running one slice assignment per token from a Python loop.
+    frame_of_token = torch.arange(n_frame, device=device).repeat_interleave(n_hw)
+    # visible[i, j]: key token j is in the frame of query token i, or in an earlier frame.
+    visible = frame_of_token.unsqueeze(0) <= frame_of_token.unsqueeze(1)
+    mask = torch.full((seq_len, seq_len), float("-inf"), dtype=dtype, device=device)
+    mask.masked_fill_(visible, 0)
+    if batch_size is not None:
+        mask = mask.unsqueeze(0).expand(batch_size, -1, -1)
+    return mask
+
+
+class CausalConv3d(nn.Module):
+    """`nn.Conv3d` preceded by explicit padding that, by default, is causal along the time axis.
+
+    This class passes no `padding` argument to the wrapped convolution; `forward` pads the input
+    with `F.pad` first. Height and width get `kernel // 2` on each side. The time axis gets
+    `kernel - 1` frames in front and none behind, or `kernel // 2` on both sides when
+    `disable_causal=True`.
+
+    Parameters:
+        chan_in: number of input channels.
+        chan_out: number of output channels.
+        kernel_size: a single int, or a `(time, height, width)` tuple.
+        stride: forwarded to `nn.Conv3d`.
+        dilation: forwarded to `nn.Conv3d`; it is not taken into account when computing the padding.
+        pad_mode: padding mode handed to `F.pad`.
+        disable_causal: pad the time axis symmetrically instead of front-only.
+        **kwargs: extra keyword arguments for `nn.Conv3d` (for example `bias`).
+    """
+
+    def __init__(
+        self,
+        chan_in,
+        chan_out,
+        kernel_size: Union[int, Tuple[int, int, int]],
+        stride: Union[int, Tuple[int, int, int]] = 1,
+        dilation: Union[int, Tuple[int, int, int]] = 1,
+        pad_mode = 'replicate',
+        disable_causal=False,
+        **kwargs
+    ):
+        super().__init__()
+
+        self.pad_mode = pad_mode
+
+        # Accept both forms allowed by the annotation; a bare `kernel_size // 2` fails on tuples.
+        if isinstance(kernel_size, (tuple, list)):
+            k_t, k_h, k_w = kernel_size
+        else:
+            k_t = k_h = k_w = kernel_size
+
+        # `F.pad` lists the pads starting from the last axis: (W, W, H, H, T_front, T_back).
+        if disable_causal:
+            padding = (k_w // 2, k_w // 2, k_h // 2, k_h // 2, k_t // 2, k_t // 2)
+        else:
+            padding = (k_w // 2, k_w // 2, k_h // 2, k_h // 2, k_t - 1, 0)
+        self.time_causal_padding = padding
+
+        self.conv = nn.Conv3d(chan_in, chan_out, kernel_size, stride = stride, dilation = dilation, **kwargs)
+
+    def forward(self, x):
+        """Pad `x` with `self.time_causal_padding`, then apply the convolution."""
+        x = F.pad(x, self.time_causal_padding, mode=self.pad_mode)
+        return self.conv(x)
+
+class CausalAvgPool3d(nn.Module):
+    """`nn.AvgPool3d` (with `ceil_mode=True`) preceded by front-only padding of the time axis.
+
+    Before pooling, `temporal stride - 1` frames are added in front of the time axis using
+    `pad_mode`; height and width are never padded. With `disable_causal=True` no padding is added.
+
+    Parameters:
+        kernel_size: a single int, or a `(time, height, width)` tuple; forwarded to `nn.AvgPool3d`.
+        stride: a single int, or a `(time, height, width)` tuple; forwarded to `nn.AvgPool3d`.
+        pad_mode: padding mode handed to `F.pad`.
+        disable_causal: skip the temporal padding.
+        **kwargs: extra keyword arguments for `nn.AvgPool3d`.
+    """
+
+    def __init__(
+        self,
+        kernel_size: Union[int, Tuple[int, int, int]],
+        stride: Union[int, Tuple[int, int, int]],
+        pad_mode = 'replicate',
+        disable_causal=False,
+        **kwargs
+    ):
+        super().__init__()
+
+        self.pad_mode = pad_mode
+
+        # Only the temporal stride drives the padding; a bare `stride - 1` fails on tuple strides.
+        temporal_stride = stride[0] if isinstance(stride, (tuple, list)) else stride
+
+        # `F.pad` lists the pads starting from the last axis: (W, W, H, H, T_front, T_back).
+        if disable_causal:
+            padding = (0, 0, 0, 0, 0, 0)
+        else:
+            padding = (0, 0, 0, 0, temporal_stride - 1, 0)
+        self.time_causal_padding = padding
+
+        # Kept under the attribute name `conv` although it is a pooling layer.
+        self.conv = nn.AvgPool3d(kernel_size, stride=stride, ceil_mode=True, **kwargs)
+
+    def forward(self, x):
+        """Pad `x` with `self.time_causal_padding`, then apply the average pooling."""
+        x = F.pad(x, self.time_causal_padding, mode=self.pad_mode)
+        return self.conv(x)
+
+class UpsampleCausal3D(nn.Module):
+    """A 3D upsampling layer with an optional convolution.
+
+    In the default (causal) mode the first frame is upsampled along height and width only, while
+    the remaining frames are upsampled along time, height and width. With `disable_causal=True`
+    all frames are upsampled along all three axes.
+
+    Parameters:
+        channels (`int`):
+            number of channels in the inputs and outputs.
+        use_conv (`bool`, default `False`):
+            option to use a convolution.
+        use_conv_transpose (`bool`, default `False`):
+            option to use a convolution transpose. Not implemented: `True` raises
+            `NotImplementedError`.
+        out_channels (`int`, optional):
+            number of output channels. Defaults to `channels`.
+        name (`str`, default `conv`):
+            name of the upsampling 3D layer. The convolution is stored as `self.conv` when the name
+            is `"conv"` and as `self.Conv2d_0` otherwise.
+        kernel_size (`int`, optional):
+            kernel size of the convolution; defaults to 3 when `use_conv` is set.
+        padding:
+            unused; kept so existing callers keep working.
+        norm_type (`str`, optional):
+            `"ln_norm"`, `"rms_norm"` or `None`. A norm layer can be constructed, but `forward`
+            raises `NotImplementedError` when one is present.
+        eps, elementwise_affine:
+            forwarded to the norm layer.
+        bias (`bool`, default `True`):
+            forwarded to the convolution.
+        interpolate (`bool`, default `True`):
+            when `False`, the nearest-neighbour interpolation step is skipped.
+        upsample_factor (`tuple`, default `(2, 2, 2)`):
+            scale factors for the (time, height, width) axes.
+        disable_causal (`bool`, default `False`):
+            treat the first frame like every other frame.
+    """
+
+    def __init__(
+        self,
+        channels: int,
+        use_conv: bool = False,
+        use_conv_transpose: bool = False,
+        out_channels: Optional[int] = None,
+        name: str = "conv",
+        kernel_size: Optional[int] = None,
+        padding=1,
+        norm_type=None,
+        eps=None,
+        elementwise_affine=None,
+        bias=True,
+        interpolate=True,
+        upsample_factor=(2, 2, 2),
+        disable_causal=False,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.use_conv_transpose = use_conv_transpose
+        self.name = name
+        self.interpolate = interpolate
+        self.upsample_factor = upsample_factor
+        self.disable_causal = disable_causal
+
+        if norm_type == "ln_norm":
+            self.norm = nn.LayerNorm(channels, eps, elementwise_affine)
+        elif norm_type == "rms_norm":
+            self.norm = RMSNorm(channels, eps, elementwise_affine)
+        elif norm_type is None:
+            self.norm = None
+        else:
+            raise ValueError(f"unknown norm_type: {norm_type}")
+
+        conv = None
+        if use_conv_transpose:
+            # Raised explicitly rather than via `assert False`: asserts vanish under `python -O`, which
+            # would let a 2D transposed convolution be built for 5D input.
+            raise NotImplementedError("UpsampleCausal3D does not implement `use_conv_transpose=True` yet")
+        elif use_conv:
+            if kernel_size is None:
+                kernel_size = 3
+            conv = CausalConv3d(self.channels, self.out_channels, kernel_size=kernel_size, bias=bias, disable_causal=disable_causal)
+
+        if name == "conv":
+            self.conv = conv
+        else:
+            self.Conv2d_0 = conv
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        output_size: Optional[int] = None,
+        scale: float = 1.0,
+    ) -> torch.FloatTensor:
+        """Upsample `hidden_states` (unpacked as `B, C, T, H, W`) and apply the optional convolution.
+
+        Args:
+            hidden_states: input whose axis 1 must have `self.channels` entries.
+            output_size: not supported in causal mode (raises `NotImplementedError`); ignored when
+                `disable_causal` is set.
+            scale: unused.
+
+        Raises:
+            NotImplementedError: if the layer has a norm, or if `output_size` is given in causal mode.
+        """
+        assert hidden_states.shape[1] == self.channels
+
+        if self.norm is not None:
+            raise NotImplementedError("UpsampleCausal3D does not implement the normalisation path yet")
+
+        # Cast to float32 as the 'upsample_nearest2d_out_frame' op does not support bfloat16
+        # https://github.com/pytorch/pytorch/issues/86679
+        dtype = hidden_states.dtype
+        if dtype == torch.bfloat16:
+            hidden_states = hidden_states.to(torch.float32)
+
+        # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
+        if hidden_states.shape[0] >= 64:
+            hidden_states = hidden_states.contiguous()
+
+        if self.interpolate:
+            # The unpacking also enforces a 5D input.
+            _, _, num_frames, _, _ = hidden_states.shape
+            if self.disable_causal:
+                hidden_states = F.interpolate(hidden_states, scale_factor=self.upsample_factor, mode="nearest")
+            else:
+                if output_size is not None:
+                    raise NotImplementedError("UpsampleCausal3D does not implement `output_size` yet")
+
+                first_h, other_h = hidden_states.split((1, num_frames - 1), dim=2)
+
+                # The first frame is upsampled as a 2D image: spatial factors only.
+                first_h = first_h.squeeze(2)
+                first_h = F.interpolate(first_h, scale_factor=self.upsample_factor[1:], mode="nearest")
+                first_h = first_h.unsqueeze(2)
+
+                if num_frames > 1:
+                    other_h = F.interpolate(other_h, scale_factor=self.upsample_factor, mode="nearest")
+                    hidden_states = torch.cat((first_h, other_h), dim=2)
+                else:
+                    hidden_states = first_h
+
+        # Undo the float32 detour taken for bfloat16 inputs.
+        if dtype == torch.bfloat16:
+            hidden_states = hidden_states.to(dtype)
+
+        if self.use_conv:
+            if self.name == "conv":
+                hidden_states = self.conv(hidden_states)
+            else:
+                hidden_states = self.Conv2d_0(hidden_states)
+
+        return hidden_states
+
+class DownsampleCausal3D(nn.Module):
+    """A 3D downsampling layer built on a strided `CausalConv3d`.
+
+    Parameters:
+        channels (`int`):
+            number of channels in the inputs and outputs.
+        use_conv (`bool`, default `False`):
+            option to use a convolution. Only `True` is implemented; `False` raises
+            `NotImplementedError`.
+        out_channels (`int`, optional):
+            number of output channels. Defaults to `channels`.
+        padding (`int`, default `1`):
+            stored on the module; the convolution computes its own padding.
+        name (`str`, default `conv`):
+            name of the downsampling 3D layer. The convolution is always reachable as `self.conv`;
+            for the name `"conv"` it is additionally registered as `self.Conv2d_0`.
+        kernel_size (default `3`):
+            kernel size of the convolution.
+        norm_type (`str`, optional):
+            `"ln_norm"`, `"rms_norm"` or `None`; the norm is applied over the channel axis.
+        eps, elementwise_affine:
+            forwarded to the norm layer.
+        bias (`bool`, default `True`):
+            forwarded to the convolution.
+        stride (default `2`):
+            stride of the convolution.
+        disable_causal (`bool`, default `False`):
+            forwarded to the convolution.
+    """
+
+    def __init__(
+        self,
+        channels: int,
+        use_conv: bool = False,
+        out_channels: Optional[int] = None,
+        padding: int = 1,
+        name: str = "conv",
+        kernel_size=3,
+        norm_type=None,
+        eps=None,
+        elementwise_affine=None,
+        bias=True,
+        stride=2,
+        disable_causal=False,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.padding = padding
+        self.name = name
+
+        if norm_type == "ln_norm":
+            self.norm = nn.LayerNorm(channels, eps, elementwise_affine)
+        elif norm_type == "rms_norm":
+            self.norm = RMSNorm(channels, eps, elementwise_affine)
+        elif norm_type is None:
+            self.norm = None
+        else:
+            raise ValueError(f"unknown norm_type: {norm_type}")
+
+        if not use_conv:
+            raise NotImplementedError
+        conv = CausalConv3d(
+            self.channels, self.out_channels, kernel_size=kernel_size, stride=stride, disable_causal=disable_causal, bias=bias
+        )
+
+        if name == "conv":
+            # Extra alias registered before `conv`, so module names and their order stay as they were.
+            self.Conv2d_0 = conv
+        self.conv = conv
+
+    def forward(self, hidden_states: torch.FloatTensor, scale: float = 1.0) -> torch.FloatTensor:
+        """Optionally normalise over the channel axis, then apply the strided convolution.
+
+        Args:
+            hidden_states: input whose axis 1 must have `self.channels` entries.
+            scale: unused.
+        """
+        assert hidden_states.shape[1] == self.channels
+
+        if self.norm is not None:
+            # The norm acts on the last axis, so channels are moved there and back. `movedim` works
+            # for any rank, whereas the former 4-axis `permute` raised on 5D input.
+            hidden_states = self.norm(hidden_states.movedim(1, -1)).movedim(-1, 1)
+
+        assert hidden_states.shape[1] == self.channels
+
+        hidden_states = self.conv(hidden_states)
+
+        return hidden_states
+
+class ResnetBlockCausal3D(nn.Module):
+    r"""
+    A Resnet block built from `CausalConv3d` layers.
+
+    Parameters:
+        in_channels (`int`): The number of channels in the input.
+        out_channels (`int`, *optional*, default to be `None`):
+            The number of output channels for the first conv layer. If None, same as `in_channels`.
+        conv_shortcut (`bool`, *optional*, default to `False`): only stored as `use_conv_shortcut`.
+        dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
+        temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
+            `None` disables the timestep projection.
+        groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer.
+        groups_out (`int`, *optional*, default to None):
+            The number of groups to use for the second normalization layer. if set to None, same as `groups`.
+        pre_norm (`bool`, *optional*, default to `True`): accepted but ignored; `self.pre_norm` is always `True`.
+        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
+        non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use.
+        skip_time_act (`bool`, *optional*, default to `False`): If `True`, the activation is not applied to
+            the timestep embedding before projecting it.
+        time_embedding_norm (`str`, *optional*, default to `"default"` ): Time scale shift config.
+            By default, apply timestep embedding conditioning with a simple shift mechanism. Choose "scale_shift" or
+            "ada_group" for a stronger conditioning with scale and shift. "spatial" is accepted as well.
+        kernel (`torch.FloatTensor`, optional, default to None): unused by this block.
+        output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output.
+        use_in_shortcut (`bool`, *optional*, default to `None`):
+            If `True`, add a 1x1 conv layer for skip-connection. If `None`, it is added when
+            `in_channels` differs from the number of output channels.
+        up (`bool`, *optional*, default to `False`): If `True`, add an upsample layer.
+        down (`bool`, *optional*, default to `False`): If `True`, add a downsample layer.
+        conv_shortcut_bias (`bool`, *optional*, default to `True`): If `True`, adds a learnable bias to the
+            `conv_shortcut` output.
+        conv_3d_out_channels (`int`, *optional*, default to `None`): the number of channels in the output.
+            If None, same as `out_channels`.
+        disable_causal (`bool`, *optional*, default to `False`): forwarded to every convolution and resampler.
+    """
+
+    def __init__(
+        self,
+        *,
+        in_channels: int,
+        out_channels: Optional[int] = None,
+        conv_shortcut: bool = False,
+        dropout: float = 0.0,
+        temb_channels: int = 512,
+        groups: int = 32,
+        groups_out: Optional[int] = None,
+        pre_norm: bool = True,
+        eps: float = 1e-6,
+        non_linearity: str = "swish",
+        skip_time_act: bool = False,
+        time_embedding_norm: str = "default",  # default, scale_shift, ada_group, spatial
+        kernel: Optional[torch.FloatTensor] = None,
+        output_scale_factor: float = 1.0,
+        use_in_shortcut: Optional[bool] = None,
+        up: bool = False,
+        down: bool = False,
+        conv_shortcut_bias: bool = True,
+        conv_3d_out_channels: Optional[int] = None,
+        disable_causal: bool = False,
+    ):
+        super().__init__()
+        # The `pre_norm` argument was stored and then immediately overwritten; only the forced value
+        # is kept, so behaviour is unchanged.
+        self.pre_norm = True
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+        self.up = up
+        self.down = down
+        self.output_scale_factor = output_scale_factor
+        self.time_embedding_norm = time_embedding_norm
+        self.skip_time_act = skip_time_act
+
+        if groups_out is None:
+            groups_out = groups
+
+        if self.time_embedding_norm == "ada_group":
+            self.norm1 = AdaGroupNorm(temb_channels, in_channels, groups, eps=eps)
+        elif self.time_embedding_norm == "spatial":
+            self.norm1 = SpatialNorm(in_channels, temb_channels)
+        else:
+            self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
+
+        self.conv1 = CausalConv3d(in_channels, out_channels, kernel_size=3, stride=1, disable_causal=disable_causal)
+
+        if temb_channels is not None:
+            if self.time_embedding_norm == "default":
+                self.time_emb_proj = nn.Linear(temb_channels, out_channels)
+            elif self.time_embedding_norm == "scale_shift":
+                # Twice the channels: one half becomes the scale, the other the shift.
+                self.time_emb_proj = nn.Linear(temb_channels, 2 * out_channels)
+            elif self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial":
+                self.time_emb_proj = None
+            else:
+                raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ")
+        else:
+            self.time_emb_proj = None
+
+        if self.time_embedding_norm == "ada_group":
+            self.norm2 = AdaGroupNorm(temb_channels, out_channels, groups_out, eps=eps)
+        elif self.time_embedding_norm == "spatial":
+            self.norm2 = SpatialNorm(out_channels, temb_channels)
+        else:
+            self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)
+
+        self.dropout = torch.nn.Dropout(dropout)
+        conv_3d_out_channels = conv_3d_out_channels or out_channels
+        self.conv2 = CausalConv3d(out_channels, conv_3d_out_channels, kernel_size=3, stride=1, disable_causal=disable_causal)
+
+        self.nonlinearity = get_activation(non_linearity)
+
+        self.upsample = self.downsample = None
+        if self.up:
+            self.upsample = UpsampleCausal3D(in_channels, use_conv=False, disable_causal=disable_causal)
+        elif self.down:
+            # NOTE: `DownsampleCausal3D` raises NotImplementedError for `use_conv=False`, so `down=True`
+            # cannot currently be constructed.
+            self.downsample = DownsampleCausal3D(in_channels, use_conv=False, disable_causal=disable_causal, name="op")
+
+        self.use_in_shortcut = self.in_channels != conv_3d_out_channels if use_in_shortcut is None else use_in_shortcut
+
+        self.conv_shortcut = None
+        if self.use_in_shortcut:
+            self.conv_shortcut = CausalConv3d(
+                in_channels,
+                conv_3d_out_channels,
+                kernel_size=1,
+                stride=1,
+                disable_causal=disable_causal,
+                bias=conv_shortcut_bias,
+            )
+
+    def forward(
+        self,
+        input_tensor: torch.FloatTensor,
+        temb: torch.FloatTensor,
+        scale: float = 1.0,
+    ) -> torch.FloatTensor:
+        """Run norm -> act -> (resample) -> conv -> time conditioning -> norm -> act -> dropout -> conv,
+        then add the (optionally projected) input and divide by `output_scale_factor`.
+
+        Args:
+            input_tensor: the block input.
+            temb: timestep embedding, or `None`.
+            scale: only forwarded to the optional up/down-sampler.
+        """
+        hidden_states = input_tensor
+        # "ada_group" and "spatial" norms take the embedding as a second argument.
+        norm_takes_temb = self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial"
+
+        if norm_takes_temb:
+            hidden_states = self.norm1(hidden_states, temb)
+        else:
+            hidden_states = self.norm1(hidden_states)
+
+        hidden_states = self.nonlinearity(hidden_states)
+
+        if self.upsample is not None:
+            # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
+            if hidden_states.shape[0] >= 64:
+                input_tensor = input_tensor.contiguous()
+                hidden_states = hidden_states.contiguous()
+            input_tensor = self.upsample(input_tensor, scale=scale)
+            hidden_states = self.upsample(hidden_states, scale=scale)
+        elif self.downsample is not None:
+            input_tensor = self.downsample(input_tensor, scale=scale)
+            hidden_states = self.downsample(hidden_states, scale=scale)
+
+        hidden_states = self.conv1(hidden_states)
+
+        if self.time_emb_proj is not None:
+            if not self.skip_time_act:
+                temb = self.nonlinearity(temb)
+            # `nn.Linear` takes a single input (the old extra `scale` argument raised TypeError), and the
+            # projection needs three trailing singleton axes to broadcast over the axes that follow
+            # the channel axis of the `CausalConv3d` output.
+            temb = self.time_emb_proj(temb)[:, :, None, None, None]
+
+        if temb is not None and self.time_embedding_norm == "default":
+            hidden_states = hidden_states + temb
+
+        if norm_takes_temb:
+            hidden_states = self.norm2(hidden_states, temb)
+        else:
+            hidden_states = self.norm2(hidden_states)
+
+        if temb is not None and self.time_embedding_norm == "scale_shift":
+            # Named so the halves do not shadow the `scale` parameter.
+            temb_scale, temb_shift = torch.chunk(temb, 2, dim=1)
+            hidden_states = hidden_states * (1 + temb_scale) + temb_shift
+
+        hidden_states = self.nonlinearity(hidden_states)
+
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+
+        if self.conv_shortcut is not None:
+            input_tensor = self.conv_shortcut(input_tensor)
+
+        output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
+
+        return output_tensor
+
+def get_down_block3d(
+    down_block_type: str,
+    num_layers: int,
+    in_channels: int,
+    out_channels: int,
+    temb_channels: int,
+    add_downsample: bool,
+    downsample_stride: int,
+    resnet_eps: float,
+    resnet_act_fn: str,
+    transformer_layers_per_block: int = 1,
+    num_attention_heads: Optional[int] = None,
+    resnet_groups: Optional[int] = None,
+    cross_attention_dim: Optional[int] = None,
+    downsample_padding: Optional[int] = None,
+    dual_cross_attention: bool = False,
+    use_linear_projection: bool = False,
+    only_cross_attention: bool = False,
+    upcast_attention: bool = False,
+    resnet_time_scale_shift: str = "default",
+    attention_type: str = "default",
+    resnet_skip_time_act: bool = False,
+    resnet_out_scale_factor: float = 1.0,
+    cross_attention_norm: Optional[str] = None,
+    attention_head_dim: Optional[int] = None,
+    downsample_type: Optional[str] = None,
+    dropout: float = 0.0,
+    disable_causal: bool = False,
+):
+    """Instantiate the down block named by `down_block_type`.
+
+    Only `"DownEncoderBlockCausal3D"` is supported; a leading `"UNetRes"` in the name is ignored.
+    Of the parameters, only `num_layers`, `in_channels`, `out_channels`, `dropout`,
+    `add_downsample`, `downsample_stride`, `resnet_eps`, `resnet_act_fn`, `resnet_groups`,
+    `downsample_padding`, `resnet_time_scale_shift` and `disable_causal` reach the block; the
+    remaining ones are accepted but not used.
+
+    Raises:
+        ValueError: if `down_block_type` names no supported block.
+    """
+    # If attn head dim is not defined, we default it to the number of heads
+    if attention_head_dim is None:
+        # `Logger.warn` is a deprecated alias of `Logger.warning`.
+        logger.warning(
+            f"It is recommended to provide `attention_head_dim` when calling `get_down_block`. Defaulting `attention_head_dim` to {num_attention_heads}."
+        )
+        attention_head_dim = num_attention_heads
+
+    if down_block_type.startswith("UNetRes"):
+        down_block_type = down_block_type[len("UNetRes"):]
+    if down_block_type == "DownEncoderBlockCausal3D":
+        return DownEncoderBlockCausal3D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            dropout=dropout,
+            add_downsample=add_downsample,
+            downsample_stride=downsample_stride,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            disable_causal=disable_causal,
+        )
+    raise ValueError(f"{down_block_type} does not exist.")
+
+def get_up_block3d(
+    up_block_type: str,
+    num_layers: int,
+    in_channels: int,
+    out_channels: int,
+    prev_output_channel: int,
+    temb_channels: int,
+    add_upsample: bool,
+    upsample_scale_factor: Tuple,
+    resnet_eps: float,
+    resnet_act_fn: str,
+    resolution_idx: Optional[int] = None,
+    transformer_layers_per_block: int = 1,
+    num_attention_heads: Optional[int] = None,
+    resnet_groups: Optional[int] = None,
+    cross_attention_dim: Optional[int] = None,
+    dual_cross_attention: bool = False,
+    use_linear_projection: bool = False,
+    only_cross_attention: bool = False,
+    upcast_attention: bool = False,
+    resnet_time_scale_shift: str = "default",
+    attention_type: str = "default",
+    resnet_skip_time_act: bool = False,
+    resnet_out_scale_factor: float = 1.0,
+    cross_attention_norm: Optional[str] = None,
+    attention_head_dim: Optional[int] = None,
+    upsample_type: Optional[str] = None,
+    dropout: float = 0.0,
+    disable_causal: bool = False,
+) -> nn.Module:
+    """Instantiate the up block named by `up_block_type`.
+
+    Only `"UpDecoderBlockCausal3D"` is supported; a leading `"UNetRes"` in the name is ignored.
+    Of the parameters, only `num_layers`, `in_channels`, `out_channels`, `resolution_idx`,
+    `dropout`, `add_upsample`, `upsample_scale_factor`, `resnet_eps`, `resnet_act_fn`,
+    `resnet_groups`, `resnet_time_scale_shift`, `temb_channels` and `disable_causal` reach the
+    block; the remaining ones are accepted but not used.
+
+    Raises:
+        ValueError: if `up_block_type` names no supported block.
+    """
+    # If attn head dim is not defined, we default it to the number of heads
+    if attention_head_dim is None:
+        # `Logger.warn` is a deprecated alias of `Logger.warning`.
+        logger.warning(
+            f"It is recommended to provide `attention_head_dim` when calling `get_up_block`. Defaulting `attention_head_dim` to {num_attention_heads}."
+        )
+        attention_head_dim = num_attention_heads
+
+    if up_block_type.startswith("UNetRes"):
+        up_block_type = up_block_type[len("UNetRes"):]
+    if up_block_type == "UpDecoderBlockCausal3D":
+        return UpDecoderBlockCausal3D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
+            add_upsample=add_upsample,
+            upsample_scale_factor=upsample_scale_factor,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            temb_channels=temb_channels,
+            disable_causal=disable_causal,
+        )
+    raise ValueError(f"{up_block_type} does not exist.")
+
+
+class UNetMidBlockCausal3D(nn.Module):
+    """
+    A 3D UNet mid-block [`UNetMidBlockCausal3D`] with multiple residual blocks and optional attention blocks.
+
+    The block holds `num_layers + 1` residual blocks, with one attention slot (possibly empty) in
+    front of every residual block after the first.
+
+    Args:
+        in_channels (`int`): The number of input channels.
+        temb_channels (`int`): The number of temporal embedding channels.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout rate.
+        num_layers (`int`, *optional*, defaults to 1): The number of attention/residual pairs that follow
+            the first residual block.
+        resnet_eps (`float`, *optional*, 1e-6 ): The epsilon value for the resnet blocks.
+        resnet_time_scale_shift (`str`, *optional*, defaults to `default`):
+            The type of normalization to apply to the time embeddings.
+        resnet_act_fn (`str`, *optional*, defaults to `swish`): The activation function for the resnet blocks.
+        resnet_groups (`int`, *optional*, defaults to 32):
+            The number of groups to use in the group normalization layers of the resnet blocks. `None`
+            selects `min(in_channels // 4, 32)`.
+        attn_groups (`Optional[int]`, *optional*, defaults to None): The number of groups for the attention blocks.
+        resnet_pre_norm (`bool`, *optional*, defaults to `True`):
+            Whether to use pre-normalization for the resnet blocks.
+        add_attention (`bool`, *optional*, defaults to `True`): Whether to add attention blocks.
+        attention_head_dim (`int`, *optional*, defaults to 1):
+            Dimension of a single attention head. The number of attention heads is determined based on this value and
+            the number of input channels.
+        output_scale_factor (`float`, *optional*, defaults to 1.0): The output scale factor.
+        disable_causal (`bool`, *optional*, defaults to `False`): forwarded to the resnet blocks.
+        causal_attention (`bool`, *optional*, defaults to `False`):
+            Whether attention uses the frame-wise causal mask from `prepare_causal_attention_mask`.
+
+    Returns:
+        `torch.FloatTensor`: The output of the last residual block. `forward` unpacks its input as
+        `(batch_size, in_channels, frames, height, width)`.
+
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",  # default, spatial
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        attn_groups: Optional[int] = None,
+        resnet_pre_norm: bool = True,
+        add_attention: bool = True,
+        attention_head_dim: int = 1,
+        output_scale_factor: float = 1.0,
+        disable_causal: bool = False,
+        causal_attention: bool = False,
+    ):
+        super().__init__()
+        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+        self.add_attention = add_attention
+        self.causal_attention = causal_attention
+
+        if attn_groups is None:
+            attn_groups = resnet_groups if resnet_time_scale_shift == "default" else None
+
+        def make_resnet():
+            # Every residual block of the mid-block shares one configuration.
+            return ResnetBlockCausal3D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+                disable_causal=disable_causal,
+            )
+
+        # there is always at least one resnet
+        resnets = [make_resnet()]
+        attentions = []
+
+        if attention_head_dim is None:
+            # `Logger.warn` is a deprecated alias of `Logger.warning`.
+            logger.warning(
+                f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {in_channels}."
+            )
+            attention_head_dim = in_channels
+
+        for _ in range(num_layers):
+            if self.add_attention:
+                attentions.append(
+                    Attention(
+                        in_channels,
+                        heads=in_channels // attention_head_dim,
+                        dim_head=attention_head_dim,
+                        rescale_output_factor=output_scale_factor,
+                        eps=resnet_eps,
+                        norm_num_groups=attn_groups,
+                        spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None,
+                        residual_connection=True,
+                        bias=True,
+                        upcast_softmax=True,
+                        _from_deprecated_attn_block=True,
+                    )
+                )
+            else:
+                # Placeholder keeps `attentions` aligned with `resnets[1:]`.
+                attentions.append(None)
+
+            resnets.append(make_resnet())
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+    def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor:
+        """Apply the first resnet, then each (attention, resnet) pair in order."""
+        hidden_states = self.resnets[0](hidden_states, temb)
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            if attn is not None:
+                B, C, T, H, W = hidden_states.shape
+                # Attention runs on a token sequence: flatten (frames, height, width) into one axis.
+                hidden_states = rearrange(hidden_states, "b c f h w -> b (f h w) c")
+                if self.causal_attention:
+                    attention_mask = prepare_causal_attention_mask(T, H * W, hidden_states.dtype, hidden_states.device, batch_size=B)
+                else:
+                    attention_mask = None
+                hidden_states = attn(hidden_states, temb=temb, attention_mask=attention_mask)
+                hidden_states = rearrange(hidden_states, "b (f h w) c -> b c f h w", f=T, h=H, w=W)
+            hidden_states = resnet(hidden_states, temb)
+
+        return hidden_states
+
+
+class DownEncoderBlockCausal3D(nn.Module):
+    """Encoder stage: `num_layers` residual blocks followed by an optional strided downsampler.
+
+    The first residual block maps `in_channels` to `out_channels`; the following ones keep
+    `out_channels`. The residual blocks are built without a timestep embedding
+    (`temb_channels=None`). `self.downsamplers` is `None` when `add_downsample` is false.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_downsample: bool = True,
+        downsample_stride: int = 2,
+        downsample_padding: int = 1,
+        disable_causal: bool = False,
+    ):
+        super().__init__()
+
+        # Only the first layer sees the block's input width.
+        layer_in_channels = [in_channels if layer == 0 else out_channels for layer in range(num_layers)]
+        self.resnets = nn.ModuleList(
+            [
+                ResnetBlockCausal3D(
+                    in_channels=width,
+                    out_channels=out_channels,
+                    temb_channels=None,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                    disable_causal=disable_causal,
+                )
+                for width in layer_in_channels
+            ]
+        )
+
+        self.downsamplers = None
+        if add_downsample:
+            downsampler = DownsampleCausal3D(
+                out_channels,
+                use_conv=True,
+                out_channels=out_channels,
+                padding=downsample_padding,
+                name="op",
+                stride=downsample_stride,
+                disable_causal=disable_causal,
+            )
+            self.downsamplers = nn.ModuleList([downsampler])
+
+    def forward(self, hidden_states: torch.FloatTensor, scale: float = 1.0) -> torch.FloatTensor:
+        """Run the residual blocks (with `temb=None`), then the downsamplers if present."""
+        for block in self.resnets:
+            hidden_states = block(hidden_states, temb=None, scale=scale)
+
+        if self.downsamplers is None:
+            return hidden_states
+
+        for down in self.downsamplers:
+            hidden_states = down(hidden_states, scale)
+        return hidden_states
+
+
+class UpDecoderBlockCausal3D(nn.Module):
+    """Decoder stage: `num_layers` residual blocks followed by an optional upsampler.
+
+    The first residual block maps `in_channels` to `out_channels`; the following ones keep
+    `out_channels`. `self.upsamplers` is `None` when `add_upsample` is false, and
+    `resolution_idx` is only stored on the module.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",  # default, spatial
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+        upsample_scale_factor = (2, 2, 2),
+        temb_channels: Optional[int] = None,
+        disable_causal: bool = False,
+    ):
+        super().__init__()
+
+        # Only the first layer sees the block's input width.
+        layer_in_channels = [in_channels if layer == 0 else out_channels for layer in range(num_layers)]
+        self.resnets = nn.ModuleList(
+            [
+                ResnetBlockCausal3D(
+                    in_channels=width,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                    disable_causal=disable_causal,
+                )
+                for width in layer_in_channels
+            ]
+        )
+
+        self.upsamplers = None
+        if add_upsample:
+            upsampler = UpsampleCausal3D(
+                out_channels,
+                use_conv=True,
+                out_channels=out_channels,
+                upsample_factor=upsample_scale_factor,
+                disable_causal=disable_causal
+            )
+            self.upsamplers = nn.ModuleList([upsampler])
+
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, scale: float = 1.0
+    ) -> torch.FloatTensor:
+        """Run the residual blocks, then the upsamplers if present (`scale` is not passed to them)."""
+        for block in self.resnets:
+            hidden_states = block(hidden_states, temb=temb, scale=scale)
+
+        if self.upsamplers is None:
+            return hidden_states
+
+        for up in self.upsamplers:
+            hidden_states = up(hidden_states)
+        return hidden_states
+
diff --git a/hyvideo/vae/vae.py b/hyvideo/vae/vae.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7198a30bb3b5aaa283579cdf4e287f2906de2e8
--- /dev/null
+++ b/hyvideo/vae/vae.py
@@ -0,0 +1,427 @@
+from dataclasses import dataclass
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+from diffusers.utils import BaseOutput, is_torch_version
+from diffusers.utils.torch_utils import randn_tensor
+from diffusers.models.attention_processor import SpatialNorm
+from .unet_causal_3d_blocks import (
+ CausalConv3d,
+ UNetMidBlockCausal3D,
+ get_down_block3d,
+ get_up_block3d,
+)
+
+@dataclass
+class DecoderOutput(BaseOutput):
+ r"""
+ Output of decoding method.
+
+ Args:
+ sample (`torch.FloatTensor`):
+ The decoded output sample from the last layer of the model.
+ NOTE(review): the shape was previously documented as `(batch_size, num_channels, height, width)`,
+ but `DecoderCausal3D.forward` in this file asserts a 5-dimensional input, so the sample is
+ presumably 5-dimensional as well -- confirm against the caller.
+ """
+
+ sample: torch.FloatTensor
+
+
+class EncoderCausal3D(nn.Module):
+ r"""
+ The `EncoderCausal3D` layer of a variational autoencoder that encodes its input into a latent representation.
+
+ Args:
+ in_channels (`int`, *optional*, defaults to 3):
+ The number of input channels.
+ out_channels (`int`, *optional*, defaults to 3):
+ The number of output channels.
+ down_block_types (`Tuple[str, ...]`, *optional*, defaults to `("DownEncoderBlockCausal3D",)`):
+ The types of down blocks to use. Each name is passed to `get_down_block3d`.
+ block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
+ The number of output channels for each block.
+ layers_per_block (`int`, *optional*, defaults to 2):
+ The number of layers per block.
+ norm_num_groups (`int`, *optional*, defaults to 32):
+ The number of groups for normalization.
+ act_fn (`str`, *optional*, defaults to `"silu"`):
+ The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
+ double_z (`bool`, *optional*, defaults to `True`):
+ Whether the final convolution emits `2 * out_channels` channels instead of `out_channels`.
+ mid_block_add_attention (`bool`, *optional*, defaults to `True`):
+ Forwarded to the mid block as `add_attention`.
+ time_compression_ratio (`int`, *optional*, defaults to 4):
+ Temporal compression ratio. Only 4 and 8 are supported; any other value raises `ValueError`.
+ spatial_compression_ratio (`int`, *optional*, defaults to 8):
+ Spatial compression ratio; its `log2` is the number of blocks that downsample height and width.
+ disable_causal (`bool`, *optional*, defaults to `False`):
+ Forwarded unchanged to every `CausalConv3d`, down block and the mid block.
+ mid_block_causal_attn (`bool`, *optional*, defaults to `False`):
+ Forwarded to the mid block as `causal_attention`.
+ """
+
+ def __init__(
+ self,
+ in_channels: int = 3,
+ out_channels: int = 3,
+ down_block_types: Tuple[str, ...] = ("DownEncoderBlockCausal3D",),
+ block_out_channels: Tuple[int, ...] = (64,),
+ layers_per_block: int = 2,
+ norm_num_groups: int = 32,
+ act_fn: str = "silu",
+ double_z: bool = True,
+ mid_block_add_attention=True,
+ time_compression_ratio: int = 4,
+ spatial_compression_ratio: int = 8,
+ disable_causal: bool = False,
+ mid_block_causal_attn: bool = False,
+ ):
+ super().__init__()
+ self.layers_per_block = layers_per_block
+
+ self.conv_in = CausalConv3d(in_channels, block_out_channels[0], kernel_size=3, stride=1, disable_causal=disable_causal)
+ self.mid_block = None
+ self.down_blocks = nn.ModuleList([])
+
+ # down
+ output_channel = block_out_channels[0]
+ for i, down_block_type in enumerate(down_block_types):
+ # Each block consumes the previous block's output width.
+ input_channel = output_channel
+ output_channel = block_out_channels[i]
+ # NOTE(review): the loop runs over `down_block_types` but "final" is judged by
+ # `len(block_out_channels)` -- assumes both tuples have the same length; confirm.
+ is_final_block = i == len(block_out_channels) - 1
+ # Number of stride-2 stages needed to reach each compression ratio (loop-invariant).
+ num_spatial_downsample_layers = int(np.log2(spatial_compression_ratio))
+ num_time_downsample_layers = int(np.log2(time_compression_ratio))
+
+ if time_compression_ratio == 4:
+ # Spatial downsampling happens in the first blocks; temporal downsampling in the
+ # later blocks, never in the final one.
+ add_spatial_downsample = bool(i < num_spatial_downsample_layers)
+ add_time_downsample = bool(i >= (len(block_out_channels) - 1 - num_time_downsample_layers) and not is_final_block)
+ elif time_compression_ratio == 8:
+ # Both spatial and temporal downsampling happen in the first blocks.
+ add_spatial_downsample = bool(i < num_spatial_downsample_layers)
+ add_time_downsample = bool(i < num_time_downsample_layers)
+ else:
+ raise ValueError(f"Unsupported time_compression_ratio: {time_compression_ratio}")
+
+ # Stride tuple is ordered (T, H, W).
+ downsample_stride_HW = (2, 2) if add_spatial_downsample else (1, 1)
+ downsample_stride_T = (2, ) if add_time_downsample else (1, )
+ downsample_stride = tuple(downsample_stride_T + downsample_stride_HW)
+ down_block = get_down_block3d(
+ down_block_type,
+ num_layers=self.layers_per_block,
+ in_channels=input_channel,
+ out_channels=output_channel,
+ add_downsample=bool(add_spatial_downsample or add_time_downsample),
+ downsample_stride=downsample_stride,
+ resnet_eps=1e-6,
+ downsample_padding=0,
+ resnet_act_fn=act_fn,
+ resnet_groups=norm_num_groups,
+ attention_head_dim=output_channel,
+ temb_channels=None,
+ disable_causal=disable_causal,
+ )
+ self.down_blocks.append(down_block)
+
+ # mid
+ self.mid_block = UNetMidBlockCausal3D(
+ in_channels=block_out_channels[-1],
+ resnet_eps=1e-6,
+ resnet_act_fn=act_fn,
+ output_scale_factor=1,
+ resnet_time_scale_shift="default",
+ attention_head_dim=block_out_channels[-1],
+ resnet_groups=norm_num_groups,
+ temb_channels=None,
+ add_attention=mid_block_add_attention,
+ disable_causal=disable_causal,
+ causal_attention=mid_block_causal_attn,
+ )
+
+ # out
+ self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[-1], num_groups=norm_num_groups, eps=1e-6)
+ self.conv_act = nn.SiLU()
+
+ conv_out_channels = 2 * out_channels if double_z else out_channels
+ self.conv_out = CausalConv3d(block_out_channels[-1], conv_out_channels, kernel_size=3, disable_causal=disable_causal)
+
+ # Checkpointing is off by default; it is only consulted in `forward` while training.
+ self.gradient_checkpointing = False
+
+ def forward(self, sample: torch.FloatTensor) -> torch.FloatTensor:
+ r"""The forward method of the `EncoderCausal3D` class.
+
+ Applies `conv_in`, every down block, the mid block, then GroupNorm, SiLU and `conv_out`.
+ Fails an assertion when `sample` is not 5-dimensional.
+ """
+ assert len(sample.shape) == 5, "The input tensor should have 5 dimensions"
+
+ sample = self.conv_in(sample)
+
+ if self.training and self.gradient_checkpointing:
+
+ # Wraps a module in a plain function so it can be handed to torch's checkpoint API.
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ return module(*inputs)
+
+ return custom_forward
+
+ # down
+ # `use_reentrant=False` is only passed on torch >= 1.11.0.
+ if is_torch_version(">=", "1.11.0"):
+ for down_block in self.down_blocks:
+ sample = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(down_block), sample, use_reentrant=False
+ )
+ # middle
+ sample = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(self.mid_block), sample, use_reentrant=False
+ )
+ else:
+ for down_block in self.down_blocks:
+ sample = torch.utils.checkpoint.checkpoint(create_custom_forward(down_block), sample)
+ # middle
+ sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample)
+
+ else:
+ # down
+ for down_block in self.down_blocks:
+ sample = down_block(sample)
+
+ # middle
+ sample = self.mid_block(sample)
+
+ # post-process
+ sample = self.conv_norm_out(sample)
+ sample = self.conv_act(sample)
+ sample = self.conv_out(sample)
+
+ return sample
+
+
+class DecoderCausal3D(nn.Module):
+ r"""
+ The `DecoderCausal3D` layer of a variational autoencoder that decodes its latent representation into an output sample.
+
+ Args:
+ in_channels (`int`, *optional*, defaults to 3):
+ The number of input channels.
+ out_channels (`int`, *optional*, defaults to 3):
+ The number of output channels.
+ up_block_types (`Tuple[str, ...]`, *optional*, defaults to `("UpDecoderBlockCausal3D",)`):
+ The types of up blocks to use. Each name is passed to `get_up_block3d`.
+ block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
+ The number of output channels for each block. The decoder walks this tuple in reverse.
+ layers_per_block (`int`, *optional*, defaults to 2):
+ The number of layers per block. Each up block is built with `layers_per_block + 1` layers.
+ norm_num_groups (`int`, *optional*, defaults to 32):
+ The number of groups for normalization.
+ act_fn (`str`, *optional*, defaults to `"silu"`):
+ The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
+ norm_type (`str`, *optional*, defaults to `"group"`):
+ The normalization type to use. Can be either `"group"` or `"spatial"`.
+ mid_block_add_attention (`bool`, *optional*, defaults to `True`):
+ Forwarded to the mid block as `add_attention`.
+ time_compression_ratio (`int`, *optional*, defaults to 4):
+ Temporal compression ratio. Only 4 and 8 are supported; any other value raises `ValueError`.
+ spatial_compression_ratio (`int`, *optional*, defaults to 8):
+ Spatial compression ratio; its `log2` is the number of blocks that upsample height and width.
+ disable_causal (`bool`, *optional*, defaults to `False`):
+ Forwarded unchanged to every `CausalConv3d`, up block and the mid block.
+ mid_block_causal_attn (`bool`, *optional*, defaults to `False`):
+ Forwarded to the mid block as `causal_attention`.
+ """
+
+ def __init__(
+ self,
+ in_channels: int = 3,
+ out_channels: int = 3,
+ up_block_types: Tuple[str, ...] = ("UpDecoderBlockCausal3D",),
+ block_out_channels: Tuple[int, ...] = (64,),
+ layers_per_block: int = 2,
+ norm_num_groups: int = 32,
+ act_fn: str = "silu",
+ norm_type: str = "group", # group, spatial
+ mid_block_add_attention=True,
+ time_compression_ratio: int = 4,
+ spatial_compression_ratio: int = 8,
+ disable_causal: bool = False,
+ mid_block_causal_attn: bool = False,
+ ):
+ super().__init__()
+ self.layers_per_block = layers_per_block
+
+ self.conv_in = CausalConv3d(in_channels, block_out_channels[-1], kernel_size=3, stride=1, disable_causal=disable_causal)
+ self.mid_block = None
+ self.up_blocks = nn.ModuleList([])
+
+ # Only the "spatial" norm type uses conditioning channels (set to the latent channel count).
+ temb_channels = in_channels if norm_type == "spatial" else None
+
+ # mid
+ self.mid_block = UNetMidBlockCausal3D(
+ in_channels=block_out_channels[-1],
+ resnet_eps=1e-6,
+ resnet_act_fn=act_fn,
+ output_scale_factor=1,
+ resnet_time_scale_shift="default" if norm_type == "group" else norm_type,
+ attention_head_dim=block_out_channels[-1],
+ resnet_groups=norm_num_groups,
+ temb_channels=temb_channels,
+ add_attention=mid_block_add_attention,
+ disable_causal=disable_causal,
+ causal_attention=mid_block_causal_attn,
+ )
+
+ # up
+ reversed_block_out_channels = list(reversed(block_out_channels))
+ output_channel = reversed_block_out_channels[0]
+ for i, up_block_type in enumerate(up_block_types):
+ prev_output_channel = output_channel
+ output_channel = reversed_block_out_channels[i]
+ # NOTE(review): the loop runs over `up_block_types` but "final" is judged by
+ # `len(block_out_channels)` -- assumes both tuples have the same length; confirm.
+ is_final_block = i == len(block_out_channels) - 1
+ # Number of x2 stages needed to undo each compression ratio (loop-invariant).
+ num_spatial_upsample_layers = int(np.log2(spatial_compression_ratio))
+ num_time_upsample_layers = int(np.log2(time_compression_ratio))
+
+ if time_compression_ratio == 4:
+ # Spatial upsampling in the first blocks; temporal upsampling in the later
+ # blocks, never in the final one.
+ add_spatial_upsample = bool(i < num_spatial_upsample_layers)
+ add_time_upsample = bool(i >= len(block_out_channels) - 1 - num_time_upsample_layers and not is_final_block)
+ elif time_compression_ratio == 8:
+ # Both spatial and temporal upsampling happen in the last blocks.
+ add_spatial_upsample = bool(i >= len(block_out_channels) - num_spatial_upsample_layers)
+ add_time_upsample = bool(i >= len(block_out_channels) - num_time_upsample_layers)
+ else:
+ raise ValueError(f"Unsupported time_compression_ratio: {time_compression_ratio}")
+
+ # Scale-factor tuple is ordered (T, H, W).
+ upsample_scale_factor_HW = (2, 2) if add_spatial_upsample else (1, 1)
+ upsample_scale_factor_T = (2, ) if add_time_upsample else (1, )
+ upsample_scale_factor = tuple(upsample_scale_factor_T + upsample_scale_factor_HW)
+ up_block = get_up_block3d(
+ up_block_type,
+ num_layers=self.layers_per_block + 1,
+ in_channels=prev_output_channel,
+ out_channels=output_channel,
+ prev_output_channel=None,
+ add_upsample=bool(add_spatial_upsample or add_time_upsample),
+ upsample_scale_factor=upsample_scale_factor,
+ resnet_eps=1e-6,
+ resnet_act_fn=act_fn,
+ resnet_groups=norm_num_groups,
+ attention_head_dim=output_channel,
+ temb_channels=temb_channels,
+ resnet_time_scale_shift=norm_type,
+ disable_causal=disable_causal,
+ )
+ self.up_blocks.append(up_block)
+ # NOTE(review): redundant -- `prev_output_channel` is reassigned at the top of the next iteration.
+ prev_output_channel = output_channel
+
+ # out
+ if norm_type == "spatial":
+ self.conv_norm_out = SpatialNorm(block_out_channels[0], temb_channels)
+ else:
+ self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6)
+ self.conv_act = nn.SiLU()
+ self.conv_out = CausalConv3d(block_out_channels[0], out_channels, kernel_size=3, disable_causal=disable_causal)
+
+ # Checkpointing is off by default; it is only consulted in `forward` while training.
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ sample: torch.FloatTensor,
+ latent_embeds: Optional[torch.FloatTensor] = None,
+ ) -> torch.FloatTensor:
+ r"""The forward method of the `DecoderCausal3D` class.
+
+ Applies `conv_in`, the mid block, every up block, then the output norm, SiLU and `conv_out`.
+ `latent_embeds` is passed to the mid block and up blocks, and to the output norm when it is not `None`.
+ Fails an assertion when `sample` is not 5-dimensional.
+ """
+ assert len(sample.shape) == 5, "The input tensor should have 5 dimensions"
+
+ sample = self.conv_in(sample)
+
+ # dtype of the first up-block parameter; the mid-block output is cast to it before the up path.
+ upscale_dtype = next(iter(self.up_blocks.parameters())).dtype
+ if self.training and self.gradient_checkpointing:
+
+ # Wraps a module in a plain function so it can be handed to torch's checkpoint API.
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ return module(*inputs)
+
+ return custom_forward
+
+ # `use_reentrant=False` is only passed on torch >= 1.11.0.
+ if is_torch_version(">=", "1.11.0"):
+ # middle
+ sample = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(self.mid_block),
+ sample,
+ latent_embeds,
+ use_reentrant=False,
+ )
+ sample = sample.to(upscale_dtype)
+
+ # up
+ for up_block in self.up_blocks:
+ sample = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(up_block),
+ sample,
+ latent_embeds,
+ use_reentrant=False,
+ )
+ else:
+ # middle
+ sample = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(self.mid_block), sample, latent_embeds
+ )
+ sample = sample.to(upscale_dtype)
+
+ # up
+ for up_block in self.up_blocks:
+ sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample, latent_embeds)
+ else:
+ # middle
+ sample = self.mid_block(sample, latent_embeds)
+ sample = sample.to(upscale_dtype)
+
+ # up
+ for up_block in self.up_blocks:
+ sample = up_block(sample, latent_embeds)
+
+ # post-process
+ # The output norm only receives `latent_embeds` when one was supplied.
+ if latent_embeds is None:
+ sample = self.conv_norm_out(sample)
+ else:
+ sample = self.conv_norm_out(sample, latent_embeds)
+ sample = self.conv_act(sample)
+ sample = self.conv_out(sample)
+
+ return sample
+
+
+class DiagonalGaussianDistribution(object):
+    """Diagonal Gaussian parameterised by a tensor holding mean and log-variance.
+
+    ``parameters`` is split in two equal halves (mean, logvar) along the last
+    axis for 3-D input and along axis 1 for 4-D / 5-D input. The log-variance
+    is clamped to ``[-30, 20]`` before ``std`` and ``var`` are derived from it.
+
+    Args:
+        parameters: Tensor with 3, 4 or 5 dimensions.
+        deterministic: When true, ``std`` and ``var`` are forced to zero, so
+            ``sample()`` returns the mean and ``kl()`` / ``nll()`` return a
+            single zero.
+
+    Raises:
+        NotImplementedError: If ``parameters`` has an unsupported rank.
+    """
+
+    def __init__(self, parameters: torch.Tensor, deterministic: bool = False):
+        if parameters.ndim == 3:
+            dim = 2  # (B, L, C)
+        elif parameters.ndim in (4, 5):
+            dim = 1  # (B, C, T, H ,W) / (B, C, H, W)
+        else:
+            raise NotImplementedError(
+                f"parameters must have 3, 4 or 5 dimensions, got {parameters.ndim}"
+            )
+        self.parameters = parameters
+        self.mean, self.logvar = torch.chunk(parameters, 2, dim=dim)
+        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
+        self.deterministic = deterministic
+        self.std = torch.exp(0.5 * self.logvar)
+        self.var = torch.exp(self.logvar)
+        if self.deterministic:
+            self.var = self.std = torch.zeros_like(
+                self.mean, device=self.parameters.device, dtype=self.parameters.dtype
+            )
+
+    def _zero(self) -> torch.Tensor:
+        """Return a one-element zero tensor on the parameters' device and dtype."""
+        # A bare torch.Tensor([0.0]) always lives on the CPU in float32, which
+        # cannot be combined with losses computed on another device.
+        return torch.zeros(1, device=self.parameters.device, dtype=self.parameters.dtype)
+
+    def sample(self, generator: Optional[torch.Generator] = None) -> torch.FloatTensor:
+        """Draw ``mean + std * eps`` with ``eps`` standard normal noise."""
+        # make sure sample is on the same device as the parameters and has same dtype
+        noise = randn_tensor(
+            self.mean.shape,
+            generator=generator,
+            device=self.parameters.device,
+            dtype=self.parameters.dtype,
+        )
+        return self.mean + self.std * noise
+
+    def kl(self, other: Optional["DiagonalGaussianDistribution"] = None) -> torch.Tensor:
+        """KL divergence to ``other``, or to a standard normal when ``other`` is None.
+
+        The result is summed over every axis except the first.
+        """
+        if self.deterministic:
+            return self._zero()
+
+        reduce_dim = list(range(1, self.mean.ndim))
+        if other is None:
+            return 0.5 * torch.sum(
+                torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar,
+                dim=reduce_dim,
+            )
+        return 0.5 * torch.sum(
+            torch.pow(self.mean - other.mean, 2) / other.var
+            + self.var / other.var
+            - 1.0
+            - self.logvar
+            + other.logvar,
+            dim=reduce_dim,
+        )
+
+    def nll(self, sample: torch.Tensor, dims: Tuple[int, ...] = (1, 2, 3)) -> torch.Tensor:
+        """Negative log-likelihood of ``sample``, summed over ``dims``.
+
+        The default ``dims`` is an immutable tuple (it used to be a shared list).
+        """
+        if self.deterministic:
+            return self._zero()
+        logtwopi = float(np.log(2.0 * np.pi))
+        return 0.5 * torch.sum(
+            logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
+            dim=dims,
+        )
+
+    def mode(self) -> torch.Tensor:
+        """Return the distribution mean."""
+        return self.mean
diff --git a/loras_hunyuan/Readme.txt b/loras_hunyuan/Readme.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5eb63a3e537e1e30759d2c11f4727b41e55d79f3
--- /dev/null
+++ b/loras_hunyuan/Readme.txt
@@ -0,0 +1 @@
+loras for hunyuan t2v
\ No newline at end of file
diff --git a/loras_hunyuan_i2v/Readme.txt b/loras_hunyuan_i2v/Readme.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8845c855cfff632db5c89c53baa684ab6564b000
--- /dev/null
+++ b/loras_hunyuan_i2v/Readme.txt
@@ -0,0 +1 @@
+loras for hunyuan i2v
\ No newline at end of file
diff --git a/ltx_video/__init__.py b/ltx_video/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ltx_video/configs/ltxv-13b-0.9.7-dev.original.yaml b/ltx_video/configs/ltxv-13b-0.9.7-dev.original.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..97b3436e3c5d5b4376bc483e986d0af8c4483996
--- /dev/null
+++ b/ltx_video/configs/ltxv-13b-0.9.7-dev.original.yaml
@@ -0,0 +1,41 @@
+
+pipeline_type: multi-scale
+checkpoint_path: "ltxv-13b-0.9.7-dev.safetensors"
+downscale_factor: 0.6666666
+spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.7.safetensors"
+stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
+decode_timestep: 0.05
+decode_noise_scale: 0.025
+text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS"
+sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint"
+prompt_enhancement_words_threshold: 120
+prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
+prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
+stochastic_sampling: false
+
+
+first_pass:
+ #13b Dynamic
+ guidance_scale: [1, 6, 8, 6, 1, 1]
+ stg_scale: [0, 4, 4, 4, 2, 1]
+ rescaling_scale: [1, 0.5, 0.5, 1, 1, 1]
+ guidance_timesteps: [1.0, 0.9933, 0.9850, 0.9767, 0.9008, 0.6180]
+ skip_block_list: [[11, 25, 35, 39], [22, 35, 39], [28], [28], [28], [28]]
+ num_inference_steps: 30 #default
+
+
+second_pass:
+ #13b Dynamic
+ guidance_scale: [1, 6, 8, 6, 1, 1]
+ stg_scale: [0, 4, 4, 4, 2, 1]
+ rescaling_scale: [1, 0.5, 0.5, 1, 1, 1]
+ guidance_timesteps: [1.0, 0.9933, 0.9850, 0.9767, 0.9008, 0.6180]
+ skip_block_list: [[11, 25, 35, 39], [22, 35, 39], [28], [28], [28], [28]]
+ #13b Upscale
+ # guidance_scale: [1, 1, 1, 1, 1, 1]
+ # stg_scale: [1, 1, 1, 1, 1, 1]
+ # rescaling_scale: [1, 1, 1, 1, 1, 1]
+ # guidance_timesteps: [1.0, 0.9933, 0.9850, 0.9767, 0.9008, 0.6180]
+ # skip_block_list: [[42], [42], [42], [42], [42], [42]]
+ num_inference_steps: 30 #default
+ strength: 0.85
diff --git a/ltx_video/configs/ltxv-13b-0.9.7-dev.yaml b/ltx_video/configs/ltxv-13b-0.9.7-dev.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ae548253526c1de5804bb430407850573305cd14
--- /dev/null
+++ b/ltx_video/configs/ltxv-13b-0.9.7-dev.yaml
@@ -0,0 +1,34 @@
+pipeline_type: multi-scale
+checkpoint_path: "ltxv-13b-0.9.7-dev.safetensors"
+downscale_factor: 0.6666666
+spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.7.safetensors"
+stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
+decode_timestep: 0.05
+decode_noise_scale: 0.025
+text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS"
+precision: "bfloat16"
+sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint"
+prompt_enhancement_words_threshold: 120
+prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
+prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
+stochastic_sampling: false
+
+first_pass:
+ guidance_scale: [1, 1, 6, 8, 6, 1, 1]
+ stg_scale: [0, 0, 4, 4, 4, 2, 1]
+ rescaling_scale: [1, 1, 0.5, 0.5, 1, 1, 1]
+ guidance_timesteps: [1.0, 0.996, 0.9933, 0.9850, 0.9767, 0.9008, 0.6180]
+ skip_block_list: [[], [11, 25, 35, 39], [22, 35, 39], [28], [28], [28], [28]]
+ num_inference_steps: 30
+ skip_final_inference_steps: 3
+ cfg_star_rescale: true
+
+second_pass:
+ guidance_scale: [1]
+ stg_scale: [1]
+ rescaling_scale: [1]
+ guidance_timesteps: [1.0]
+ skip_block_list: [27]
+ num_inference_steps: 30
+ skip_initial_inference_steps: 17
+ cfg_star_rescale: true
\ No newline at end of file
diff --git a/ltx_video/configs/ltxv-13b-0.9.7-distilled.yaml b/ltx_video/configs/ltxv-13b-0.9.7-distilled.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9df17bb001b39d6d12c7013cb823c44b85d28aea
--- /dev/null
+++ b/ltx_video/configs/ltxv-13b-0.9.7-distilled.yaml
@@ -0,0 +1,28 @@
+pipeline_type: multi-scale
+checkpoint_path: "ltxv-13b-0.9.7-distilled.safetensors"
+downscale_factor: 0.6666666
+spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.7.safetensors"
+stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
+decode_timestep: 0.05
+decode_noise_scale: 0.025
+text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS"
+precision: "bfloat16"
+sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint"
+prompt_enhancement_words_threshold: 120
+prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
+prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
+stochastic_sampling: false
+
+first_pass:
+ timesteps: [1.0000, 0.9937, 0.9875, 0.9812, 0.9750, 0.9094, 0.7250]
+ guidance_scale: 1
+ stg_scale: 0
+ rescaling_scale: 1
+ skip_block_list: [42]
+
+second_pass:
+ timesteps: [0.9094, 0.7250, 0.4219]
+ guidance_scale: 1
+ stg_scale: 0
+ rescaling_scale: 1
+ skip_block_list: [42]
diff --git a/ltx_video/configs/ltxv-2b-0.9.6-dev.yaml b/ltx_video/configs/ltxv-2b-0.9.6-dev.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..487f99708e0672dd17b5bd78424f25261163f7dc
--- /dev/null
+++ b/ltx_video/configs/ltxv-2b-0.9.6-dev.yaml
@@ -0,0 +1,17 @@
+pipeline_type: base
+checkpoint_path: "ltxv-2b-0.9.6-dev-04-25.safetensors"
+guidance_scale: 3
+stg_scale: 1
+rescaling_scale: 0.7
+skip_block_list: [19]
+num_inference_steps: 40
+stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
+decode_timestep: 0.05
+decode_noise_scale: 0.025
+text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS"
+precision: "bfloat16"
+sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint"
+prompt_enhancement_words_threshold: 120
+prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
+prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
+stochastic_sampling: false
\ No newline at end of file
diff --git a/ltx_video/configs/ltxv-2b-0.9.6-distilled.yaml b/ltx_video/configs/ltxv-2b-0.9.6-distilled.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..39fae265425f058e3d27a846f104a01290cfade9
--- /dev/null
+++ b/ltx_video/configs/ltxv-2b-0.9.6-distilled.yaml
@@ -0,0 +1,17 @@
+pipeline_type: base
+checkpoint_path: "ltxv-2b-0.9.6-distilled-04-25.safetensors"
+guidance_scale: 3
+stg_scale: 1
+rescaling_scale: 0.7
+skip_block_list: [19]
+num_inference_steps: 8
+stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
+decode_timestep: 0.05
+decode_noise_scale: 0.025
+text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS"
+precision: "bfloat16"
+sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint"
+prompt_enhancement_words_threshold: 120
+prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
+prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
+stochastic_sampling: true
\ No newline at end of file
diff --git a/ltx_video/ltxv.py b/ltx_video/ltxv.py
new file mode 100644
index 0000000000000000000000000000000000000000..e82c0043de74958d74c55171a231c610d3eb82cf
--- /dev/null
+++ b/ltx_video/ltxv.py
@@ -0,0 +1,562 @@
+from mmgp import offload
+import argparse
+import os
+import random
+from datetime import datetime
+from pathlib import Path
+from diffusers.utils import logging
+from typing import Optional, List, Union
+import yaml
+from wan.utils.utils import calculate_new_dimensions
+import imageio
+import json
+import numpy as np
+import torch
+from safetensors import safe_open
+from PIL import Image
+from transformers import (
+ T5EncoderModel,
+ T5Tokenizer,
+ AutoModelForCausalLM,
+ AutoProcessor,
+ AutoTokenizer,
+)
+from huggingface_hub import hf_hub_download
+
+from .models.autoencoders.causal_video_autoencoder import (
+ CausalVideoAutoencoder,
+)
+from .models.transformers.symmetric_patchifier import SymmetricPatchifier
+from .models.transformers.transformer3d import Transformer3DModel
+from .pipelines.pipeline_ltx_video import (
+ ConditioningItem,
+ LTXVideoPipeline,
+ LTXMultiScalePipeline,
+)
+from .schedulers.rf import RectifiedFlowScheduler
+from .utils.skip_layer_strategy import SkipLayerStrategy
+from .models.autoencoders.latent_upsampler import LatentUpsampler
+from .pipelines import crf_compressor
+import cv2
+
+# Presumably upper bounds for a generation request (pixels / frame count).
+# NOTE(review): not referenced in the visible part of this module -- confirm where they are enforced.
+MAX_HEIGHT = 720
+MAX_WIDTH = 1280
+MAX_NUM_FRAMES = 257
+
+# Module logger, obtained through the `logging` helper imported from `diffusers.utils`.
+logger = logging.get_logger("LTX-Video")
+
+
+def get_total_gpu_memory():
+    """Return the total memory of CUDA device 0 in GiB, or 0 when CUDA is unavailable."""
+    if not torch.cuda.is_available():
+        return 0
+    device_properties = torch.cuda.get_device_properties(0)
+    return device_properties.total_memory / (1024**3)
+
+
+def get_device():
+    """Pick the preferred torch device name: "cuda", then "mps", then "cpu"."""
+    if torch.cuda.is_available():
+        return "cuda"
+    return "mps" if torch.backends.mps.is_available() else "cpu"
+
+
+def load_image_to_tensor_with_resize_and_crop(
+    image_input: Union[str, Image.Image],
+    target_height: int = 512,
+    target_width: int = 768,
+    just_crop: bool = False,
+) -> torch.Tensor:
+    """Load and process an image into a tensor.
+
+    The image is centre-cropped to the target aspect ratio, optionally resized,
+    blurred with a 3x3 Gaussian kernel, passed through ``crf_compressor.compress``
+    and rescaled from ``[0, 255]`` to ``[-1, 1]``.
+
+    Args:
+        image_input: Either a file path (str) or a PIL Image object
+        target_height: Desired height of output tensor
+        target_width: Desired width of output tensor
+        just_crop: If True, only crop the image to the target size without resizing
+
+    Returns:
+        A 5D tensor laid out as (batch_size=1, channels=3, num_frames=1, height, width).
+
+    Raises:
+        ValueError: If ``image_input`` is neither a path string nor a PIL image.
+    """
+    if isinstance(image_input, str):
+        # The context manager releases the file handle; convert() returns a
+        # fully loaded copy that stays valid after the file is closed.
+        with Image.open(image_input) as opened:
+            image = opened.convert("RGB")
+    elif isinstance(image_input, Image.Image):
+        # Grayscale / RGBA / palette images would otherwise break the
+        # (H, W, 3) layout assumed by the permute below.
+        image = image_input if image_input.mode == "RGB" else image_input.convert("RGB")
+    else:
+        raise ValueError("image_input must be either a file path or a PIL Image object")
+
+    input_width, input_height = image.size
+    aspect_ratio_target = target_width / target_height
+    aspect_ratio_frame = input_width / input_height
+    if aspect_ratio_frame > aspect_ratio_target:
+        # Source is wider than the target: keep full height, trim the sides.
+        new_width = int(input_height * aspect_ratio_target)
+        new_height = input_height
+        x_start = (input_width - new_width) // 2
+        y_start = 0
+    else:
+        # Source is taller (or equal): keep full width, trim top and bottom.
+        new_width = input_width
+        new_height = int(input_width / aspect_ratio_target)
+        x_start = 0
+        y_start = (input_height - new_height) // 2
+
+    image = image.crop((x_start, y_start, x_start + new_width, y_start + new_height))
+    if not just_crop:
+        image = image.resize((target_width, target_height))
+
+    image = np.array(image)
+    image = cv2.GaussianBlur(image, (3, 3), 0)
+    frame_tensor = torch.from_numpy(image).float()
+    # The compressor is fed values in [0, 1]; scale back to [0, 255] afterwards.
+    frame_tensor = crf_compressor.compress(frame_tensor / 255.0) * 255.0
+    frame_tensor = frame_tensor.permute(2, 0, 1)
+    frame_tensor = (frame_tensor / 127.5) - 1.0
+    # Create 5D tensor: (batch_size=1, channels=3, num_frames=1, height, width)
+    return frame_tensor.unsqueeze(0).unsqueeze(2)
+
+
+
+def calculate_padding(
+    source_height: int, source_width: int, target_height: int, target_width: int
+) -> tuple[int, int, int, int]:
+    """Compute the per-side padding that grows a source size to a target size.
+
+    The total padding on each axis is split in half; when it is odd, the extra
+    unit goes to the bottom / right side.
+
+    Returns:
+        The padding amounts as ``(left, right, top, bottom)``.
+    """
+    top, odd_rows = divmod(target_height - source_height, 2)
+    left, odd_cols = divmod(target_width - source_width, 2)
+    return (left, left + odd_cols, top, top + odd_rows)
+
+
+
+
+def seed_everething(seed: int):
+    """Seed Python's `random`, NumPy and torch (plus CUDA / MPS when available) with `seed`."""
+    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
+        seed_fn(seed)
+
+    # Backend-specific generators are only seeded when the backend is present.
+    backend_seeders = (
+        (torch.cuda.is_available, torch.cuda.manual_seed),
+        (torch.backends.mps.is_available, lambda value: torch.mps.manual_seed(value)),
+    )
+    for backend_available, backend_seed in backend_seeders:
+        if backend_available():
+            backend_seed(seed)
+
+
+class LTXV:
+
+ def __init__(
+     self,
+     model_filepath: str,
+     text_encoder_filepath: str,
+     dtype = torch.bfloat16,
+     VAE_dtype = torch.bfloat16,
+     mixed_precision_transformer = False
+ ):
+     """Load every sub-model and assemble the multi-scale LTX-Video pipeline.
+
+     Args:
+         model_filepath: Transformer checkpoint, loaded with
+             `offload.fast_load_transformers_model` as a `Transformer3DModel`.
+         text_encoder_filepath: Text-encoder checkpoint, loaded with
+             `offload.fast_load_transformers_model`.
+         dtype: Recorded on the transformer as `_model_dtype`.
+         VAE_dtype: dtype the VAE and the latent upsampler are cast to.
+             `torch.float16` is replaced by `torch.bfloat16`.
+         mixed_precision_transformer: When true, the transformer's
+             `_lock_dtype` is set to `torch.float`.
+
+     The VAE, scheduler, latent upsampler and tokenizer are read from fixed
+     paths under `ckpts/`. The result is exposed as `self.pipeline`,
+     `self.model` (the transformer) and `self.vae`.
+     """
+     self.mixed_precision_transformer = mixed_precision_transformer
+
+     # NOTE(review): the files under ckpts/ were presumably exported beforehand with
+     # `offload.save_model` from the upstream checkpoints -- confirm.
+     vae = offload.fast_load_transformers_model("ckpts/ltxv_0.9.7_VAE.safetensors", modelClass=CausalVideoAutoencoder)
+     # float16 is never used for the VAE; bfloat16 is substituted.
+     if VAE_dtype == torch.float16:
+         VAE_dtype = torch.bfloat16
+
+     vae = vae.to(VAE_dtype)
+     vae._model_dtype = VAE_dtype
+
+     transformer = offload.fast_load_transformers_model(model_filepath, modelClass=Transformer3DModel)
+     transformer._model_dtype = dtype
+     if mixed_precision_transformer:
+         transformer._lock_dtype = torch.float
+
+     scheduler = RectifiedFlowScheduler.from_pretrained("ckpts/ltxv_scheduler.json")
+
+     latent_upsampler = LatentUpsampler.from_pretrained("ckpts/ltxv_0.9.7_spatial_upscaler.safetensors").to("cpu").eval()
+     latent_upsampler.to(VAE_dtype)
+     latent_upsampler._model_dtype = VAE_dtype
+
+     allowed_inference_steps = None
+
+     text_encoder = offload.fast_load_transformers_model(text_encoder_filepath)
+     patchifier = SymmetricPatchifier(patch_size=1)
+     tokenizer = T5Tokenizer.from_pretrained( "ckpts/T5_xxl_1.1")
+
+     # Prompt enhancement is hard-wired off; the loading code is kept for when it is enabled.
+     enhance_prompt = False
+     if enhance_prompt:
+         prompt_enhancer_image_caption_model = AutoModelForCausalLM.from_pretrained( "ckpts/Florence2", trust_remote_code=True)
+         prompt_enhancer_image_caption_processor = AutoProcessor.from_pretrained( "ckpts/Florence2", trust_remote_code=True)
+         prompt_enhancer_llm_model = offload.fast_load_transformers_model("ckpts/Llama3_2_quanto_bf16_int8.safetensors")
+         prompt_enhancer_llm_tokenizer = AutoTokenizer.from_pretrained("ckpts/Llama3_2")
+     else:
+         prompt_enhancer_image_caption_model = None
+         prompt_enhancer_image_caption_processor = None
+         prompt_enhancer_llm_model = None
+         prompt_enhancer_llm_tokenizer = None
+
+     # This branch used to store both enhancer models in a `pipe` dict that is
+     # never defined, which would raise NameError once enhancement is enabled.
+     # The enhancer models already reach the pipeline through `submodel_dict`.
+     if prompt_enhancer_image_caption_model is not None:
+         prompt_enhancer_image_caption_model._model_dtype = torch.float
+
+     # Use submodels for the pipeline
+     submodel_dict = {
+         "transformer": transformer,
+         "patchifier": patchifier,
+         "text_encoder": text_encoder,
+         "tokenizer": tokenizer,
+         "scheduler": scheduler,
+         "vae": vae,
+         "prompt_enhancer_image_caption_model": prompt_enhancer_image_caption_model,
+         "prompt_enhancer_image_caption_processor": prompt_enhancer_image_caption_processor,
+         "prompt_enhancer_llm_model": prompt_enhancer_llm_model,
+         "prompt_enhancer_llm_tokenizer": prompt_enhancer_llm_tokenizer,
+         "allowed_inference_steps": allowed_inference_steps,
+     }
+     pipeline = LTXVideoPipeline(**submodel_dict)
+     # Wrap the base pipeline so generation can run the two-pass, upsampled flow.
+     pipeline = LTXMultiScalePipeline(pipeline, latent_upsampler=latent_upsampler)
+
+     self.pipeline = pipeline
+     self.model = transformer
+     self.vae = vae
+
+
+ def generate(
+     self,
+     input_prompt: str,
+     n_prompt: str,
+     image_start = None,
+     image_end = None,
+     input_video = None,
+     sampling_steps = 50,
+     image_cond_noise_scale: float = 0.15,
+     input_media_path: Optional[str] = None,
+     strength: Optional[float] = 1.0,
+     seed: int = 42,
+     height: Optional[int] = 704,
+     width: Optional[int] = 1216,
+     frame_num: int = 81,
+     frame_rate: int = 30,
+     fit_into_canvas = True,
+     callback=None,
+     device: Optional[str] = None,
+     VAE_tile_size = None,
+     **kwargs,
+ ):
+     """Run the LTX multi-scale pipeline and return the generated frames.
+
+     Conditioning comes either from ``input_video`` (placed at frame 0; its
+     last two dimensions replace ``height``/``width``) or from
+     ``image_start`` / ``image_end``. Only the first element of
+     ``image_start`` / ``image_end`` is used: the start image is placed at
+     frame 0 and the end image at frame ``frame_num - 1``. Every conditioning
+     item is given a strength of 1.0.
+
+     Args:
+         input_prompt: Positive text prompt.
+         n_prompt: Negative text prompt.
+         image_start: Optional indexable whose first element is the start image.
+         image_end: Optional indexable whose first element is the end image.
+         input_video: Optional conditioning video tensor.
+         sampling_steps: Forwarded as both ``num_inference_steps1`` and
+             ``num_inference_steps2``.
+         image_cond_noise_scale: Forwarded to the pipeline unchanged.
+         input_media_path: Optional media loaded with ``load_media_file`` and
+             forwarded as ``media_items``.
+         strength: Forwarded to the pipeline unchanged.
+         seed: Seed for ``seed_everething`` and for the torch generator.
+         height: Requested height; recomputed when ``input_video`` or
+             ``image_start`` is given.
+         width: Requested width; recomputed like ``height``.
+         frame_num: Number of frames kept in the returned tensor.
+         frame_rate: Forwarded to the pipeline unchanged.
+         fit_into_canvas: Forwarded to ``calculate_new_dimensions``.
+         callback: Forwarded to the pipeline unchanged.
+         device: Target device; defaults to ``get_device()``.
+         VAE_tile_size: Forwarded to the pipeline unchanged.
+         **kwargs: Accepted and ignored.
+
+     Returns:
+         ``None`` when the pipeline returns ``None``. Otherwise the pipeline
+         output cropped to the unpadded size and to ``frame_num`` frames,
+         remapped with ``(x - 0.5) * 2`` and with dimension 0 squeezed.
+
+     Raises:
+         ValueError: If the pipeline config file is missing, the conditioning
+             lists are inconsistent, or the config's ``stg_mode`` is unknown.
+     """
+     num_inference_steps1 = sampling_steps
+     num_inference_steps2 = sampling_steps
+     conditioning_strengths = None
+     conditioning_media_paths = []
+     conditioning_start_frames = []
+
+     # Identity checks: these arguments may be tensors or lists, for which
+     # `!= None` is at best an indirect way of asking the same question.
+     if input_video is not None:
+         conditioning_media_paths.append(input_video)
+         conditioning_start_frames.append(0)
+         height, width = input_video.shape[-2:]
+     else:
+         if image_start is not None:
+             image_start = image_start[0]
+             frame_width, frame_height = image_start.size
+             height, width = calculate_new_dimensions(height, width, frame_height, frame_width, fit_into_canvas, 32)
+             conditioning_media_paths.append(image_start)
+             conditioning_start_frames.append(0)
+         if image_end is not None:
+             image_end = image_end[0]
+             conditioning_media_paths.append(image_end)
+             conditioning_start_frames.append(frame_num-1)
+
+     if len(conditioning_media_paths) == 0:
+         conditioning_media_paths = None
+         conditioning_start_frames = None
+
+     pipeline_config = "ltx_video/configs/ltxv-13b-0.9.7-dev.yaml"
+     # The path is relative, so it is resolved against the current working directory.
+     if not os.path.isfile(pipeline_config):
+         raise ValueError(f"Pipeline config file {pipeline_config} does not exist")
+     with open(pipeline_config, "r") as f:
+         pipeline_config = yaml.safe_load(f)
+
+     # Validate conditioning arguments
+     if conditioning_media_paths:
+         # Use default strengths of 1.0
+         if not conditioning_strengths:
+             conditioning_strengths = [1.0] * len(conditioning_media_paths)
+         if not conditioning_start_frames:
+             raise ValueError(
+                 "If `conditioning_media_paths` is provided, "
+                 "`conditioning_start_frames` must also be provided"
+             )
+         if len(conditioning_media_paths) != len(conditioning_strengths) or len(
+             conditioning_media_paths
+         ) != len(conditioning_start_frames):
+             raise ValueError(
+                 "`conditioning_media_paths`, `conditioning_strengths`, "
+                 "and `conditioning_start_frames` must have the same length"
+             )
+         if any(s < 0 or s > 1 for s in conditioning_strengths):
+             raise ValueError("All conditioning strengths must be between 0 and 1")
+         if any(f < 0 or f >= frame_num for f in conditioning_start_frames):
+             raise ValueError(
+                 f"All conditioning start frames must be between 0 and {frame_num-1}"
+             )
+
+     # Adjust dimensions to be divisible by 32 and num_frames to be (N * 8 + 1)
+     height_padded = ((height - 1) // 32 + 1) * 32
+     width_padded = ((width - 1) // 32 + 1) * 32
+     num_frames_padded = ((frame_num - 2) // 8 + 1) * 8 + 1
+
+     padding = calculate_padding(height, width, height_padded, width_padded)
+
+     logger.warning(
+         f"Padded dimensions: {height_padded}x{width_padded}x{num_frames_padded}"
+     )
+
+     # The config's prompt-enhancement threshold is not evaluated here and
+     # no `enhance_prompt` flag is forwarded to the pipeline.
+
+     seed_everething(seed)
+     device = device or get_device()
+     generator = torch.Generator(device=device).manual_seed(seed)
+
+     media_item = None
+     if input_media_path:
+         media_item = load_media_file(
+             media_path=input_media_path,
+             height=height,
+             width=width,
+             max_frames=num_frames_padded,
+             padding=padding,
+         )
+
+     conditioning_items = (
+         prepare_conditioning(
+             conditioning_media_paths=conditioning_media_paths,
+             conditioning_strengths=conditioning_strengths,
+             conditioning_start_frames=conditioning_start_frames,
+             height=height,
+             width=width,
+             num_frames=frame_num,
+             padding=padding,
+             pipeline=self.pipeline,
+         )
+         if conditioning_media_paths
+         else None
+     )
+
+     # `pop` removes the key before the config is splatted into the pipeline
+     # call and, unlike `get` followed by `del`, tolerates a config without it.
+     stg_mode = pipeline_config.pop("stg_mode", "attention_values")
+     stg_mode_key = stg_mode.lower()
+     if stg_mode_key in ("stg_av", "attention_values"):
+         skip_layer_strategy = SkipLayerStrategy.AttentionValues
+     elif stg_mode_key in ("stg_as", "attention_skip"):
+         skip_layer_strategy = SkipLayerStrategy.AttentionSkip
+     elif stg_mode_key in ("stg_r", "residual"):
+         skip_layer_strategy = SkipLayerStrategy.Residual
+     elif stg_mode_key in ("stg_t", "transformer_block"):
+         skip_layer_strategy = SkipLayerStrategy.TransformerBlock
+     else:
+         raise ValueError(f"Invalid spatiotemporal guidance mode: {stg_mode}")
+
+     # Prepare input for the pipeline
+     sample = {
+         "prompt": input_prompt,
+         "prompt_attention_mask": None,
+         "negative_prompt": n_prompt,
+         "negative_prompt_attention_mask": None,
+     }
+
+     images = self.pipeline(
+         **pipeline_config,
+         ltxv_model = self,
+         num_inference_steps1 = num_inference_steps1,
+         num_inference_steps2 = num_inference_steps2,
+         skip_layer_strategy=skip_layer_strategy,
+         generator=generator,
+         output_type="pt",
+         callback_on_step_end=None,
+         height=height_padded,
+         width=width_padded,
+         num_frames=num_frames_padded,
+         frame_rate=frame_rate,
+         **sample,
+         media_items=media_item,
+         strength=strength,
+         conditioning_items=conditioning_items,
+         is_video=True,
+         vae_per_channel_normalize=True,
+         image_cond_noise_scale=image_cond_noise_scale,
+         mixed_precision=pipeline_config.get("mixed", self.mixed_precision_transformer),
+         callback=callback,
+         VAE_tile_size = VAE_tile_size,
+         device=device,
+     )
+     if images is None:
+         return None
+
+     # Crop the padded images to the desired resolution and number of frames
+     (pad_left, pad_right, pad_top, pad_bottom) = padding
+     # Negative values become end-relative slice bounds; a zero pad must be
+     # replaced by the full extent because `x[a:0]` would be empty.
+     pad_bottom = -pad_bottom
+     pad_right = -pad_right
+     if pad_bottom == 0:
+         pad_bottom = images.shape[3]
+     if pad_right == 0:
+         pad_right = images.shape[4]
+     images = images[:, :, :frame_num, pad_top:pad_bottom, pad_left:pad_right]
+     images = images.sub_(0.5).mul_(2).squeeze(0)
+     return images
+
+
+def prepare_conditioning(
+    conditioning_media_paths: List[str],
+    conditioning_strengths: List[float],
+    conditioning_start_frames: List[int],
+    height: int,
+    width: int,
+    num_frames: int,
+    padding: tuple[int, int, int, int],
+    pipeline: LTXVideoPipeline,
+) -> Optional[List[ConditioningItem]]:
+    """Turn conditioning media into ``ConditioningItem`` objects.
+
+    The three lists are walked in lockstep. For media that is not a PIL
+    image, the frame count comes from ``get_media_num_frames`` and is trimmed
+    through ``pipeline.trim_conditioning_sequence`` when the pipeline exposes
+    such a callable; a warning is logged when frames are dropped.
+
+    Args:
+        conditioning_media_paths: Conditioning media, one entry per item.
+        conditioning_strengths: Strength of each item.
+        conditioning_start_frames: Frame index at which each item applies.
+        height: Height passed to ``load_media_file``.
+        width: Width passed to ``load_media_file``.
+        num_frames: Number of frames in the output video.
+        padding: Padding passed to ``load_media_file``.
+        pipeline: Object optionally providing ``trim_conditioning_sequence``.
+
+    Returns:
+        One ``ConditioningItem`` per input entry, in input order.
+    """
+    items = []
+    entries = zip(
+        conditioning_media_paths, conditioning_strengths, conditioning_start_frames
+    )
+    for media, weight, first_frame in entries:
+        if isinstance(media, Image.Image):
+            # A still image contributes exactly one frame; nothing to trim.
+            total_frames = kept_frames = 1
+        else:
+            total_frames = kept_frames = get_media_num_frames(media)
+            trim = getattr(pipeline, "trim_conditioning_sequence", None)
+            if callable(trim):
+                kept_frames = trim(first_frame, total_frames, num_frames)
+            if kept_frames < total_frames:
+                logger.warning(
+                    f"Trimming conditioning video {media} from {total_frames} to {kept_frames} frames."
+                )
+
+        loaded = load_media_file(
+            media_path=media,
+            height=height,
+            width=width,
+            max_frames=kept_frames,
+            padding=padding,
+            just_crop=True,
+        )
+        items.append(ConditioningItem(loaded, first_frame, weight))
+    return items
+
+
+def get_media_num_frames(media_path: str) -> int:
+    """Return how many frames a conditioning media item contains.
+
+    Args:
+        media_path: A PIL image, a tensor, or a path to a video file ending in
+            ``.mp4``, ``.avi``, ``.mov`` or ``.mkv`` (case-insensitive).
+
+    Returns:
+        1 for an image, ``shape[1]`` for a tensor, and the reader's frame
+        count for a video file.
+
+    Raises:
+        Exception: If ``media_path`` is none of the supported kinds.
+    """
+    if isinstance(media_path, Image.Image):
+        return 1
+    if torch.is_tensor(media_path):
+        # assumes dimension 1 is the frame axis — TODO confirm against callers
+        return media_path.shape[1]
+    if isinstance(media_path, str) and media_path.lower().endswith(
+        (".mp4", ".avi", ".mov", ".mkv")
+    ):
+        reader = imageio.get_reader(media_path)
+        try:
+            # No cap is applied here: this function has no `max_frames`
+            # argument, and referencing one raised NameError for every video file.
+            return reader.count_frames()
+        finally:
+            reader.close()
+    raise Exception("video format not supported")
+
+
+def load_media_file(
+    media_path: str,
+    height: int,
+    width: int,
+    max_frames: int,
+    padding: tuple[int, int, int, int],
+    just_crop: bool = False,
+) -> torch.Tensor:
+    """Load an image, tensor or video file as a media tensor.
+
+    Args:
+        media_path: A PIL image, a tensor, or a path to a video file ending in
+            ``.mp4``, ``.avi``, ``.mov`` or ``.mkv`` (case-insensitive).
+        height: Target height for images and video frames.
+        width: Target width for images and video frames.
+        max_frames: Maximum number of frames read from a video file.
+        padding: Padding passed to ``torch.nn.functional.pad`` for images and
+            video frames.
+        just_crop: Forwarded to ``load_image_to_tensor_with_resize_and_crop``.
+
+    Returns:
+        The loaded tensor. Video frames are concatenated along dimension 2.
+
+    Raises:
+        Exception: If ``media_path`` is none of the supported kinds.
+    """
+    if isinstance(media_path, Image.Image):
+        # Input image
+        media_tensor = load_image_to_tensor_with_resize_and_crop(
+            media_path, height, width, just_crop=just_crop
+        )
+        media_tensor = torch.nn.functional.pad(media_tensor, padding)
+    elif torch.is_tensor(media_path):
+        # Tensors only gain a leading dimension: `height`, `width`,
+        # `max_frames` and `padding` are not applied to them.
+        media_tensor = media_path.unsqueeze(0)
+    elif isinstance(media_path, str) and media_path.lower().endswith(
+        (".mp4", ".avi", ".mov", ".mkv")
+    ):
+        reader = imageio.get_reader(media_path)
+        try:
+            num_input_frames = min(reader.count_frames(), max_frames)
+
+            # Read and preprocess the relevant frames from the video file.
+            frames = []
+            for i in range(num_input_frames):
+                frame = Image.fromarray(reader.get_data(i))
+                frame_tensor = load_image_to_tensor_with_resize_and_crop(
+                    frame, height, width, just_crop=just_crop
+                )
+                frame_tensor = torch.nn.functional.pad(frame_tensor, padding)
+                frames.append(frame_tensor)
+        finally:
+            # Release the file handle even when a frame fails to decode.
+            reader.close()
+
+        # Stack frames along the temporal dimension
+        media_tensor = torch.cat(frames, dim=2)
+    else:
+        raise Exception("video format not supported")
+    return media_tensor
+
+
+# Script entry point: runs only when this file is executed directly.
+# NOTE(review): `main` is not defined in the visible part of this module —
+# confirm it exists before relying on running this file as a script.
+if __name__ == "__main__":
+    main()
diff --git a/ltx_video/models/__init__.py b/ltx_video/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ltx_video/models/autoencoders/__init__.py b/ltx_video/models/autoencoders/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ltx_video/models/autoencoders/causal_conv3d.py b/ltx_video/models/autoencoders/causal_conv3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..98249c2f5ffe52eead83b38476e034c4f03bdccd
--- /dev/null
+++ b/ltx_video/models/autoencoders/causal_conv3d.py
@@ -0,0 +1,63 @@
+from typing import Tuple, Union
+
+import torch
+import torch.nn as nn
+
+
+class CausalConv3d(nn.Module):
+    """3D convolution that pads the time axis by repeating edge frames.
+
+    The wrapped ``nn.Conv3d`` pads only height and width. Before it runs,
+    ``forward`` extends the frame axis (dimension 2) itself: in causal mode
+    with ``kernel_size - 1`` copies of the first frame in front, otherwise
+    with ``(kernel_size - 1) // 2`` copies of the first frame in front and of
+    the last frame behind.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size: int = 3,
+        stride: Union[int, Tuple[int]] = 1,
+        dilation: int = 1,
+        groups: int = 1,
+        spatial_padding_mode: str = "zeros",
+        **kwargs,
+    ):
+        """Create the convolution.
+
+        Args:
+            in_channels: Number of input channels.
+            out_channels: Number of output channels.
+            kernel_size: Edge length of the cubic kernel.
+            stride: Stride forwarded to ``nn.Conv3d``.
+            dilation: Dilation along the time axis; height and width use 1.
+            groups: Groups forwarded to ``nn.Conv3d``.
+            spatial_padding_mode: ``padding_mode`` for the height/width padding.
+            **kwargs: Accepted and ignored.
+        """
+        super().__init__()
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.time_kernel_size = kernel_size
+
+        spatial_pad = kernel_size // 2
+        self.conv = nn.Conv3d(
+            in_channels,
+            out_channels,
+            (kernel_size,) * 3,
+            stride=stride,
+            dilation=(dilation, 1, 1),
+            # No temporal padding here: forward() adds it by frame repetition.
+            padding=(0, spatial_pad, spatial_pad),
+            padding_mode=spatial_padding_mode,
+            groups=groups,
+        )
+
+    def forward(self, x, causal: bool = True):
+        """Pad ``x`` along dimension 2 and apply the convolution."""
+        leading = x[:, :, :1, :, :]
+        if causal:
+            front = leading.repeat((1, 1, self.time_kernel_size - 1, 1, 1))
+            pieces = (front, x)
+        else:
+            half = (self.time_kernel_size - 1) // 2
+            trailing = x[:, :, -1:, :, :]
+            pieces = (
+                leading.repeat((1, 1, half, 1, 1)),
+                x,
+                trailing.repeat((1, 1, half, 1, 1)),
+            )
+        return self.conv(torch.concatenate(pieces, dim=2))
+
+    @property
+    def weight(self):
+        """Weight tensor of the wrapped convolution."""
+        return self.conv.weight
diff --git a/ltx_video/models/autoencoders/causal_video_autoencoder.py b/ltx_video/models/autoencoders/causal_video_autoencoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..0edfe6a748335fca4e596dbe96d681bf36e4fa70
--- /dev/null
+++ b/ltx_video/models/autoencoders/causal_video_autoencoder.py
@@ -0,0 +1,1405 @@
+import json
+import os
+from functools import partial
+from types import SimpleNamespace
+from typing import Any, Mapping, Optional, Tuple, Union, List
+from pathlib import Path
+
+import torch
+import numpy as np
+from einops import rearrange
+from torch import nn
+from diffusers.utils import logging
+import torch.nn.functional as F
+from diffusers.models.embeddings import PixArtAlphaCombinedTimestepSizeEmbeddings
+from safetensors import safe_open
+
+
+from ltx_video.models.autoencoders.conv_nd_factory import make_conv_nd, make_linear_nd
+from ltx_video.models.autoencoders.pixel_norm import PixelNorm
+from ltx_video.models.autoencoders.pixel_shuffle import PixelShuffleND
+from ltx_video.models.autoencoders.vae import AutoencoderKLWrapper
+from ltx_video.models.transformers.attention import Attention
+from ltx_video.utils.diffusers_config_mapping import (
+ diffusers_and_ours_config_mapping,
+ make_hashable_key,
+ VAE_KEYS_RENAME_DICT,
+)
+
+PER_CHANNEL_STATISTICS_PREFIX = "per_channel_statistics."
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+class CausalVideoAutoencoder(AutoencoderKLWrapper):
+    """Video VAE pairing an `Encoder` with a `Decoder`, plus checkpoint loaders."""
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
+        *args,
+        **kwargs,
+    ):
+        """Build the autoencoder and load its weights from disk.
+
+        Three layouts are recognised, checked in this order:
+
+        1. a directory containing ``autoencoder.pth`` and ``config.json``
+           (plus an optional ``per_channel_statistics.json``);
+        2. any other directory, read as a diffusers layout with
+           ``vae/config.json`` and ``vae/diffusion_pytorch_model.safetensors``;
+        3. a single ``.safetensors`` file whose metadata holds a ``config``
+           JSON entry with a ``vae`` section.
+
+        Args:
+            pretrained_model_name_or_path: Directory or file to load from.
+            *args: Accepted and ignored.
+            **kwargs: Forwarded to ``cls.load_config`` for layout 1; a
+                ``torch_dtype`` entry, if present, is applied to the model
+                before the weights are loaded.
+
+        Returns:
+            The constructed model with the checkpoint loaded.
+
+        Raises:
+            ValueError: If the path matches none of the supported layouts.
+        """
+        pretrained_model_name_or_path = Path(pretrained_model_name_or_path)
+        if (
+            pretrained_model_name_or_path.is_dir()
+            and (pretrained_model_name_or_path / "autoencoder.pth").exists()
+        ):
+            config_local_path = pretrained_model_name_or_path / "config.json"
+            config = cls.load_config(config_local_path, **kwargs)
+
+            model_local_path = pretrained_model_name_or_path / "autoencoder.pth"
+            # NOTE(review): torch.load unpickles the file; only use it on
+            # checkpoints from a trusted source.
+            state_dict = torch.load(model_local_path, map_location=torch.device("cpu"))
+
+            statistics_local_path = (
+                pretrained_model_name_or_path / "per_channel_statistics.json"
+            )
+            if statistics_local_path.exists():
+                with open(statistics_local_path, "r") as file:
+                    data = json.load(file)
+                # Rows of data["data"] are transposed into one tensor per column.
+                transposed_data = list(zip(*data["data"]))
+                data_dict = {
+                    col: torch.tensor(vals)
+                    for col, vals in zip(data["columns"], transposed_data)
+                }
+                std_of_means = data_dict["std-of-means"]
+                mean_of_means = data_dict.get(
+                    "mean-of-means", torch.zeros_like(data_dict["std-of-means"])
+                )
+                state_dict[f"{PER_CHANNEL_STATISTICS_PREFIX}std-of-means"] = (
+                    std_of_means
+                )
+                state_dict[f"{PER_CHANNEL_STATISTICS_PREFIX}mean-of-means"] = (
+                    mean_of_means
+                )
+
+        elif pretrained_model_name_or_path.is_dir():
+            config_path = pretrained_model_name_or_path / "vae" / "config.json"
+            with open(config_path, "r") as f:
+                config = make_hashable_key(json.load(f))
+
+            assert config in diffusers_and_ours_config_mapping, (
+                "Provided diffusers checkpoint config for VAE is not suppported. "
+                "We only support diffusers configs found in Lightricks/LTX-Video."
+            )
+
+            config = diffusers_and_ours_config_mapping[config]
+
+            state_dict_path = (
+                pretrained_model_name_or_path
+                / "vae"
+                / "diffusion_pytorch_model.safetensors"
+            )
+
+            state_dict = {}
+            with safe_open(state_dict_path, framework="pt", device="cpu") as f:
+                for k in f.keys():
+                    state_dict[k] = f.get_tensor(k)
+            # Rename keys using VAE_KEYS_RENAME_DICT; iterate over a snapshot
+            # of the keys because the dict is modified inside the loop.
+            for key in list(state_dict.keys()):
+                new_key = key
+                for replace_key, rename_key in VAE_KEYS_RENAME_DICT.items():
+                    new_key = new_key.replace(replace_key, rename_key)
+
+                state_dict[new_key] = state_dict.pop(key)
+
+        elif pretrained_model_name_or_path.is_file() and str(
+            pretrained_model_name_or_path
+        ).endswith(".safetensors"):
+            state_dict = {}
+            with safe_open(
+                pretrained_model_name_or_path, framework="pt", device="cpu"
+            ) as f:
+                metadata = f.metadata()
+                for k in f.keys():
+                    state_dict[k] = f.get_tensor(k)
+            configs = json.loads(metadata["config"])
+            config = configs["vae"]
+
+        else:
+            # Without this branch the code below failed with an unbound
+            # `config` / `state_dict` instead of a readable error.
+            raise ValueError(
+                f"Unsupported VAE checkpoint location: {pretrained_model_name_or_path}"
+            )
+
+        video_vae = cls.from_config(config)
+        if "torch_dtype" in kwargs:
+            video_vae.to(kwargs["torch_dtype"])
+        video_vae.load_state_dict(state_dict)
+        return video_vae
+
+    @staticmethod
+    def from_config(config):
+        """Instantiate encoder, decoder and the wrapping autoencoder from a config.
+
+        Args:
+            config: Mapping with at least ``_class_name``, ``dims`` and
+                ``latent_channels``; other keys fall back to the defaults
+                below. A list-valued ``dims`` is replaced in place by a tuple.
+
+        Raises:
+            ValueError: If ``use_quant_conv`` is combined with a ``uniform``
+                or ``constant`` ``latent_log_var``.
+        """
+        assert (
+            config["_class_name"] == "CausalVideoAutoencoder"
+        ), "config must have _class_name=CausalVideoAutoencoder"
+        if isinstance(config["dims"], list):
+            config["dims"] = tuple(config["dims"])
+
+        assert config["dims"] in [2, 3, (2, 1)], "dims must be 2, 3 or (2, 1)"
+
+        double_z = config.get("double_z", True)
+        latent_log_var = config.get(
+            "latent_log_var", "per_channel" if double_z else "none"
+        )
+        use_quant_conv = config.get("use_quant_conv", True)
+        normalize_latent_channels = config.get("normalize_latent_channels", False)
+
+        if use_quant_conv and latent_log_var in ["uniform", "constant"]:
+            raise ValueError(
+                f"latent_log_var={latent_log_var} requires use_quant_conv=False"
+            )
+
+        encoder = Encoder(
+            dims=config["dims"],
+            in_channels=config.get("in_channels", 3),
+            out_channels=config["latent_channels"],
+            blocks=config.get("encoder_blocks", config.get("blocks")),
+            patch_size=config.get("patch_size", 1),
+            latent_log_var=latent_log_var,
+            norm_layer=config.get("norm_layer", "group_norm"),
+            base_channels=config.get("encoder_base_channels", 128),
+            spatial_padding_mode=config.get("spatial_padding_mode", "zeros"),
+        )
+
+        decoder = Decoder(
+            dims=config["dims"],
+            in_channels=config["latent_channels"],
+            out_channels=config.get("out_channels", 3),
+            blocks=config.get("decoder_blocks", config.get("blocks")),
+            patch_size=config.get("patch_size", 1),
+            norm_layer=config.get("norm_layer", "group_norm"),
+            causal=config.get("causal_decoder", False),
+            timestep_conditioning=config.get("timestep_conditioning", False),
+            base_channels=config.get("decoder_base_channels", 128),
+            spatial_padding_mode=config.get("spatial_padding_mode", "zeros"),
+        )
+
+        dims = config["dims"]
+        return CausalVideoAutoencoder(
+            encoder=encoder,
+            decoder=decoder,
+            latent_channels=config["latent_channels"],
+            dims=dims,
+            use_quant_conv=use_quant_conv,
+            normalize_latent_channels=normalize_latent_channels,
+        )
+
+    @property
+    def config(self):
+        """Configuration rebuilt from the live encoder/decoder attributes."""
+        return SimpleNamespace(
+            _class_name="CausalVideoAutoencoder",
+            dims=self.dims,
+            in_channels=self.encoder.conv_in.in_channels // self.encoder.patch_size**2,
+            out_channels=self.decoder.conv_out.out_channels
+            // self.decoder.patch_size**2,
+            latent_channels=self.decoder.conv_in.in_channels,
+            encoder_blocks=self.encoder.blocks_desc,
+            decoder_blocks=self.decoder.blocks_desc,
+            scaling_factor=1.0,
+            norm_layer=self.encoder.norm_layer,
+            patch_size=self.encoder.patch_size,
+            latent_log_var=self.encoder.latent_log_var,
+            use_quant_conv=self.use_quant_conv,
+            causal_decoder=self.decoder.causal,
+            timestep_conditioning=self.decoder.timestep_conditioning,
+            normalize_latent_channels=self.normalize_latent_channels,
+        )
+
+    @property
+    def is_video_supported(self):
+        """
+        Check if the model supports video inputs of shape (B, C, F, H, W). Otherwise, the model only supports 2D images.
+        """
+        return self.dims != 2
+
+    @property
+    def spatial_downscale_factor(self):
+        """Spatial reduction: 2 per spatially compressing encoder block, times the patch size."""
+        spatial_blocks = (
+            "compress_space",
+            "compress_all",
+            "compress_all_res",
+            "compress_space_res",
+        )
+        count = sum(
+            1 for block in self.encoder.blocks_desc if block[0] in spatial_blocks
+        )
+        return 2**count * self.encoder.patch_size
+
+    @property
+    def temporal_downscale_factor(self):
+        """Temporal reduction: 2 per temporally compressing encoder block."""
+        # `compress_time_res` is the residual block with a temporal stride;
+        # `compress_space_res` only strides height/width and must not count.
+        temporal_blocks = (
+            "compress_time",
+            "compress_all",
+            "compress_all_res",
+            "compress_time_res",
+        )
+        count = sum(
+            1 for block in self.encoder.blocks_desc if block[0] in temporal_blocks
+        )
+        return 2**count
+
+    def to_json_string(self) -> str:
+        """Serialise ``self.config`` to a JSON string."""
+        return json.dumps(self.config.__dict__)
+
+    def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True, assign = True):
+        """Load weights, adapting key names and registering per-channel statistics.
+
+        If any key starts with ``vae.``, only those keys are kept and the
+        prefix is stripped. Keys under ``per_channel_statistics.`` are not
+        passed to the parent loader; they are registered as the buffers
+        ``std_of_means`` and ``mean_of_means`` afterwards. ``norm`` keys whose
+        parent module does not exist in this model are dropped.
+
+        Returns:
+            The two values returned by the parent ``load_state_dict``, as a tuple.
+        """
+        if any(key.startswith("vae.") for key in state_dict.keys()):
+            # removeprefix only strips the leading "vae.", never an inner match.
+            state_dict = {
+                key.removeprefix("vae."): value
+                for key, value in state_dict.items()
+                if key.startswith("vae.")
+            }
+
+        ckpt_state_dict = {
+            key: value
+            for key, value in state_dict.items()
+            if not key.startswith(PER_CHANNEL_STATISTICS_PREFIX)
+        }
+
+        model_keys = set(name for name, _ in self.named_modules())
+
+        key_mapping = {
+            ".resnets.": ".res_blocks.",
+            "downsamplers.0": "downsample",
+            "upsamplers.0": "upsample",
+        }
+        converted_state_dict = {}
+        for key, value in ckpt_state_dict.items():
+            for k, v in key_mapping.items():
+                key = key.replace(k, v)
+
+            key_prefix = ".".join(key.split(".")[:-1])
+            if "norm" in key and key_prefix not in model_keys:
+                logger.info(
+                    f"Removing key {key} from state_dict as it is not present in the model"
+                )
+                continue
+
+            converted_state_dict[key] = value
+
+        # presumably nn.Module's (missing_keys, unexpected_keys) — confirm
+        # against AutoencoderKLWrapper
+        missing_keys, unexpected_keys = super().load_state_dict(
+            converted_state_dict, strict=strict, assign=assign
+        )
+
+        data_dict = {
+            key.removeprefix(PER_CHANNEL_STATISTICS_PREFIX): value
+            for key, value in state_dict.items()
+            if key.startswith(PER_CHANNEL_STATISTICS_PREFIX)
+        }
+        if len(data_dict) > 0:
+            self.register_buffer("std_of_means", data_dict["std-of-means"],)
+            self.register_buffer(
+                "mean_of_means",
+                data_dict.get(
+                    "mean-of-means", torch.zeros_like(data_dict["std-of-means"])
+                ),
+            )
+        return missing_keys, unexpected_keys
+
+    def last_layer(self):
+        """Return the decoder's final layer (last element if ``conv_out`` is sequential)."""
+        if hasattr(self.decoder, "conv_out"):
+            if isinstance(self.decoder.conv_out, nn.Sequential):
+                last_layer = self.decoder.conv_out[-1]
+            else:
+                last_layer = self.decoder.conv_out
+        else:
+            last_layer = self.decoder.layers[-1]
+        return last_layer
+
+    def set_use_tpu_flash_attention(self):
+        """Call ``set_use_tpu_flash_attention`` on every attention block of the decoder's mid blocks."""
+        for block in self.decoder.up_blocks:
+            if isinstance(block, UNetMidBlock3D) and block.attention_blocks:
+                for attention_block in block.attention_blocks:
+                    attention_block.set_use_tpu_flash_attention()
+
+
+class Encoder(nn.Module):
+    r"""
+    The `Encoder` layer of a variational autoencoder that encodes its input into a latent representation.
+
+    Args:
+        dims (`int` or `Tuple[int, int]`, *optional*, defaults to 3):
+            The number of dimensions to use in convolutions.
+        in_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        out_channels (`int`, *optional*, defaults to 3):
+            The number of output (latent) channels.
+        blocks (`List[Tuple[str, int | dict]]`, *optional*, defaults to `[("res_x", 1)]`):
+            The blocks to use. Each block is a tuple of the block name and either the number of layers
+            or a dict of block parameters (`num_layers`, `multiplier`).
+        base_channels (`int`, *optional*, defaults to 128):
+            The number of output channels for the first convolutional layer.
+        norm_num_groups (`int`, *optional*, defaults to 32):
+            The number of groups for normalization.
+        patch_size (`int`, *optional*, defaults to 1):
+            The patch size to use. Should be a power of 2.
+        norm_layer (`str`, *optional*, defaults to `group_norm`):
+            The normalization layer to use. Can be `group_norm`, `pixel_norm` or `layer_norm`.
+        latent_log_var (`str`, *optional*, defaults to `per_channel`):
+            How the log variance is produced. Can be either `per_channel`, `uniform`, `constant` or `none`.
+        spatial_padding_mode (`str`, *optional*, defaults to `zeros`):
+            Padding mode forwarded to every convolution built here.
+    """
+
+    def __init__(
+        self,
+        dims: Union[int, Tuple[int, int]] = 3,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        blocks: List[Tuple[str, int | dict]] = [("res_x", 1)],
+        base_channels: int = 128,
+        norm_num_groups: int = 32,
+        patch_size: Union[int, Tuple[int]] = 1,
+        norm_layer: str = "group_norm",  # group_norm, pixel_norm, layer_norm
+        latent_log_var: str = "per_channel",
+        spatial_padding_mode: str = "zeros",
+    ):
+        super().__init__()
+        self.patch_size = patch_size
+        self.norm_layer = norm_layer
+        self.latent_channels = out_channels
+        self.latent_log_var = latent_log_var
+        self.blocks_desc = blocks
+
+        in_channels = in_channels * patch_size**2
+        output_channel = base_channels
+
+        self.conv_in = make_conv_nd(
+            dims=dims,
+            in_channels=in_channels,
+            out_channels=output_channel,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            causal=True,
+            spatial_padding_mode=spatial_padding_mode,
+        )
+
+        # Stride (time, height, width) of each strided-convolution block.
+        conv_strides = {
+            "compress_time": (2, 1, 1),
+            "compress_space": (1, 2, 2),
+            "compress_all": (2, 2, 2),
+            "compress_all_x_y": (2, 2, 2),
+        }
+        # Stride of each SpaceToDepthDownsample block.
+        residual_strides = {
+            "compress_all_res": (2, 2, 2),
+            "compress_space_res": (1, 2, 2),
+            "compress_time_res": (2, 1, 1),
+        }
+        # Blocks that scale the channel count by `multiplier` (default 2).
+        widening_blocks = {"res_x_y", "compress_all_x_y", *residual_strides}
+
+        self.down_blocks = nn.ModuleList([])
+
+        for block_name, block_params in blocks:
+            input_channel = output_channel
+            if isinstance(block_params, int):
+                block_params = {"num_layers": block_params}
+
+            if block_name in widening_blocks:
+                output_channel = block_params.get("multiplier", 2) * output_channel
+
+            if block_name == "res_x":
+                block = UNetMidBlock3D(
+                    dims=dims,
+                    in_channels=input_channel,
+                    num_layers=block_params["num_layers"],
+                    resnet_eps=1e-6,
+                    resnet_groups=norm_num_groups,
+                    norm_layer=norm_layer,
+                    spatial_padding_mode=spatial_padding_mode,
+                )
+            elif block_name == "res_x_y":
+                block = ResnetBlock3D(
+                    dims=dims,
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    eps=1e-6,
+                    groups=norm_num_groups,
+                    norm_layer=norm_layer,
+                    spatial_padding_mode=spatial_padding_mode,
+                )
+            elif block_name in conv_strides:
+                block = make_conv_nd(
+                    dims=dims,
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    kernel_size=3,
+                    stride=conv_strides[block_name],
+                    causal=True,
+                    spatial_padding_mode=spatial_padding_mode,
+                )
+            elif block_name in residual_strides:
+                block = SpaceToDepthDownsample(
+                    dims=dims,
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    stride=residual_strides[block_name],
+                    spatial_padding_mode=spatial_padding_mode,
+                )
+            else:
+                raise ValueError(f"unknown block: {block_name}")
+
+            self.down_blocks.append(block)
+
+        # out
+        if norm_layer == "group_norm":
+            self.conv_norm_out = nn.GroupNorm(
+                num_channels=output_channel, num_groups=norm_num_groups, eps=1e-6
+            )
+        elif norm_layer == "pixel_norm":
+            self.conv_norm_out = PixelNorm()
+        elif norm_layer == "layer_norm":
+            self.conv_norm_out = LayerNorm(output_channel, eps=1e-6)
+        else:
+            # Fail at construction; otherwise `conv_norm_out` is missing and
+            # the first forward() dies with an AttributeError.
+            raise ValueError(f"Invalid norm_layer: {norm_layer}")
+
+        self.conv_act = nn.SiLU()
+
+        if latent_log_var == "per_channel":
+            conv_out_channels = 2 * out_channels
+        elif latent_log_var in ("uniform", "constant"):
+            # One extra channel carries the log variance.
+            conv_out_channels = out_channels + 1
+        elif latent_log_var == "none":
+            conv_out_channels = out_channels
+        else:
+            raise ValueError(f"Invalid latent_log_var: {latent_log_var}")
+        self.conv_out = make_conv_nd(
+            dims,
+            output_channel,
+            conv_out_channels,
+            3,
+            padding=1,
+            causal=True,
+            spatial_padding_mode=spatial_padding_mode,
+        )
+
+        self.gradient_checkpointing = False
+
+    def forward(self, sample: torch.FloatTensor) -> torch.FloatTensor:
+        r"""The forward method of the `Encoder` class."""
+
+        sample = patchify(sample, patch_size_hw=self.patch_size, patch_size_t=1)
+        sample = self.conv_in(sample)
+
+        use_checkpoint = self.gradient_checkpointing and self.training
+        for down_block in self.down_blocks:
+            if use_checkpoint:
+                # checkpoint() takes the callable AND its inputs; calling it
+                # with the block alone ran the block with no arguments.
+                sample = torch.utils.checkpoint.checkpoint(
+                    down_block, sample, use_reentrant=False
+                )
+            else:
+                sample = down_block(sample)
+
+        sample = self.conv_norm_out(sample)
+        sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+
+        if self.latent_log_var == "uniform":
+            num_dims = sample.dim()
+            if num_dims not in (4, 5):
+                raise ValueError(f"Invalid input shape: {sample.shape}")
+            # Repeat the last channel along the channel axis so the output
+            # gains `channels - 2` copies of it; works for (B, C, H, W) and
+            # (B, C, F, H, W).
+            last_channel = sample[:, -1:, ...]
+            repeats = (1, sample.shape[1] - 2) + (1,) * (num_dims - 2)
+            sample = torch.cat([sample, last_channel.repeat(*repeats)], dim=1)
+        elif self.latent_log_var == "constant":
+            sample = sample[:, :-1, ...]
+            approx_ln_0 = (
+                -30
+            )  # this is the minimal clamp value in DiagonalGaussianDistribution objects
+            sample = torch.cat(
+                [sample, torch.ones_like(sample, device=sample.device) * approx_ln_0],
+                dim=1,
+            )
+
+        return sample
+
+
+class Decoder(nn.Module):
+ r"""
+ The `Decoder` layer of a variational autoencoder that decodes its latent representation into an output sample.
+
+ Args:
+ dims (`int` or `Tuple[int, int]`, *optional*, defaults to 3):
+ The number of dimensions to use in convolutions.
+ in_channels (`int`, *optional*, defaults to 3):
+ The number of input channels.
+ out_channels (`int`, *optional*, defaults to 3):
+ The number of output channels.
+ blocks (`List[Tuple[str, int]]`, *optional*, defaults to `[("res_x", 1)]`):
+ The blocks to use. Each block is a tuple of the block name and the number of layers.
+ base_channels (`int`, *optional*, defaults to 128):
+ The number of output channels for the first convolutional layer.
+ norm_num_groups (`int`, *optional*, defaults to 32):
+ The number of groups for normalization.
+ patch_size (`int`, *optional*, defaults to 1):
+ The patch size to use. Should be a power of 2.
+ norm_layer (`str`, *optional*, defaults to `group_norm`):
+ The normalization layer to use. Can be either `group_norm` or `pixel_norm`.
+ causal (`bool`, *optional*, defaults to `True`):
+ Whether to use causal convolutions or not.
+ """
+
+ def __init__(
+ self,
+ dims,
+ in_channels: int = 3,
+ out_channels: int = 3,
+ blocks: List[Tuple[str, int | dict]] = [("res_x", 1)],
+ base_channels: int = 128,
+ layers_per_block: int = 2,
+ norm_num_groups: int = 32,
+ patch_size: int = 1,
+ norm_layer: str = "group_norm",
+ causal: bool = True,
+ timestep_conditioning: bool = False,
+ spatial_padding_mode: str = "zeros",
+ ):
+ super().__init__()
+ self.patch_size = patch_size
+ self.layers_per_block = layers_per_block
+ out_channels = out_channels * patch_size**2
+ self.causal = causal
+ self.blocks_desc = blocks
+
+ # Compute output channel to be product of all channel-multiplier blocks
+ output_channel = base_channels
+ for block_name, block_params in list(reversed(blocks)):
+ block_params = block_params if isinstance(block_params, dict) else {}
+ if block_name == "res_x_y":
+ output_channel = output_channel * block_params.get("multiplier", 2)
+ if block_name == "compress_all":
+ output_channel = output_channel * block_params.get("multiplier", 1)
+
+ self.conv_in = make_conv_nd(
+ dims,
+ in_channels,
+ output_channel,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ causal=True,
+ spatial_padding_mode=spatial_padding_mode,
+ )
+
+ self.up_blocks = nn.ModuleList([])
+
+ for block_name, block_params in list(reversed(blocks)):
+ input_channel = output_channel
+ if isinstance(block_params, int):
+ block_params = {"num_layers": block_params}
+
+ if block_name == "res_x":
+ block = UNetMidBlock3D(
+ dims=dims,
+ in_channels=input_channel,
+ num_layers=block_params["num_layers"],
+ resnet_eps=1e-6,
+ resnet_groups=norm_num_groups,
+ norm_layer=norm_layer,
+ inject_noise=block_params.get("inject_noise", False),
+ timestep_conditioning=timestep_conditioning,
+ spatial_padding_mode=spatial_padding_mode,
+ )
+ elif block_name == "attn_res_x":
+ block = UNetMidBlock3D(
+ dims=dims,
+ in_channels=input_channel,
+ num_layers=block_params["num_layers"],
+ resnet_groups=norm_num_groups,
+ norm_layer=norm_layer,
+ inject_noise=block_params.get("inject_noise", False),
+ timestep_conditioning=timestep_conditioning,
+ attention_head_dim=block_params["attention_head_dim"],
+ spatial_padding_mode=spatial_padding_mode,
+ )
+ elif block_name == "res_x_y":
+ output_channel = output_channel // block_params.get("multiplier", 2)
+ block = ResnetBlock3D(
+ dims=dims,
+ in_channels=input_channel,
+ out_channels=output_channel,
+ eps=1e-6,
+ groups=norm_num_groups,
+ norm_layer=norm_layer,
+ inject_noise=block_params.get("inject_noise", False),
+ timestep_conditioning=False,
+ spatial_padding_mode=spatial_padding_mode,
+ )
+ elif block_name == "compress_time":
+ block = DepthToSpaceUpsample(
+ dims=dims,
+ in_channels=input_channel,
+ stride=(2, 1, 1),
+ spatial_padding_mode=spatial_padding_mode,
+ )
+ elif block_name == "compress_space":
+ block = DepthToSpaceUpsample(
+ dims=dims,
+ in_channels=input_channel,
+ stride=(1, 2, 2),
+ spatial_padding_mode=spatial_padding_mode,
+ )
+ elif block_name == "compress_all":
+ output_channel = output_channel // block_params.get("multiplier", 1)
+ block = DepthToSpaceUpsample(
+ dims=dims,
+ in_channels=input_channel,
+ stride=(2, 2, 2),
+ residual=block_params.get("residual", False),
+ out_channels_reduction_factor=block_params.get("multiplier", 1),
+ spatial_padding_mode=spatial_padding_mode,
+ )
+ else:
+ raise ValueError(f"unknown layer: {block_name}")
+
+ self.up_blocks.append(block)
+
+ if norm_layer == "group_norm":
+ self.conv_norm_out = nn.GroupNorm(
+ num_channels=output_channel, num_groups=norm_num_groups, eps=1e-6
+ )
+ elif norm_layer == "pixel_norm":
+ self.conv_norm_out = PixelNorm()
+ elif norm_layer == "layer_norm":
+ self.conv_norm_out = LayerNorm(output_channel, eps=1e-6)
+
+ self.conv_act = nn.SiLU()
+ self.conv_out = make_conv_nd(
+ dims,
+ output_channel,
+ out_channels,
+ 3,
+ padding=1,
+ causal=True,
+ spatial_padding_mode=spatial_padding_mode,
+ )
+
+ self.gradient_checkpointing = False
+
+ self.timestep_conditioning = timestep_conditioning
+
+ if timestep_conditioning:
+ self.timestep_scale_multiplier = nn.Parameter(
+ torch.tensor(1000.0, dtype=torch.float32)
+ )
+ self.last_time_embedder = PixArtAlphaCombinedTimestepSizeEmbeddings(
+ output_channel * 2, 0
+ )
+ self.last_scale_shift_table = nn.Parameter(
+ torch.randn(2, output_channel) / output_channel**0.5
+ )
+
+ def forward(
+ self,
+ sample: torch.FloatTensor,
+ target_shape,
+ timestep: Optional[torch.Tensor] = None,
+ ) -> torch.FloatTensor:
+ r"""The forward method of the `Decoder` class."""
+ assert target_shape is not None, "target_shape must be provided"
+ batch_size = sample.shape[0]
+
+ sample = self.conv_in(sample, causal=self.causal)
+
+ upscale_dtype = next(iter(self.up_blocks.parameters())).dtype
+
+ checkpoint_fn = (
+ partial(torch.utils.checkpoint.checkpoint, use_reentrant=False)
+ if self.gradient_checkpointing and self.training
+ else lambda x: x
+ )
+
+ sample = sample.to(upscale_dtype)
+
+ if self.timestep_conditioning:
+ assert (
+ timestep is not None
+ ), "should pass timestep with timestep_conditioning=True"
+ scaled_timestep = timestep * self.timestep_scale_multiplier
+
+ for up_block in self.up_blocks:
+ if self.timestep_conditioning and isinstance(up_block, UNetMidBlock3D):
+ sample = checkpoint_fn(up_block)(
+ sample, causal=self.causal, timestep=scaled_timestep
+ )
+ else:
+ sample = checkpoint_fn(up_block)(sample, causal=self.causal)
+
+ sample = self.conv_norm_out(sample)
+
+ if self.timestep_conditioning:
+ embedded_timestep = self.last_time_embedder(
+ timestep=scaled_timestep.flatten(),
+ resolution=None,
+ aspect_ratio=None,
+ batch_size=sample.shape[0],
+ hidden_dtype=sample.dtype,
+ )
+ embedded_timestep = embedded_timestep.view(
+ batch_size, embedded_timestep.shape[-1], 1, 1, 1
+ )
+ ada_values = self.last_scale_shift_table[
+ None, ..., None, None, None
+ ] + embedded_timestep.reshape(
+ batch_size,
+ 2,
+ -1,
+ embedded_timestep.shape[-3],
+ embedded_timestep.shape[-2],
+ embedded_timestep.shape[-1],
+ )
+ shift, scale = ada_values.unbind(dim=1)
+ sample = sample * (1 + scale) + shift
+
+ sample = self.conv_act(sample)
+ sample = self.conv_out(sample, causal=self.causal)
+
+ sample = unpatchify(sample, patch_size_hw=self.patch_size, patch_size_t=1)
+
+ return sample
+
+
class UNetMidBlock3D(nn.Module):
    """
    A 3D UNet mid-block: a stack of residual blocks, optionally interleaved
    with self-attention layers.

    Args:
        dims: Convolution dimensionality forwarded to every `ResnetBlock3D`.
        in_channels (`int`): The number of input (and output) channels.
        dropout (`float`, *optional*, defaults to 0.0): The dropout rate.
        num_layers (`int`, *optional*, defaults to 1): The number of residual blocks.
        resnet_eps (`float`, *optional*, defaults to 1e-6): The epsilon value for the resnet blocks.
        resnet_groups (`int`, *optional*, defaults to 32):
            The number of groups for the group normalization layers of the resnet blocks.
            When `None`, `min(in_channels // 4, 32)` is used.
        norm_layer (`str`, *optional*, defaults to `group_norm`):
            The normalization layer name forwarded to the resnet blocks.
        inject_noise (`bool`, *optional*, defaults to `False`):
            Whether to inject noise into the hidden states.
        timestep_conditioning (`bool`, *optional*, defaults to `False`):
            Whether to condition the hidden states on the timestep.
        attention_head_dim (`int`, *optional*, defaults to -1):
            The dimension of each attention head. Attention is only used when this is positive.
        spatial_padding_mode (`str`, *optional*, defaults to `zeros`):
            Padding mode forwarded to the resnet blocks.

    Returns:
        `torch.FloatTensor`: The output of the last residual (or attention) layer. The attention
        path reshapes to `(batch_size, in_channels, frames, height, width)`.
    """

    def __init__(
        self,
        dims: Union[int, Tuple[int, int]],
        in_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        resnet_eps: float = 1e-6,
        resnet_groups: int = 32,
        norm_layer: str = "group_norm",
        inject_noise: bool = False,
        timestep_conditioning: bool = False,
        attention_head_dim: int = -1,
        spatial_padding_mode: str = "zeros",
    ):
        super().__init__()
        if resnet_groups is None:
            resnet_groups = min(in_channels // 4, 32)
        self.timestep_conditioning = timestep_conditioning

        if timestep_conditioning:
            # Embedding width is 4x the channel count; the resnet blocks split
            # it into four per-channel modulation vectors.
            self.time_embedder = PixArtAlphaCombinedTimestepSizeEmbeddings(
                in_channels * 4, 0
            )

        resnet_kwargs = dict(
            dims=dims,
            in_channels=in_channels,
            out_channels=in_channels,
            eps=resnet_eps,
            groups=resnet_groups,
            dropout=dropout,
            norm_layer=norm_layer,
            inject_noise=inject_noise,
            timestep_conditioning=timestep_conditioning,
            spatial_padding_mode=spatial_padding_mode,
        )
        self.res_blocks = nn.ModuleList(
            [ResnetBlock3D(**resnet_kwargs) for _ in range(num_layers)]
        )

        self.attention_blocks = None

        if attention_head_dim > 0:
            if attention_head_dim > in_channels:
                raise ValueError(
                    "attention_head_dim must be less than or equal to in_channels"
                )

            attention_kwargs = dict(
                query_dim=in_channels,
                heads=in_channels // attention_head_dim,
                dim_head=attention_head_dim,
                bias=True,
                out_bias=True,
                qk_norm="rms_norm",
                residual_connection=True,
            )
            self.attention_blocks = nn.ModuleList(
                [Attention(**attention_kwargs) for _ in range(num_layers)]
            )

    @staticmethod
    def _apply_attention(attention, hidden_states):
        """Flatten (frames, height, width) into a token axis, attend, and restore the 5-D layout."""
        n_batch, n_channels, n_frames, height, width = hidden_states.shape
        tokens = hidden_states.view(
            n_batch, n_channels, n_frames * height * width
        ).transpose(1, 2)

        mask = None
        pad_len = 0
        if attention.use_tpu_flash_attention:
            # Pad the sequence axis to a multiple of the flash-attention block size.
            seq_len = tokens.shape[1]
            block_k_major = 512
            pad_len = (block_k_major - seq_len % block_k_major) % block_k_major
            if pad_len > 0:
                tokens = F.pad(tokens, (0, 0, 0, pad_len), "constant", 0)

            # Ones over the real tokens, zeros over the padded tail.
            mask = torch.ones(
                (tokens.shape[0], seq_len),
                device=tokens.device,
                dtype=tokens.dtype,
            )
            if pad_len > 0:
                mask = F.pad(mask, (0, pad_len), "constant", 0)

        tokens = attention(tokens, attention_mask=mask)

        if pad_len > 0:
            # Drop the padded tokens again.
            tokens = tokens[:, :-pad_len, :]

        return tokens.transpose(-1, -2).reshape(
            n_batch, n_channels, n_frames, height, width
        )

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        causal: bool = True,
        timestep: Optional[torch.Tensor] = None,
    ) -> torch.FloatTensor:
        timestep_embed = None
        if self.timestep_conditioning:
            assert (
                timestep is not None
            ), "should pass timestep with timestep_conditioning=True"
            n_batch = hidden_states.shape[0]
            embedding = self.time_embedder(
                timestep=timestep.flatten(),
                resolution=None,
                aspect_ratio=None,
                batch_size=n_batch,
                hidden_dtype=hidden_states.dtype,
            )
            timestep_embed = embedding.view(n_batch, embedding.shape[-1], 1, 1, 1)

        if not self.attention_blocks:
            # Plain residual stack.
            for resnet in self.res_blocks:
                hidden_states = resnet(
                    hidden_states, causal=causal, timestep=timestep_embed
                )
            return hidden_states

        for resnet, attention in zip(self.res_blocks, self.attention_blocks):
            hidden_states = resnet(
                hidden_states, causal=causal, timestep=timestep_embed
            )
            hidden_states = self._apply_attention(attention, hidden_states)

        return hidden_states
+
+
class SpaceToDepthDownsample(nn.Module):
    """Downsample by folding `stride`-sized blocks into channels, with an averaged skip path.

    The conv branch and a parameter-free skip branch are both rearranged with
    the same space-to-depth pattern and then summed.
    """

    def __init__(self, dims, in_channels, out_channels, stride, spatial_padding_mode):
        super().__init__()
        self.stride = stride
        stride_volume = np.prod(stride)
        # Number of folded skip channels averaged into each output channel.
        self.group_size = in_channels * stride_volume // out_channels
        self.conv = make_conv_nd(
            dims=dims,
            in_channels=in_channels,
            # Folding multiplies channels by the stride volume, so the conv
            # emits that much fewer.
            out_channels=out_channels // stride_volume,
            kernel_size=3,
            stride=1,
            causal=True,
            spatial_padding_mode=spatial_padding_mode,
        )

    def _space_to_depth(self, tensor):
        """Move each (p1, p2, p3) block of the last three axes into the channel axis."""
        return rearrange(
            tensor,
            "b c (d p1) (h p2) (w p3) -> b (c p1 p2 p3) d h w",
            p1=self.stride[0],
            p2=self.stride[1],
            p3=self.stride[2],
        )

    def forward(self, x, causal: bool = True):
        if self.stride[0] == 2:
            # Duplicate the first frame so the frame axis can be folded.
            first_frame = x[:, :, :1, :, :]
            x = torch.cat([first_frame, x], dim=2)

        # Skip connection: fold, then average groups of channels.
        shortcut = self._space_to_depth(x)
        shortcut = rearrange(
            shortcut, "b (c g) d h w -> b c g d h w", g=self.group_size
        )
        shortcut = shortcut.mean(dim=2)

        # Conv branch: convolve at full resolution, then fold.
        folded = self._space_to_depth(self.conv(x, causal=causal))

        return folded + shortcut
+
+
class DepthToSpaceUpsample(nn.Module):
    """Upsample with a conv followed by `PixelShuffleND`, plus an optional residual path.

    When the temporal stride is 2, the first output frame is dropped from both
    the conv branch and the residual branch.
    """

    def __init__(
        self,
        dims,
        in_channels,
        stride,
        residual=False,
        out_channels_reduction_factor=1,
        spatial_padding_mode="zeros",
    ):
        super().__init__()
        self.stride = stride
        # Channel count produced by the conv, before the shuffle redistributes it.
        conv_channels = np.prod(stride) * in_channels // out_channels_reduction_factor
        self.out_channels = conv_channels
        self.conv = make_conv_nd(
            dims=dims,
            in_channels=in_channels,
            out_channels=self.out_channels,
            kernel_size=3,
            stride=1,
            causal=True,
            spatial_padding_mode=spatial_padding_mode,
        )
        self.pixel_shuffle = PixelShuffleND(dims=dims, upscale_factors=stride)
        self.residual = residual
        self.out_channels_reduction_factor = out_channels_reduction_factor

    def forward(self, x, causal: bool = True):
        drops_first_frame = self.stride[0] == 2

        skip = None
        if self.residual:
            # Shuffle the raw input and tile its channels to match the conv branch.
            skip = self.pixel_shuffle(x)
            num_repeat = np.prod(self.stride) // self.out_channels_reduction_factor
            skip = skip.repeat(1, num_repeat, 1, 1, 1)
            if drops_first_frame:
                skip = skip[:, :, 1:, :, :]

        upsampled = self.pixel_shuffle(self.conv(x, causal=causal))
        if drops_first_frame:
            upsampled = upsampled[:, :, 1:, :, :]

        if skip is not None:
            upsampled = upsampled + skip
        return upsampled
+
+
class LayerNorm(nn.Module):
    """`nn.LayerNorm` over the channel axis of a `(b, c, d, h, w)` tensor."""

    def __init__(self, dim, eps, elementwise_affine=True) -> None:
        super().__init__()
        self.norm = nn.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine)

    def forward(self, x):
        # nn.LayerNorm normalizes the trailing axis, so move channels last,
        # normalize, and restore the channels-first layout.
        channels_last = rearrange(x, "b c d h w -> b d h w c")
        normalized = self.norm(channels_last)
        return rearrange(normalized, "b d h w c -> b c d h w")
+
+
class ResnetBlock3D(nn.Module):
    r"""
    A Resnet block: norm -> SiLU -> conv, twice, added to a (possibly projected) shortcut.

    Parameters:
        dims: Convolution dimensionality forwarded to `make_conv_nd` / `make_linear_nd`.
        in_channels (`int`): The number of channels in the input.
        out_channels (`int`, *optional*, default to be `None`):
            The number of output channels. If None, same as `in_channels`.
        dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
        groups (`int`, *optional*, default to `32`): The number of groups for the group-norm layers.
        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
        norm_layer (`str`, *optional*, defaults to `group_norm`):
            One of `group_norm`, `pixel_norm` or `layer_norm`.
        inject_noise (`bool`, *optional*, defaults to `False`):
            Whether to add learned-scale spatial noise after each convolution.
        timestep_conditioning (`bool`, *optional*, defaults to `False`):
            Whether to modulate the normalized activations with a timestep embedding.
            The modulation table is sized with `in_channels` and is also applied after
            `norm2`, so this presumably requires `in_channels == out_channels`.
        spatial_padding_mode (`str`, *optional*, defaults to `zeros`):
            Padding mode forwarded to the convolutions.
    """

    def __init__(
        self,
        dims: Union[int, Tuple[int, int]],
        in_channels: int,
        out_channels: Optional[int] = None,
        dropout: float = 0.0,
        groups: int = 32,
        eps: float = 1e-6,
        norm_layer: str = "group_norm",
        inject_noise: bool = False,
        timestep_conditioning: bool = False,
        spatial_padding_mode: str = "zeros",
    ):
        super().__init__()
        self.in_channels = in_channels
        out_channels = in_channels if out_channels is None else out_channels
        self.out_channels = out_channels
        self.inject_noise = inject_noise

        if norm_layer == "group_norm":
            self.norm1 = nn.GroupNorm(
                num_groups=groups, num_channels=in_channels, eps=eps, affine=True
            )
        elif norm_layer == "pixel_norm":
            self.norm1 = PixelNorm()
        elif norm_layer == "layer_norm":
            self.norm1 = LayerNorm(in_channels, eps=eps, elementwise_affine=True)

        self.non_linearity = nn.SiLU()

        self.conv1 = make_conv_nd(
            dims,
            in_channels,
            out_channels,
            kernel_size=3,
            stride=1,
            padding=1,
            causal=True,
            spatial_padding_mode=spatial_padding_mode,
        )

        if inject_noise:
            # The noise is added to the output of conv1, which has
            # `out_channels` channels; sizing this with `in_channels` failed
            # to broadcast whenever the two differ.
            self.per_channel_scale1 = nn.Parameter(torch.zeros((out_channels, 1, 1)))

        if norm_layer == "group_norm":
            self.norm2 = nn.GroupNorm(
                num_groups=groups, num_channels=out_channels, eps=eps, affine=True
            )
        elif norm_layer == "pixel_norm":
            self.norm2 = PixelNorm()
        elif norm_layer == "layer_norm":
            self.norm2 = LayerNorm(out_channels, eps=eps, elementwise_affine=True)

        self.dropout = torch.nn.Dropout(dropout)

        self.conv2 = make_conv_nd(
            dims,
            out_channels,
            out_channels,
            kernel_size=3,
            stride=1,
            padding=1,
            causal=True,
            spatial_padding_mode=spatial_padding_mode,
        )

        if inject_noise:
            # Same reasoning as per_channel_scale1: applied to conv2's output.
            self.per_channel_scale2 = nn.Parameter(torch.zeros((out_channels, 1, 1)))

        # A 1x1 projection (preceded by a layer norm) is only needed when the
        # shortcut has to change its channel count.
        self.conv_shortcut = (
            make_linear_nd(
                dims=dims, in_channels=in_channels, out_channels=out_channels
            )
            if in_channels != out_channels
            else nn.Identity()
        )

        self.norm3 = (
            LayerNorm(in_channels, eps=eps, elementwise_affine=True)
            if in_channels != out_channels
            else nn.Identity()
        )

        self.timestep_conditioning = timestep_conditioning

        if timestep_conditioning:
            # Rows are (shift1, scale1, shift2, scale2).
            self.scale_shift_table = nn.Parameter(
                torch.randn(4, in_channels) / in_channels**0.5
            )

    def _feed_spatial_noise(
        self, hidden_states: torch.FloatTensor, per_channel_scale: torch.FloatTensor
    ) -> torch.FloatTensor:
        """Add one (H, W) Gaussian noise map, scaled per channel, to `hidden_states`.

        The same noise map is broadcast over the batch and frame axes.
        """
        spatial_shape = hidden_states.shape[-2:]
        device = hidden_states.device
        dtype = hidden_states.dtype

        # similar to the "explicit noise inputs" method in style-gan
        spatial_noise = torch.randn(spatial_shape, device=device, dtype=dtype)[None]
        scaled_noise = (spatial_noise * per_channel_scale)[None, :, None, ...]
        hidden_states = hidden_states + scaled_noise

        return hidden_states

    def forward(
        self,
        input_tensor: torch.FloatTensor,
        causal: bool = True,
        timestep: Optional[torch.Tensor] = None,
    ) -> torch.FloatTensor:
        """Apply the residual block.

        Args:
            input_tensor: Input features with `in_channels` channels.
            causal: Forwarded to both convolutions.
            timestep: Timestep embedding, required when the block was built
                with `timestep_conditioning=True`. It is reshaped into four
                per-channel modulation tensors, so its last three axes must
                broadcast against the features.

        Returns:
            The shortcut (projected if channel counts differ) plus the residual branch.
        """
        hidden_states = input_tensor
        batch_size = hidden_states.shape[0]

        hidden_states = self.norm1(hidden_states)
        if self.timestep_conditioning:
            assert (
                timestep is not None
            ), "should pass timestep with timestep_conditioning=True"
            ada_values = self.scale_shift_table[
                None, ..., None, None, None
            ] + timestep.reshape(
                batch_size,
                4,
                -1,
                timestep.shape[-3],
                timestep.shape[-2],
                timestep.shape[-1],
            )
            shift1, scale1, shift2, scale2 = ada_values.unbind(dim=1)

            hidden_states = hidden_states * (1 + scale1) + shift1

        hidden_states = self.non_linearity(hidden_states)

        hidden_states = self.conv1(hidden_states, causal=causal)

        if self.inject_noise:
            hidden_states = self._feed_spatial_noise(
                hidden_states, self.per_channel_scale1
            )

        hidden_states = self.norm2(hidden_states)

        if self.timestep_conditioning:
            hidden_states = hidden_states * (1 + scale2) + shift2

        hidden_states = self.non_linearity(hidden_states)

        hidden_states = self.dropout(hidden_states)

        hidden_states = self.conv2(hidden_states, causal=causal)

        if self.inject_noise:
            hidden_states = self._feed_spatial_noise(
                hidden_states, self.per_channel_scale2
            )

        # Shortcut path: identity when channel counts match, otherwise
        # layer norm followed by a 1x1 projection.
        input_tensor = self.norm3(input_tensor)
        input_tensor = self.conv_shortcut(input_tensor)

        output_tensor = input_tensor + hidden_states

        return output_tensor
+
+
def patchify(x, patch_size_hw, patch_size_t=1):
    """Fold spatial (and, for 5-D input, temporal) patches into the channel axis.

    Returns `x` unchanged when both patch sizes are 1. Accepts 4-D
    `(b, c, h, w)` or 5-D `(b, c, f, h, w)` tensors; any other rank raises
    `ValueError`.
    """
    if patch_size_hw == 1 and patch_size_t == 1:
        return x

    rank = x.dim()
    if rank == 4:
        return rearrange(
            x, "b c (h q) (w r) -> b (c r q) h w", q=patch_size_hw, r=patch_size_hw
        )
    if rank == 5:
        return rearrange(
            x,
            "b c (f p) (h q) (w r) -> b (c p r q) f h w",
            p=patch_size_t,
            q=patch_size_hw,
            r=patch_size_hw,
        )
    raise ValueError(f"Invalid input shape: {x.shape}")
+
+
def unpatchify(x, patch_size_hw, patch_size_t=1):
    """Inverse of `patchify`: unfold channel-packed patches back onto the grid.

    Returns `x` unchanged when both patch sizes are 1. Tensors that are
    neither 4-D nor 5-D are also returned unchanged.
    """
    if patch_size_hw == 1 and patch_size_t == 1:
        return x

    rank = x.dim()
    if rank == 4:
        return rearrange(
            x, "b (c r q) h w -> b c (h q) (w r)", q=patch_size_hw, r=patch_size_hw
        )
    if rank == 5:
        return rearrange(
            x,
            "b (c p r q) f h w -> b c (f p) (h q) (w r)",
            p=patch_size_t,
            q=patch_size_hw,
            r=patch_size_hw,
        )
    # Other ranks fall through untouched.
    return x
+
+
def create_video_autoencoder_demo_config(
    latent_channels: int = 64,
):
    """Build the config dict used by the autoencoder demo.

    Args:
        latent_channels: Value stored under the ``"latent_channels"`` key.

    Returns:
        A dict with the class name, encoder/decoder block lists (each entry a
        ``(block_name, params)`` tuple) and the remaining demo settings.
    """
    # (block name, parameter key, parameter value) for each encoder stage.
    encoder_spec = (
        ("res_x", "num_layers", 2),
        ("compress_space_res", "multiplier", 2),
        ("res_x", "num_layers", 2),
        ("compress_time_res", "multiplier", 2),
        ("res_x", "num_layers", 1),
        ("compress_all_res", "multiplier", 2),
        ("res_x", "num_layers", 1),
        ("compress_all_res", "multiplier", 2),
        ("res_x", "num_layers", 1),
    )
    encoder_blocks = [(name, {key: value}) for name, key, value in encoder_spec]

    # Decoder: three (res_x, compress_all) pairs followed by a final res_x.
    # Every entry gets its own fresh params dict.
    decoder_blocks = []
    for _ in range(3):
        decoder_blocks.append(("res_x", {"num_layers": 2, "inject_noise": False}))
        decoder_blocks.append(("compress_all", {"residual": True, "multiplier": 2}))
    decoder_blocks.append(("res_x", {"num_layers": 2, "inject_noise": False}))

    config = {
        "_class_name": "CausalVideoAutoencoder",
        "dims": 3,
        "encoder_blocks": encoder_blocks,
        "decoder_blocks": decoder_blocks,
        "latent_channels": latent_channels,
        "norm_layer": "pixel_norm",
        "patch_size": 4,
        "latent_log_var": "uniform",
        "use_quant_conv": False,
        "causal_decoder": False,
        "timestep_conditioning": True,
        "spatial_padding_mode": "replicate",
    }
    return config
+
+
def test_vae_patchify_unpatchify():
    """Check that `unpatchify` inverts `patchify` on a random 5-D tensor."""
    import torch

    patch_sizes = {"patch_size_hw": 4, "patch_size_t": 4}
    original = torch.randn(2, 3, 8, 64, 64)
    restored = unpatchify(patchify(original, **patch_sizes), **patch_sizes)
    assert torch.allclose(original, restored)
+
+
def demo_video_autoencoder_forward_backward():
    """Smoke-run the demo autoencoder: encode, decode, compare, and backprop.

    Builds a `CausalVideoAutoencoder` from the demo config, round-trips a
    random video batch, asserts that encoding the first frame alone yields the
    same latent as the first latent frame of the full video, and runs a
    backward pass on the reconstruction MSE.
    """
    demo_config = create_video_autoencoder_demo_config()
    autoencoder = CausalVideoAutoencoder.from_config(demo_config)

    print(autoencoder)
    autoencoder.eval()

    # Report the model size.
    total_params = sum(param.numel() for param in autoencoder.parameters())
    print(f"Total number of parameters in VideoAutoencoder: {total_params:,}")

    # Mock batch shaped (batch_size, channels, frames, height, width):
    # 2 videos, 3 color channels, 17 frames of 64x64 pixels.
    input_videos = torch.randn(2, 3, 17, 64, 64)

    # Encode, then decode at a fixed timestep of 0.1 per sample.
    latent = autoencoder.encode(input_videos).latent_dist.mode()
    print(f"input shape={input_videos.shape}")
    print(f"latent shape={latent.shape}")

    decode_timestep = torch.ones(input_videos.shape[0]) * 0.1
    decoded = autoencoder.decode(
        latent, target_shape=input_videos.shape, timestep=decode_timestep
    )
    reconstructed_videos = decoded.sample

    print(f"reconstructed shape={reconstructed_videos.shape}")

    # A single image must be encoded the same way as a video's first frame.
    single_frame = input_videos[:, :, :1, :, :]
    image_latent = autoencoder.encode(single_frame).latent_dist.mode()
    _ = autoencoder.decode(
        image_latent, target_shape=image_latent.shape, timestep=decode_timestep
    ).sample

    leading_latent = latent[:, :, :1, :, :]
    assert torch.allclose(image_latent, leading_latent, atol=1e-6)

    # Reconstruction loss and backward pass.
    loss = torch.nn.functional.mse_loss(input_videos, reconstructed_videos)
    loss.backward()

    print(f"Demo completed with loss: {loss.item()}")
+
+
# Run the forward/backward demo only when this file is executed as a script;
# importing the module does not trigger it.
if __name__ == "__main__":
    demo_video_autoencoder_forward_backward()
diff --git a/ltx_video/models/autoencoders/conv_nd_factory.py b/ltx_video/models/autoencoders/conv_nd_factory.py
new file mode 100644
index 0000000000000000000000000000000000000000..718c69befd959c7466c4a57d71e46bb80bfe9fba
--- /dev/null
+++ b/ltx_video/models/autoencoders/conv_nd_factory.py
@@ -0,0 +1,90 @@
+from typing import Tuple, Union
+
+import torch
+
+from ltx_video.models.autoencoders.dual_conv3d import DualConv3d
+from ltx_video.models.autoencoders.causal_conv3d import CausalConv3d
+
+
def make_conv_nd(
    dims: Union[int, Tuple[int, int]],
    in_channels: int,
    out_channels: int,
    kernel_size: int,
    stride=1,
    padding=0,
    dilation=1,
    groups=1,
    bias=True,
    causal=False,
    spatial_padding_mode="zeros",
    temporal_padding_mode="zeros",
):
    """Create the convolution module matching ``dims``.

    Args:
        dims: ``2`` for ``torch.nn.Conv2d``; ``3`` for ``torch.nn.Conv3d`` (or
            ``CausalConv3d`` when ``causal`` is true); ``(2, 1)`` for
            ``DualConv3d``.
        in_channels, out_channels, kernel_size, stride, padding, dilation,
        groups, bias: Forwarded to the selected convolution.
        causal: Only selects a different module for ``dims == 3``.
        spatial_padding_mode: Padding mode handed to the convolution.
        temporal_padding_mode: Only validated; it must equal
            ``spatial_padding_mode`` unless ``causal`` is true.

    Raises:
        NotImplementedError: If the two padding modes differ and ``causal`` is false.
        ValueError: If ``dims`` is not one of the supported values.
    """
    if not (spatial_padding_mode == temporal_padding_mode or causal):
        raise NotImplementedError("spatial and temporal padding modes must be equal")
    if dims == 2:
        return torch.nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
            padding_mode=spatial_padding_mode,
        )
    elif dims == 3:
        if causal:
            return CausalConv3d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=kernel_size,
                stride=stride,
                padding=padding,
                dilation=dilation,
                groups=groups,
                bias=bias,
                spatial_padding_mode=spatial_padding_mode,
            )
        return torch.nn.Conv3d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
            padding_mode=spatial_padding_mode,
        )
    elif dims == (2, 1):
        # dilation and groups were previously dropped on this path, silently
        # building a different convolution than the caller asked for.
        return DualConv3d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
            padding_mode=spatial_padding_mode,
        )
    else:
        raise ValueError(f"unsupported dimensions: {dims}")
+
+
def make_linear_nd(
    dims: int,
    in_channels: int,
    out_channels: int,
    bias=True,
):
    """Create a pointwise (kernel size 1) convolution for the given ``dims``.

    ``dims == 2`` yields ``torch.nn.Conv2d``; ``dims == 3`` or ``(2, 1)``
    yields ``torch.nn.Conv3d``. Any other value raises ``ValueError``.
    """
    if dims == 2:
        conv_cls = torch.nn.Conv2d
    elif dims in (3, (2, 1)):
        conv_cls = torch.nn.Conv3d
    else:
        raise ValueError(f"unsupported dimensions: {dims}")

    return conv_cls(
        in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias
    )
diff --git a/ltx_video/models/autoencoders/dual_conv3d.py b/ltx_video/models/autoencoders/dual_conv3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..dcf889296750d3d7e553af37ecf77d1b10245af3
--- /dev/null
+++ b/ltx_video/models/autoencoders/dual_conv3d.py
@@ -0,0 +1,217 @@
+import math
+from typing import Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+
+
class DualConv3d(nn.Module):
    """Factorized 3D convolution: a spatial (1 x kh x kw) conv followed by a temporal (kt x 1 x 1) conv.

    The forward pass can run either as two ``F.conv3d`` calls or as a
    ``F.conv2d`` over frames followed by a ``F.conv1d`` over time; both use
    the same weights.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of output channels.
        kernel_size: Int or 3-tuple ``(kt, kh, kw)``. ``(1, 1, 1)`` is rejected.
        stride, padding, dilation: Int or 3-tuple ``(t, h, w)``; the spatial
            components go to the first conv, the temporal one to the second.
        groups: Group count used by both convolutions.
        bias: Whether both convolutions carry a bias.
        padding_mode: ``"zeros"`` pads inside the convolution; any other value
            is passed as ``mode`` to ``F.pad`` before the convolution.

    Raises:
        ValueError: If ``kernel_size`` is ``(1, 1, 1)``.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride: Union[int, Tuple[int, int, int]] = 1,
        padding: Union[int, Tuple[int, int, int]] = 0,
        dilation: Union[int, Tuple[int, int, int]] = 1,
        groups=1,
        bias=True,
        padding_mode="zeros",
    ):
        super(DualConv3d, self).__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.padding_mode = padding_mode
        # Ensure kernel_size, stride, padding, and dilation are tuples of length 3
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size, kernel_size, kernel_size)
        if kernel_size == (1, 1, 1):
            raise ValueError(
                "kernel_size must be greater than 1. Use make_linear_nd instead."
            )
        if isinstance(stride, int):
            stride = (stride, stride, stride)
        if isinstance(padding, int):
            padding = (padding, padding, padding)
        if isinstance(dilation, int):
            dilation = (dilation, dilation, dilation)

        # Set parameters for convolutions
        self.groups = groups
        self.bias = bias

        # Channel count between the two convolutions: the larger of in/out.
        intermediate_channels = (
            out_channels if in_channels < out_channels else in_channels
        )

        # First (spatial) convolution.
        self.weight1 = nn.Parameter(
            torch.empty(
                intermediate_channels,
                in_channels // groups,
                1,
                kernel_size[1],
                kernel_size[2],
            )
        )
        self.stride1 = (1, stride[1], stride[2])
        self.padding1 = (0, padding[1], padding[2])
        self.dilation1 = (1, dilation[1], dilation[2])
        if bias:
            self.bias1 = nn.Parameter(torch.empty(intermediate_channels))
        else:
            self.register_parameter("bias1", None)

        # Second (temporal) convolution.
        self.weight2 = nn.Parameter(
            torch.empty(
                out_channels, intermediate_channels // groups, kernel_size[0], 1, 1
            )
        )
        self.stride2 = (stride[0], 1, 1)
        self.padding2 = (padding[0], 0, 0)
        self.dilation2 = (dilation[0], 1, 1)
        if bias:
            self.bias2 = nn.Parameter(torch.empty(out_channels))
        else:
            self.register_parameter("bias2", None)

        # Initialize weights and biases
        self.reset_parameters()

    def reset_parameters(self):
        """Initialize both weights with Kaiming-uniform and biases from the fan-in bound."""
        nn.init.kaiming_uniform_(self.weight1, a=math.sqrt(5))
        nn.init.kaiming_uniform_(self.weight2, a=math.sqrt(5))
        if self.bias:
            fan_in1, _ = nn.init._calculate_fan_in_and_fan_out(self.weight1)
            bound1 = 1 / math.sqrt(fan_in1)
            nn.init.uniform_(self.bias1, -bound1, bound1)
            fan_in2, _ = nn.init._calculate_fan_in_and_fan_out(self.weight2)
            bound2 = 1 / math.sqrt(fan_in2)
            nn.init.uniform_(self.bias2, -bound2, bound2)

    def _apply_padding(self, x, padding):
        """Return ``(x, conv_padding)`` honoring ``self.padding_mode``.

        The functional convolutions take no ``padding_mode`` argument, so
        non-zero modes are applied with ``F.pad`` up front and the convolution
        is then run without padding. ``padding`` lists one amount per trailing
        axis of ``x``, in axis order.
        """
        if self.padding_mode == "zeros" or not any(padding):
            return x, padding
        # F.pad wants (left, right) pairs starting from the last axis.
        pad_spec = []
        for amount in reversed(padding):
            pad_spec.extend((amount, amount))
        return F.pad(x, pad_spec, mode=self.padding_mode), (0,) * len(padding)

    def forward(self, x, use_conv3d=False, skip_time_conv=False):
        """Apply the factorized convolution.

        Args:
            x: Input unpacked as ``(b, c, d, h, w)`` by the 2D path.
            use_conv3d: Select the ``F.conv3d`` implementation instead of the
                conv2d + conv1d one.
            skip_time_conv: Return right after the spatial convolution.
        """
        if use_conv3d:
            return self.forward_with_3d(x=x, skip_time_conv=skip_time_conv)
        else:
            return self.forward_with_2d(x=x, skip_time_conv=skip_time_conv)

    def forward_with_3d(self, x, skip_time_conv):
        """Run both stages as 3D convolutions with singleton kernel axes."""
        # First convolution
        x, padding1 = self._apply_padding(x, self.padding1)
        x = F.conv3d(
            x,
            self.weight1,
            self.bias1,
            self.stride1,
            padding1,
            self.dilation1,
            self.groups,
        )

        if skip_time_conv:
            return x

        # Second convolution
        x, padding2 = self._apply_padding(x, self.padding2)
        x = F.conv3d(
            x,
            self.weight2,
            self.bias2,
            self.stride2,
            padding2,
            self.dilation2,
            self.groups,
        )

        return x

    def forward_with_2d(self, x, skip_time_conv):
        """Run the spatial stage as a 2D conv over frames and the temporal stage as a 1D conv."""
        b, _, _, h, w = x.shape

        # First 2D convolution: fold frames into the batch axis.
        x = rearrange(x, "b c d h w -> (b d) c h w")
        # Squeeze the depth dimension out of weight1 since it's 1
        weight1 = self.weight1.squeeze(2)
        # Select stride, padding, and dilation for the 2D convolution
        stride1 = (self.stride1[1], self.stride1[2])
        dilation1 = (self.dilation1[1], self.dilation1[2])
        x, padding1 = self._apply_padding(x, (self.padding1[1], self.padding1[2]))
        x = F.conv2d(
            x,
            weight1,
            self.bias1,
            stride1,
            padding1,
            dilation1,
            self.groups,
        )

        # Spatial size may have changed (stride / padding).
        _, _, h, w = x.shape

        if skip_time_conv:
            x = rearrange(x, "(b d) c h w -> b c d h w", b=b)
            return x

        # Second convolution which is essentially treated as a 1D convolution across the 'd' dimension
        x = rearrange(x, "(b d) c h w -> (b h w) c d", b=b)

        # Reshape weight2 to match the expected dimensions for conv1d
        weight2 = self.weight2.squeeze(-1).squeeze(-1)
        # Use only the relevant dimension for stride, padding, and dilation for the 1D convolution
        stride2 = self.stride2[0]
        dilation2 = self.dilation2[0]
        x, padding2 = self._apply_padding(x, (self.padding2[0],))
        x = F.conv1d(
            x,
            weight2,
            self.bias2,
            stride2,
            padding2,
            dilation2,
            self.groups,
        )
        x = rearrange(x, "(b h w) c d -> b c d h w", b=b, h=h, w=w)

        return x

    @property
    def weight(self):
        """The weight of the second (temporal) convolution."""
        return self.weight2
+
+
def test_dual_conv3d_consistency():
    """Check that the conv3d and conv2d+conv1d paths of `DualConv3d` agree."""
    conv_kwargs = {
        "in_channels": 3,
        "out_channels": 5,
        "kernel_size": (3, 3, 3),
        "stride": (2, 2, 2),
        "padding": (1, 1, 1),
        "bias": True,
    }
    module = DualConv3d(**conv_kwargs)

    # One random clip, run through both implementations.
    sample = torch.randn(1, 3, 10, 10, 10)
    via_3d = module(sample, use_conv3d=True)
    via_2d = module(sample, use_conv3d=False)

    outputs_match = torch.allclose(via_3d, via_2d, atol=1e-6)
    assert (
        outputs_match
    ), "Outputs are not consistent between 3D and 2D convolutions."
diff --git a/ltx_video/models/autoencoders/latent_upsampler.py b/ltx_video/models/autoencoders/latent_upsampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a76bc21d1a503d61dec673cf5cb980bb6d703fd
--- /dev/null
+++ b/ltx_video/models/autoencoders/latent_upsampler.py
@@ -0,0 +1,203 @@
+from typing import Optional, Union
+from pathlib import Path
+import os
+import json
+
+import torch
+import torch.nn as nn
+from einops import rearrange
+from diffusers import ConfigMixin, ModelMixin
+from safetensors.torch import safe_open
+
+from ltx_video.models.autoencoders.pixel_shuffle import PixelShuffleND
+
+
+class ResBlock(nn.Module):
+    """Residual block: two 3x3 conv + GroupNorm stages with a SiLU-activated skip sum."""
+
+    def __init__(
+        self, channels: int, mid_channels: Optional[int] = None, dims: int = 3
+    ):
+        super().__init__()
+        # The hidden width falls back to the input width when not given.
+        hidden = channels if mid_channels is None else mid_channels
+        # dims == 2 selects 2D convolutions; any other value selects 3D.
+        conv_cls = nn.Conv3d if dims != 2 else nn.Conv2d
+
+        # Both GroupNorm layers use 32 groups.
+        self.conv1 = conv_cls(channels, hidden, kernel_size=3, padding=1)
+        self.norm1 = nn.GroupNorm(32, hidden)
+        self.conv2 = conv_cls(hidden, channels, kernel_size=3, padding=1)
+        self.norm2 = nn.GroupNorm(32, channels)
+        self.activation = nn.SiLU()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Return ``activation(norm2(conv2(activation(norm1(conv1(x))))) + x)``."""
+        hidden = self.activation(self.norm1(self.conv1(x)))
+        hidden = self.norm2(self.conv2(hidden))
+        # The skip connection is added before the final activation.
+        return self.activation(hidden + x)
+
+
+class LatentUpsampler(ModelMixin, ConfigMixin):
+ """
+ Model to spatially upsample VAE latents.
+
+ Args:
+ in_channels (`int`): Number of channels in the input latent
+ mid_channels (`int`): Number of channels in the middle layers
+ num_blocks_per_stage (`int`): Number of ResBlocks to use in each stage (pre/post upsampling)
+ dims (`int`): Number of dimensions for convolutions (2 or 3)
+ spatial_upsample (`bool`): Whether to spatially upsample the latent
+ temporal_upsample (`bool`): Whether to temporally upsample the latent
+ """
+
+    def __init__(
+        self,
+        in_channels: int = 128,
+        mid_channels: int = 512,
+        num_blocks_per_stage: int = 4,
+        dims: int = 3,
+        spatial_upsample: bool = True,
+        temporal_upsample: bool = False,
+    ):
+        """Build the stem, both ResBlock stacks, the upsampler and the output conv."""
+        super().__init__()
+
+        # Keep the constructor arguments on the instance; `config()` reports them back.
+        self.in_channels = in_channels
+        self.mid_channels = mid_channels
+        self.num_blocks_per_stage = num_blocks_per_stage
+        self.dims = dims
+        self.spatial_upsample = spatial_upsample
+        self.temporal_upsample = temporal_upsample
+
+        # dims == 2 selects 2D convolutions; any other value selects 3D.
+        conv_cls = nn.Conv3d if dims != 2 else nn.Conv2d
+
+        self.initial_conv = conv_cls(in_channels, mid_channels, kernel_size=3, padding=1)
+        self.initial_norm = nn.GroupNorm(32, mid_channels)
+        self.initial_activation = nn.SiLU()
+
+        self.res_blocks = nn.ModuleList(
+            [ResBlock(mid_channels, dims=dims) for _ in range(num_blocks_per_stage)]
+        )
+
+        # The conv widens the channels by `widen`; PixelShuffleND then folds that
+        # factor back into the upsampled axes (2 per axis with its default factors).
+        if spatial_upsample and temporal_upsample:
+            up_conv, widen, shuffle_dims = nn.Conv3d, 8, 3
+        elif spatial_upsample:
+            up_conv, widen, shuffle_dims = nn.Conv2d, 4, 2
+        elif temporal_upsample:
+            up_conv, widen, shuffle_dims = nn.Conv3d, 2, 1
+        else:
+            raise ValueError(
+                "Either spatial_upsample or temporal_upsample must be True"
+            )
+        self.upsampler = nn.Sequential(
+            up_conv(mid_channels, widen * mid_channels, kernel_size=3, padding=1),
+            PixelShuffleND(shuffle_dims),
+        )
+
+        self.post_upsample_res_blocks = nn.ModuleList(
+            [ResBlock(mid_channels, dims=dims) for _ in range(num_blocks_per_stage)]
+        )
+
+        self.final_conv = conv_cls(mid_channels, in_channels, kernel_size=3, padding=1)
+
+    def forward(self, latent: torch.Tensor) -> torch.Tensor:
+        """Run the upsampling network on a 5D latent.
+
+        Args:
+            latent: tensor unpacked as (batch, channels, frames, height, width).
+
+        Returns:
+            The `final_conv` output as a 5D tensor.
+        """
+        b, c, f, h, w = latent.shape
+        per_frame = self.dims == 2
+
+        # 2D models treat every frame as an independent image for the whole pass.
+        x = latent
+        if per_frame:
+            x = rearrange(x, "b c f h w -> (b f) c h w")
+
+        x = self.initial_conv(x)
+        x = self.initial_activation(self.initial_norm(x))
+        for block in self.res_blocks:
+            x = block(x)
+
+        if per_frame:
+            x = self.upsampler(x)
+        elif self.temporal_upsample:
+            x = self.upsampler(x)
+            # Drop the first frame of the temporally upsampled result.
+            x = x[:, :, 1:, :, :]
+        else:
+            # The spatial-only upsampler is 2D: fold frames into the batch just for it.
+            x = rearrange(x, "b c f h w -> (b f) c h w")
+            x = self.upsampler(x)
+            x = rearrange(x, "(b f) c h w -> b c f h w", b=b, f=f)
+
+        for block in self.post_upsample_res_blocks:
+            x = block(x)
+        x = self.final_conv(x)
+
+        if per_frame:
+            x = rearrange(x, "(b f) c h w -> b c f h w", b=b, f=f)
+        return x
+
+    @classmethod
+    def from_config(cls, config):
+        """Build an instance from a config mapping, using the fallbacks below for missing keys."""
+        # NOTE: these fallbacks are not the `__init__` defaults (e.g. dims=2 here, 3 there).
+        fallbacks = {
+            "in_channels": 4, "mid_channels": 128, "num_blocks_per_stage": 4,
+            "dims": 2, "spatial_upsample": True, "temporal_upsample": False,
+        }
+        kwargs = {key: config.get(key, default) for key, default in fallbacks.items()}
+        return cls(**kwargs)
+
+    def config(self):
+        """Return the constructor arguments plus `_class_name` as a plain dict."""
+        fields = (
+            "in_channels", "mid_channels", "num_blocks_per_stage",
+            "dims", "spatial_upsample", "temporal_upsample",
+        )
+        settings = {"_class_name": "LatentUpsampler"}
+        # Insertion order matches `fields`, with `_class_name` first.
+        settings.update((name, getattr(self, name)) for name in fields)
+        return settings
+
+    @classmethod
+    def from_pretrained(
+        cls, pretrained_model_path: Optional[Union[str, os.PathLike]], *args, **kwargs
+    ):
+        """Load weights and the JSON "config" metadata entry from one `.safetensors` file.
+
+        Raises ValueError if the path is not such a file or its metadata has no "config".
+        """
+        pretrained_model_path = Path(pretrained_model_path)
+        if not (pretrained_model_path.is_file() and str(pretrained_model_path).endswith(".safetensors")):
+            raise ValueError(f"Expected a .safetensors file, got: {pretrained_model_path}")
+        with safe_open(pretrained_model_path, framework="pt", device="cpu") as f:
+            metadata = f.metadata() or {}
+            state_dict = {k: f.get_tensor(k) for k in f.keys()}
+        if "config" not in metadata:
+            raise ValueError(f"No 'config' entry in the metadata of {pretrained_model_path}")
+        config = json.loads(metadata["config"])
+        with torch.device("meta"):  # build on the meta device; `assign=True` then adopts the loaded tensors
+            latent_upsampler = LatentUpsampler.from_config(config)
+        latent_upsampler.load_state_dict(state_dict, assign=True)
+        return latent_upsampler
+
+
+if __name__ == "__main__":
+    # Smoke test: build a 3D upsampler, report its size and run one random latent.
+    latent_upsampler = LatentUpsampler(num_blocks_per_stage=4, dims=3)
+    print(latent_upsampler)
+    param_count = sum(weights.numel() for weights in latent_upsampler.parameters())
+    print(f"Total number of parameters: {param_count:,}")
+    upsampled_latent = latent_upsampler(torch.randn(1, 128, 9, 16, 16))
+    print(f"Upsampled latent shape: {upsampled_latent.shape}")
diff --git a/ltx_video/models/autoencoders/pixel_norm.py b/ltx_video/models/autoencoders/pixel_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bc3ea60e8a6453e7e12a7fb5aca4de3958a2567
--- /dev/null
+++ b/ltx_video/models/autoencoders/pixel_norm.py
@@ -0,0 +1,12 @@
+import torch
+from torch import nn
+
+
+class PixelNorm(nn.Module):
+    """Divide the input by its root-mean-square over `dim` (`eps` is added inside the root)."""
+    def __init__(self, dim=1, eps=1e-8):
+        super().__init__()
+        self.dim, self.eps = dim, eps
+
+    def forward(self, x):
+        return x / (torch.mean(x**2, dim=self.dim, keepdim=True) + self.eps).sqrt()
diff --git a/ltx_video/models/autoencoders/pixel_shuffle.py b/ltx_video/models/autoencoders/pixel_shuffle.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e79ae28483d5ad684ea68092bc955ef025722e6
--- /dev/null
+++ b/ltx_video/models/autoencoders/pixel_shuffle.py
@@ -0,0 +1,33 @@
+import torch.nn as nn
+from einops import rearrange
+
+
+class PixelShuffleND(nn.Module):
+    """Pixel shuffle over 1, 2 or 3 axes: channel groups are folded into the grown axes.
+
+    dims=1 grows the third axis of a 5D input, dims=2 the last two axes of a 4D input,
+    dims=3 the last three axes of a 5D input. Factors come from `upscale_factors`.
+    """
+
+    def __init__(self, dims, upscale_factors=(2, 2, 2)):
+        super().__init__()
+        assert dims in [1, 2, 3], "dims must be 1, 2, or 3"
+        self.dims = dims
+        self.upscale_factors = upscale_factors
+
+    def forward(self, x):
+        factors = self.upscale_factors
+        if self.dims == 1:
+            # Only the first factor is used.
+            return rearrange(x, "b (c p1) f h w -> b c (f p1) h w", p1=factors[0])
+        if self.dims == 2:
+            # The first two factors are used.
+            pattern = "b (c p1 p2) h w -> b c (h p1) (w p2)"
+            return rearrange(x, pattern, p1=factors[0], p2=factors[1])
+        if self.dims == 3:
+            # All three factors are used.
+            pattern = "b (c p1 p2 p3) d h w -> b c (d p1) (h p2) (w p3)"
+            return rearrange(x, pattern, p1=factors[0], p2=factors[1], p3=factors[2])
+        # Only reachable if `dims` is not 1, 2 or 3 (changed after construction,
+        # or assertions disabled); no pattern applies, so nothing is returned.
+        return None
diff --git a/ltx_video/models/autoencoders/vae.py b/ltx_video/models/autoencoders/vae.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b19ba4e514556628f5c63739f3fc1152462f80b
--- /dev/null
+++ b/ltx_video/models/autoencoders/vae.py
@@ -0,0 +1,443 @@
+from typing import Optional, Union
+
+import torch
+import inspect
+import math
+import torch.nn as nn
+from diffusers import ConfigMixin, ModelMixin
+from diffusers.models.autoencoders.vae import (
+ DecoderOutput,
+ DiagonalGaussianDistribution,
+)
+from diffusers.models.modeling_outputs import AutoencoderKLOutput
+from ltx_video.models.autoencoders.conv_nd_factory import make_conv_nd
+
+
+class AutoencoderKLWrapper(ModelMixin, ConfigMixin):
+ """Variational Autoencoder (VAE) model with KL loss.
+
+ VAE from the paper Auto-Encoding Variational Bayes by Diederik P. Kingma and Max Welling.
+ This model is a wrapper around an encoder and a decoder, and it adds a KL loss term to the reconstruction loss.
+
+ Args:
+ encoder (`nn.Module`):
+ Encoder module.
+ decoder (`nn.Module`):
+ Decoder module.
+ latent_channels (`int`, *optional*, defaults to 4):
+ Number of latent channels.
+ """
+
+    def __init__(
+        self,
+        encoder: nn.Module,
+        decoder: nn.Module,
+        latent_channels: int = 4,
+        dims: int = 2,
+        sample_size=512,
+        use_quant_conv: bool = True,
+        normalize_latent_channels: bool = False,
+    ):
+        """Store the encoder/decoder and create the optional quant convs and latent norm.
+
+        `dims == 2` selects 2D quant convs / BatchNorm2d, any other value the 3D variants.
+        `sample_size` is forwarded to `set_tiling_params` (overlap factor 0.25).
+        """
+        super().__init__()
+
+        # Zero-filled statistics (128 entries, bfloat16) on a bare sub-module, hyphenated names.
+        self.per_channel_statistics = nn.Module()
+        std_of_means = torch.zeros((128,), dtype=torch.bfloat16)
+        self.per_channel_statistics.register_buffer("std-of-means", std_of_means)
+        self.per_channel_statistics.register_buffer(
+            "mean-of-means", torch.zeros_like(std_of_means)
+        )
+
+        self.encoder = encoder
+        self.use_quant_conv = use_quant_conv
+        self.normalize_latent_channels = normalize_latent_channels
+        self.decoder = decoder
+
+        if not use_quant_conv:
+            self.quant_conv = nn.Identity()
+            self.post_quant_conv = nn.Identity()
+        else:
+            conv_dims = 3 if dims != 2 else 2
+            # quant_conv works on 2 * latent_channels, post_quant_conv on latent_channels.
+            self.quant_conv = make_conv_nd(
+                conv_dims, 2 * latent_channels, 2 * latent_channels, 1
+            )
+            self.post_quant_conv = make_conv_nd(
+                conv_dims, latent_channels, latent_channels, 1
+            )
+
+        if not normalize_latent_channels:
+            self.latent_norm_out = nn.Identity()
+        elif dims == 2:
+            self.latent_norm_out = nn.BatchNorm2d(latent_channels, affine=False)
+        else:
+            self.latent_norm_out = nn.BatchNorm3d(latent_channels, affine=False)
+
+        self.use_z_tiling = False
+        self.use_hw_tiling = False
+        self.dims = dims
+        self.z_sample_size = 1
+
+        # Cached so `_decode` can check whether the decoder accepts `timestep`.
+        self.decoder_params = inspect.signature(self.decoder.forward).parameters
+        # only relevant if vae tiling is enabled
+        self.set_tiling_params(sample_size=sample_size, overlap_factor=0.25)
+
+    @staticmethod
+    def get_VAE_tile_size(vae_config, device_mem_capacity, mixed_precision):
+        """Pick the VAE tile sizes, returned as ``(z_tile, hw_tile)``.
+
+        `vae_config` 0 means "auto": the profile is derived from `device_mem_capacity`
+        (divided by 1.5 when `mixed_precision` is truthy) with thresholds 24000 and
+        8000. Any other value is used directly as the profile number.
+        Profile 1 -> hw_tile 0, profile 2 -> 512, anything else -> 256; z_tile is always 4.
+        """
+        profile = vae_config
+        if profile == 0:
+            capacity = device_mem_capacity
+            if mixed_precision:
+                capacity = capacity / 1.5
+            if capacity >= 24000:
+                profile = 1
+            elif capacity >= 8000:
+                profile = 2
+            else:
+                profile = 3
+
+        z_tile = 4
+        # Chained conditional: the first matching profile wins, 256 is the fallback.
+        hw_tile = 0 if profile == 1 else 512 if profile == 2 else 256
+        return (z_tile, hw_tile)
+
+    def set_tiling_params(self, sample_size: int = 512, overlap_factor: float = 0.25):
+        """Set the spatial tiling sizes: sample tile, latent tile (sample_size / 32) and overlap."""
+        self.tile_sample_min_size = sample_size
+        # The latent tile size is derived with a fixed divisor of 32, not from the encoder.
+        self.tile_latent_min_size = int(sample_size / 32)
+        self.tile_overlap_factor = overlap_factor
+
+    def enable_z_tiling(self, z_sample_size: int = 4):
+        r"""
+        Enable tiling along dim 2 for `encode` and `decode`.
+
+        Tiling is switched on only when `z_sample_size` > 1; the value must be 1 or a
+        multiple of 4. The value is validated before any state is changed.
+        """
+        assert (
+            z_sample_size % 4 == 0 or z_sample_size == 1
+        ), f"z_sample_size must be a multiple of 4 or 1. Got {z_sample_size}."
+        self.use_z_tiling = z_sample_size > 1
+        self.z_sample_size = z_sample_size
+
+    def disable_z_tiling(self):
+        r"""
+        Disable tiling along dim 2. If `enable_z_tiling` was previously invoked, `encode` and `decode`
+        no longer split their input along that dimension.
+        """
+        self.use_z_tiling = False
+
+    def enable_hw_tiling(self):
+        r"""
+        Enable tiling along the height and width dimensions; the flag is read by both `encode` and `decode`.
+        """
+        self.use_hw_tiling = True
+
+    def disable_hw_tiling(self):
+        r"""
+        Disable tiling along the height and width dimensions; the flag is read by both `encode` and `decode`.
+        """
+        self.use_hw_tiling = False
+
+    def _hw_tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True):
+        """Encode `x` in overlapping spatial tiles and stitch the resulting moments.
+
+        Tiles of `tile_sample_min_size` are cut along dims 3 and 4, run through the encoder
+        and `quant_conv`, cross-faded with their upper/left neighbours and cropped before
+        concatenation. `return_dict` is accepted but not used.
+        """
+        tile_size = self.tile_sample_min_size
+        stride = int(tile_size * (1 - self.tile_overlap_factor))
+        blend = int(self.tile_latent_min_size * self.tile_overlap_factor)
+        keep = self.tile_latent_min_size - blend
+
+        # Pass 1: encode every tile independently.
+        grid = [
+            [
+                self.quant_conv(
+                    self.encoder(x[:, :, :, top : top + tile_size, left : left + tile_size])
+                )
+                for left in range(0, x.shape[4], stride)
+            ]
+            for top in range(0, x.shape[3], stride)
+        ]
+        # Pass 2: blend in raster order (blending writes into the tile in place, so
+        # already-blended neighbours feed the next blend), then crop and stitch.
+        stitched_rows = []
+        for r, tiles in enumerate(grid):
+            cropped = []
+            for c, tile in enumerate(tiles):
+                if r > 0:
+                    tile = self.blend_v(grid[r - 1][c], tile, blend)
+                if c > 0:
+                    tile = self.blend_h(tiles[c - 1], tile, blend)
+                cropped.append(tile[:, :, :, :keep, :keep])
+            stitched_rows.append(torch.cat(cropped, dim=4))
+
+        return torch.cat(stitched_rows, dim=3)
+
+    def blend_z(
+        self, a: torch.Tensor, b: torch.Tensor, blend_extent: int
+    ) -> torch.Tensor:
+        """Cross-fade the last slices of `a` into the first slices of `b` along dim 2; `b` is modified in place."""
+        span = min(a.shape[2], b.shape[2], blend_extent)
+        for step in range(span):
+            w = step / span  # weight of `b`: 0 for the first slice, growing linearly
+            b[:, :, step, :, :] = a[:, :, step - span, :, :] * (1 - w) + b[:, :, step, :, :] * w
+        return b
+
+    def blend_v(
+        self, a: torch.Tensor, b: torch.Tensor, blend_extent: int
+    ) -> torch.Tensor:
+        """Cross-fade the last rows of `a` into the first rows of `b` along dim 3; `b` is modified in place."""
+        span = min(a.shape[3], b.shape[3], blend_extent)
+        for step in range(span):
+            w = step / span  # weight of `b`: 0 for the first row, growing linearly
+            b[:, :, :, step, :] = a[:, :, :, step - span, :] * (1 - w) + b[:, :, :, step, :] * w
+        return b
+
+    def blend_h(
+        self, a: torch.Tensor, b: torch.Tensor, blend_extent: int
+    ) -> torch.Tensor:
+        """Cross-fade the last columns of `a` into the first columns of `b` along dim 4; `b` is modified in place."""
+        span = min(a.shape[4], b.shape[4], blend_extent)
+        for step in range(span):
+            w = step / span  # weight of `b`: 0 for the first column, growing linearly
+            b[:, :, :, :, step] = a[:, :, :, :, step - span] * (1 - w) + b[:, :, :, :, step] * w
+        return b
+
+    def _hw_tiled_decode(self, z: torch.FloatTensor, target_shape, timestep = None):
+        """Decode `z` in overlapping spatial tiles and stitch the decoded samples.
+
+        `timestep` is forwarded only when the decoder's `forward` accepts it, matching
+        `_decode`. Only the first three entries of `target_shape` are used; the tile
+        height/width passed to the decoder are `tile_sample_min_size`.
+        """
+        overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor))
+        blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor)
+        row_limit = self.tile_sample_min_size - blend_extent
+        tile_target_shape = (
+            *target_shape[:3],
+            self.tile_sample_min_size,
+            self.tile_sample_min_size,
+        )
+        # Decoders without a `timestep` parameter must not receive the keyword.
+        decoder_kwargs = {"target_shape": tile_target_shape}
+        if "timestep" in self.decoder_params:
+            decoder_kwargs["timestep"] = timestep
+        # Split z into overlapping tiles and decode them separately.
+        # The tiles have an overlap to avoid seams between tiles.
+        rows = []
+        for i in range(0, z.shape[3], overlap_size):
+            row = []
+            for j in range(0, z.shape[4], overlap_size):
+                tile = z[:, :, :, i : i + self.tile_latent_min_size, j : j + self.tile_latent_min_size]
+                row.append(self.decoder(self.post_quant_conv(tile), **decoder_kwargs))
+            rows.append(row)
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                # blend the above tile and the left tile into the current tile (in place)
+                if i > 0:
+                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
+                if j > 0:
+                    tile = self.blend_h(row[j - 1], tile, blend_extent)
+                result_row.append(tile[:, :, :, :row_limit, :row_limit])
+            result_rows.append(torch.cat(result_row, dim=4))
+
+        return torch.cat(result_rows, dim=3)
+
+    def encode(
+        self, z: torch.FloatTensor, return_dict: bool = True
+    ) -> Union[DecoderOutput, torch.FloatTensor]:
+        """Encode `z` into a DiagonalGaussianDistribution, tiling dim 2 and/or dims 3-4 when enabled."""
+        # NOTE(review): the return annotation does not match the values returned at the bottom.
+        if self.use_z_tiling and z.shape[2] > (self.z_sample_size + 1) > 1:
+            tile_latent_min_tsize = self.z_sample_size
+            tile_sample_min_tsize = tile_latent_min_tsize * 8  # sample-side tile is 8x the latent-side tile
+            tile_overlap_factor = 0.25
+
+            B, C, T, H, W = z.shape
+            overlap_size = int(tile_sample_min_tsize * (1 - tile_overlap_factor))  # stride between tile starts
+            blend_extent = int(tile_latent_min_tsize * tile_overlap_factor)  # cross-fade length on encoded tiles
+            t_limit = tile_latent_min_tsize - blend_extent  # slices kept per encoded tile
+
+            row = []
+            for i in range(0, T, overlap_size):
+                tile = z[:, :, i: i + tile_sample_min_tsize + 1, :, :]
+                if self.use_hw_tiling:
+                    tile = self._hw_tiled_encode(tile, return_dict)
+                else:
+                    tile = self._encode(tile)
+                if i > 0:
+                    tile = tile[:, :, 1:, :, :]  # every tile after the first drops its first encoded slice
+                row.append(tile)
+            result_row = []
+            for i, tile in enumerate(row):
+                if i > 0:
+                    tile = self.blend_z(row[i - 1], tile, blend_extent)
+                    result_row.append(tile[:, :, :t_limit, :, :])
+                else:
+                    result_row.append(tile[:, :, :t_limit + 1, :, :])  # the first tile keeps one extra slice
+
+            moments = torch.cat(result_row, dim=2)
+        else:
+            moments = (
+                self._hw_tiled_encode(z, return_dict)
+                if self.use_hw_tiling and z.shape[2] > 1
+                else self._encode(z)
+            )
+
+        posterior = DiagonalGaussianDistribution(moments)
+        if not return_dict:
+            return (posterior,)
+
+        return AutoencoderKLOutput(latent_dist=posterior)
+
+    def _normalize_latent_channels(self, z: torch.FloatTensor) -> torch.FloatTensor:
+        """Run `latent_norm_out` on the first half of the channels (dim 1) of `z`.
+
+        Only BatchNorm3d is applied; BatchNorm2d raises NotImplementedError and any
+        other `latent_norm_out` returns `z` unchanged.
+        """
+        norm = self.latent_norm_out
+        if isinstance(norm, nn.BatchNorm2d):
+            raise NotImplementedError("BatchNorm2d not supported")
+        if not isinstance(norm, nn.BatchNorm3d):
+            return z
+        _, c, _, _, _ = z.shape
+        return torch.cat([norm(z[:, : c // 2, :, :, :]), z[:, c // 2 :, :, :, :]], dim=1)
+
+    def _unnormalize_latent_channels(self, z: torch.FloatTensor) -> torch.FloatTensor:
+        """Undo the BatchNorm3d normalisation with its running statistics; BatchNorm2d raises
+        NotImplementedError and any other `latent_norm_out` returns `z` unchanged."""
+        norm = self.latent_norm_out
+        if isinstance(norm, nn.BatchNorm2d):  # matches the check in `_normalize_latent_channels`
+            raise NotImplementedError("BatchNorm2d not supported")
+        if isinstance(norm, nn.BatchNorm3d):
+            std = torch.sqrt(norm.running_var.view(1, -1, 1, 1, 1) + norm.eps)
+            z = z * std + norm.running_mean.view(1, -1, 1, 1, 1)
+        return z
+
+    def _encode(self, x: torch.FloatTensor) -> AutoencoderKLOutput:
+        """Run encoder -> quant_conv -> latent-channel normalisation and return the moments."""
+        # NOTE(review): annotated as AutoencoderKLOutput, but the raw moments are returned.
+        moments = self.quant_conv(self.encoder(x))
+        return self._normalize_latent_channels(moments)
+
+    def _decode(
+        self,
+        z: torch.FloatTensor,
+        target_shape=None,
+        timestep: Optional[torch.Tensor] = None,
+    ) -> Union[DecoderOutput, torch.FloatTensor]:
+        """Un-normalise `z`, apply `post_quant_conv` and run the decoder.
+
+        `timestep` is passed on only if the decoder's `forward` has such a parameter.
+        """
+        latents = self.post_quant_conv(self._unnormalize_latent_channels(z))
+        extra = {"timestep": timestep} if "timestep" in self.decoder_params else {}
+        return self.decoder(latents, target_shape=target_shape, **extra)
+
+    def decode(
+        self,
+        z: torch.FloatTensor,
+        return_dict: bool = True,
+        target_shape=None,
+        timestep: Optional[torch.Tensor] = None,
+    ) -> Union[DecoderOutput, torch.FloatTensor]:
+        """Decode latents `z`, tiling dim 2 and/or dims 3-4 when enabled; `target_shape` is mandatory."""
+        assert target_shape is not None, "target_shape must be provided for decoding"
+        if self.use_z_tiling and z.shape[2] > (self.z_sample_size + 1) > 1:
+            # Split z into overlapping tiles and decode them separately.
+            tile_latent_min_tsize = self.z_sample_size
+            tile_sample_min_tsize = tile_latent_min_tsize * 8  # sample-side tile is 8x the latent-side tile
+            tile_overlap_factor = 0.25
+
+            B, C, T, H, W = z.shape
+            overlap_size = int(tile_latent_min_tsize * (1 - tile_overlap_factor))  # stride between tile starts
+            blend_extent = int(tile_sample_min_tsize * tile_overlap_factor)  # cross-fade length on decoded tiles
+            t_limit = tile_sample_min_tsize - blend_extent  # slices kept per decoded tile
+
+            row = []
+            for i in range(0, T, overlap_size):
+                tile = z[:, :, i: i + tile_latent_min_tsize + 1, :, :]
+                target_shape_split = list(target_shape)
+                target_shape_split[2] = tile.shape[2] * 8  # NOTE(review): never used; the calls below pass the full `target_shape`
+                if self.use_hw_tiling:
+                    decoded = self._hw_tiled_decode(tile, target_shape, timestep)
+                else:
+                    decoded = self._decode(tile, target_shape=target_shape, timestep=timestep)
+                if i > 0:
+                    decoded = decoded[:, :, 1:, :, :]  # every tile after the first drops its first decoded slice
+                row.append(decoded.to(torch.float16).cpu())  # decoded tiles are kept as float16 on the CPU
+                decoded = None  # drop the local reference to the decoded tile
+            result_row = []
+            for i, tile in enumerate(row):
+                if i > 0:
+                    tile = self.blend_z(row[i - 1], tile, blend_extent)
+                    result_row.append(tile[:, :, :t_limit, :, :])
+                else:
+                    result_row.append(tile[:, :, :t_limit + 1, :, :])  # the first tile keeps one extra slice
+
+            dec = torch.cat(result_row, dim=2)
+            if not return_dict:
+                return (dec,)
+
+            return DecoderOutput(sample=dec)
+        else:
+            decoded = (
+                self._hw_tiled_decode(z, target_shape, timestep)
+                if self.use_hw_tiling
+                else self._decode(z, target_shape=target_shape, timestep=timestep)
+            )
+
+            if not return_dict:
+                return (decoded,)
+
+            return DecoderOutput(sample=decoded)
+
+    def forward(
+        self,
+        sample: torch.FloatTensor,
+        sample_posterior: bool = False,
+        return_dict: bool = True,
+        generator: Optional[torch.Generator] = None,
+    ) -> Union[DecoderOutput, torch.FloatTensor]:
+        r"""
+        Encode `sample`, pick a latent from the posterior and decode it again.
+
+        Args:
+            sample (`torch.FloatTensor`): Input sample; its shape is used as the decode target shape.
+            sample_posterior (`bool`, *optional*, defaults to `False`):
+                Draw a random latent from the posterior instead of taking its mode.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a [`DecoderOutput`] instead of a plain tuple.
+            generator (`torch.Generator`, *optional*):
+                Generator used to sample from the posterior.
+        """
+        posterior = self.encode(sample).latent_dist
+        if sample_posterior:
+            latent = posterior.sample(generator=generator)
+        else:
+            latent = posterior.mode()
+        decoded = self.decode(latent, target_shape=sample.shape).sample
+
+        if return_dict:
+            return DecoderOutput(sample=decoded)
+        return (decoded,)
diff --git a/ltx_video/models/autoencoders/vae_encode.py b/ltx_video/models/autoencoders/vae_encode.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7d2476f4362b0203804507c4826e01d00b46f99
--- /dev/null
+++ b/ltx_video/models/autoencoders/vae_encode.py
@@ -0,0 +1,247 @@
+from typing import Tuple
+import torch
+from diffusers import AutoencoderKL
+from einops import rearrange
+from torch import Tensor
+
+
+from ltx_video.models.autoencoders.causal_video_autoencoder import (
+ CausalVideoAutoencoder,
+)
+from ltx_video.models.autoencoders.video_autoencoder import (
+ Downsample3D,
+ VideoAutoencoder,
+)
+
+try:
+ import torch_xla.core.xla_model as xm
+except ImportError:
+ xm = None
+
+
+def vae_encode(
+    media_items: Tensor,
+    vae: AutoencoderKL,
+    split_size: int = 1,
+    vae_per_channel_normalize=False,
+) -> Tensor:
+    """
+    Encodes media items (images or videos) into latent representations using a specified VAE model.
+    The function supports processing batches of images or video frames and can handle the processing
+    in smaller sub-batches if needed.
+
+    Args:
+        media_items (Tensor): The media items to encode, shaped (batch_size, 3, height, width)
+            for images or (batch_size, 3, frames, height, width) for videos.
+        vae (AutoencoderKL): The VAE used for encoding.
+        split_size (int, optional): The number of sub-batches to split the input batch into for
+            encoding. Defaults to 1, which processes all items in a single batch.
+        vae_per_channel_normalize (bool, optional): Forwarded to `normalize_latents` to choose
+            between per-channel statistics and the scalar `scaling_factor`. Defaults to False.
+
+    Returns:
+        Tensor: The sampled latents after `normalize_latents`.
+
+    Raises:
+        ValueError: If the channel dimension is not 3, or if the (possibly frame-folded) batch
+            size is not divisible by `split_size`.
+
+    Examples:
+        >>> import torch
+        >>> from diffusers import AutoencoderKL
+        >>> vae = AutoencoderKL.from_pretrained('your-model-name')
+        >>> images = torch.rand(10, 3, 8, 256, 256)  # Example tensor with 10 videos of 8 frames.
+        >>> latents = vae_encode(images, vae)
+
+    Note:
+        A 5D input is encoded frame by frame unless `vae` is a VideoAutoencoder or CausalVideoAutoencoder.
+    """
+    is_video_shaped = media_items.dim() == 5
+    batch_size, channels = media_items.shape[0:2]
+
+    if channels != 3:
+        raise ValueError(f"Expects tensors with 3 channels, got {channels}.")
+
+    # Non-video VAEs get 5D input one frame at a time: frames are folded into the batch.
+    fold_frames = is_video_shaped and not isinstance(
+        vae, (VideoAutoencoder, CausalVideoAutoencoder)
+    )
+    if fold_frames:
+        media_items = rearrange(media_items, "b c n h w -> (b n) c h w")
+    if split_size > 1:
+        if len(media_items) % split_size != 0:
+            raise ValueError(
+                "Error: The batch size must be divisible by 'train.vae_bs_split'"
+            )
+        encode_bs = len(media_items) // split_size
+        on_xla = media_items.device.type == "xla"
+        latents = []
+        if on_xla:
+            xm.mark_step()
+        for image_batch in media_items.split(encode_bs):
+            latents.append(vae.encode(image_batch).latent_dist.sample())
+            if on_xla:
+                xm.mark_step()
+        latents = torch.cat(latents, dim=0)
+    else:
+        latents = vae.encode(media_items).latent_dist.sample()
+
+    latents = normalize_latents(latents, vae, vae_per_channel_normalize)
+    if fold_frames:
+        latents = rearrange(latents, "(b n) c h w -> b c n h w", b=batch_size)
+    return latents
+
+
+def vae_decode(
+    latents: Tensor,
+    vae: AutoencoderKL,
+    is_video: bool = True,
+    split_size: int = 1,
+    vae_per_channel_normalize=False,
+    timestep=None,
+) -> Tensor:
+    """Decode latents with `vae`, optionally in `split_size` sub-batches.
+
+    A 5D input is decoded frame by frame unless `vae` is a VideoAutoencoder or
+    CausalVideoAutoencoder. Raises ValueError if the batch is not divisible by `split_size`.
+    """
+    is_video_shaped = latents.dim() == 5
+    batch_size = latents.shape[0]
+
+    fold_frames = is_video_shaped and not isinstance(
+        vae, (VideoAutoencoder, CausalVideoAutoencoder)
+    )
+    if fold_frames:
+        latents = rearrange(latents, "b c n h w -> (b n) c h w")
+    if split_size > 1:
+        if len(latents) % split_size != 0:
+            raise ValueError(
+                "Error: The batch size must be divisible by 'train.vae_bs_split'"
+            )
+        decode_bs = len(latents) // split_size
+        image_batch = [
+            _run_decoder(latent_batch, vae, is_video, vae_per_channel_normalize, timestep)
+            for latent_batch in latents.split(decode_bs)
+        ]
+        images = torch.cat(image_batch, dim=0)
+    else:
+        images = _run_decoder(latents, vae, is_video, vae_per_channel_normalize, timestep)
+
+    if fold_frames:
+        images = rearrange(images, "(b n) c h w -> b c n h w", b=batch_size)
+    return images
+
+
+def _run_decoder(
+    latents: Tensor,
+    vae: AutoencoderKL,
+    is_video: bool,
+    vae_per_channel_normalize=False,
+    timestep=None,
+) -> Tensor:
+    """Decode one batch of latents and return the first element of the VAE's tuple output.
+
+    Video autoencoders also receive a `target_shape` (batch 1, 3 channels, sizes derived
+    from the latent shape and the VAE scale factors) and, when given, `timestep`.
+    """
+    if not isinstance(vae, (VideoAutoencoder, CausalVideoAutoencoder)):
+        restored = un_normalize_latents(latents, vae, vae_per_channel_normalize)
+        return vae.decode(restored, return_dict=False)[0]
+
+    *_, fl, hl, wl = latents.shape
+    temporal_scale, spatial_scale, _ = get_vae_size_scale_factor(vae)
+    latents = latents.to(vae.dtype)
+    # `timestep` is only forwarded when one was supplied.
+    extra = {} if timestep is None else {"timestep": timestep}
+    target_shape = (
+        1,
+        3,
+        fl * temporal_scale if is_video else 1,
+        hl * spatial_scale,
+        wl * spatial_scale,
+    )
+    restored = un_normalize_latents(latents, vae, vae_per_channel_normalize)
+    return vae.decode(
+        restored, return_dict=False, target_shape=target_shape, **extra
+    )[0]
+
+
+def get_vae_size_scale_factor(vae: AutoencoderKL) -> Tuple:
+    """Return the VAE scale factors as ``(temporal, spatial, spatial)``.
+
+    A CausalVideoAutoencoder exposes them directly; otherwise they are
+    `patch_size * 2**n` with `n` the number of encoder down blocks whose `downsample`
+    is a Downsample3D, and the temporal factor is 1 unless `vae` is a VideoAutoencoder.
+    """
+    if isinstance(vae, CausalVideoAutoencoder):
+        spatial = vae.spatial_downscale_factor
+        temporal = vae.temporal_downscale_factor
+    else:
+        down_blocks = sum(
+            isinstance(block.downsample, Downsample3D)
+            for block in vae.encoder.down_blocks
+        )
+        spatial = vae.config.patch_size * 2**down_blocks
+        temporal = 1
+        if isinstance(vae, VideoAutoencoder):
+            temporal = vae.config.patch_size_t * 2**down_blocks
+    return (temporal, spatial, spatial)
+
+
+def latent_to_pixel_coords(
+    latent_coords: Tensor, vae: AutoencoderKL, causal_fix: bool = False
+) -> Tensor:
+    """
+    Map latent coordinates to pixel coordinates using the scale factors of `vae`.
+
+    Args:
+        latent_coords (Tensor): A tensor of shape [batch_size, 3, num_latents]
+            containing the latent corner coordinates of each token.
+        vae (AutoencoderKL): The VAE model whose scale factors are looked up.
+        causal_fix (bool): Whether to take into account the different temporal scale
+            of the first frame. Only honoured for a CausalVideoAutoencoder.
+            Default = False for backwards compatibility.
+    Returns:
+        Tensor: A tensor of pixel coordinates corresponding to the input latent coordinates.
+    """
+
+    scale_factors = get_vae_size_scale_factor(vae)
+    # The causal correction applies to causal video autoencoders only.
+    apply_causal_fix = isinstance(vae, CausalVideoAutoencoder) and causal_fix
+    return latent_to_pixel_coords_from_factors(
+        latent_coords, scale_factors, apply_causal_fix
+    )
+
+
+def latent_to_pixel_coords_from_factors(
+    latent_coords: Tensor, scale_factors: Tuple, causal_fix: bool = False
+) -> Tensor:
+    """Multiply each row of dim 1 by its scale factor; `causal_fix` shifts row 0 (clamped at 0)."""
+    factors = torch.tensor(scale_factors, device=latent_coords.device)
+    pixel_coords = latent_coords * factors[None, :, None]
+    if not causal_fix:
+        return pixel_coords
+    # Fix temporal scale for first frame to 1 due to causality
+    pixel_coords[:, 0] = (pixel_coords[:, 0] + 1 - scale_factors[0]).clamp(min=0)
+    return pixel_coords
+
+
+def normalize_latents(
+    latents: Tensor, vae: AutoencoderKL, vae_per_channel_normalize: bool = False
+) -> Tensor:
+    """Return `(latents - mean_of_means) / std_of_means` per channel, or `latents * scaling_factor`."""
+    if not vae_per_channel_normalize:
+        return latents * vae.config.scaling_factor
+    mean = vae.mean_of_means.to(latents.dtype).to(latents.device).view(1, -1, 1, 1, 1)
+    std = vae.std_of_means.to(latents.dtype).to(latents.device).view(1, -1, 1, 1, 1)
+    return (latents - mean) / std
+
+
+def un_normalize_latents(
+    latents: Tensor, vae: AutoencoderKL, vae_per_channel_normalize: bool = False
+) -> Tensor:
+    """Return `latents * std_of_means + mean_of_means` per channel, or `latents / scaling_factor`."""
+    if not vae_per_channel_normalize:
+        return latents / vae.config.scaling_factor
+    std = vae.std_of_means.to(latents.dtype).to(latents.device).view(1, -1, 1, 1, 1)
+    mean = vae.mean_of_means.to(latents.dtype).to(latents.device).view(1, -1, 1, 1, 1)
+    return latents * std + mean
diff --git a/ltx_video/models/autoencoders/video_autoencoder.py b/ltx_video/models/autoencoders/video_autoencoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c7926c1d3afb8188221b2e569aaaf89f7271bce
--- /dev/null
+++ b/ltx_video/models/autoencoders/video_autoencoder.py
@@ -0,0 +1,1045 @@
+import json
+import os
+from functools import partial
+from types import SimpleNamespace
+from typing import Any, Mapping, Optional, Tuple, Union
+
+import torch
+from einops import rearrange
+from torch import nn
+from torch.nn import functional
+
+from diffusers.utils import logging
+
+from ltx_video.utils.torch_utils import Identity
+from ltx_video.models.autoencoders.conv_nd_factory import make_conv_nd, make_linear_nd
+from ltx_video.models.autoencoders.pixel_norm import PixelNorm
+from ltx_video.models.autoencoders.vae import AutoencoderKLWrapper
+
+logger = logging.get_logger(__name__)
+
+
+class VideoAutoencoder(AutoencoderKLWrapper):
+    """Video VAE assembled from the ``Encoder`` and ``Decoder`` of this module."""
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
+        *args,
+        **kwargs,
+    ):
+        """Build a model from a local checkpoint directory.
+
+        The directory must contain ``config.json`` and ``autoencoder.pth``.
+        If ``per_channel_statistics.json`` is present, its ``std-of-means``
+        column (required) and ``mean-of-means`` column (defaults to zeros) are
+        registered as buffers.
+
+        Args:
+            pretrained_model_name_or_path: Directory holding the files above,
+                given as a string or a path-like object.
+            **kwargs: Forwarded to ``load_config``. ``torch_dtype``, when
+                given, is the dtype the model is cast to before the weights
+                are loaded.
+
+        Returns:
+            The instantiated ``VideoAutoencoder`` with weights loaded.
+        """
+        from pathlib import Path
+
+        # Accept plain strings as well as path objects: the ``/`` joins and
+        # ``.exists()`` below are not defined for ``str``.
+        model_dir = Path(pretrained_model_name_or_path)
+
+        config_local_path = model_dir / "config.json"
+        config = cls.load_config(config_local_path, **kwargs)
+        video_vae = cls.from_config(config)
+        torch_dtype = kwargs.get("torch_dtype")
+        if torch_dtype is not None:
+            video_vae.to(torch_dtype)
+
+        model_local_path = model_dir / "autoencoder.pth"
+        # NOTE(review): ``torch.load`` unpickles the file; only load
+        # checkpoints from trusted sources.
+        ckpt_state_dict = torch.load(model_local_path)
+        video_vae.load_state_dict(ckpt_state_dict)
+
+        statistics_local_path = model_dir / "per_channel_statistics.json"
+        if statistics_local_path.exists():
+            with open(statistics_local_path, "r") as file:
+                data = json.load(file)
+            # The file stores rows; transpose to get one sequence per column.
+            transposed_data = list(zip(*data["data"]))
+            data_dict = {
+                col: torch.tensor(vals)
+                for col, vals in zip(data["columns"], transposed_data)
+            }
+            video_vae.register_buffer("std_of_means", data_dict["std-of-means"])
+            video_vae.register_buffer(
+                "mean_of_means",
+                data_dict.get(
+                    "mean-of-means", torch.zeros_like(data_dict["std-of-means"])
+                ),
+            )
+
+        return video_vae
+
+    @staticmethod
+    def from_config(config):
+        """Instantiate encoder, decoder and wrapper from a config mapping.
+
+        A list-valued ``config["dims"]`` is converted to a tuple in place.
+
+        Raises:
+            AssertionError: If ``_class_name`` is not ``"VideoAutoencoder"`` or
+                ``dims`` is not one of ``2``, ``3``, ``(2, 1)``.
+            ValueError: If ``latent_log_var == "uniform"`` is combined with
+                ``use_quant_conv=True``.
+        """
+        assert (
+            config["_class_name"] == "VideoAutoencoder"
+        ), "config must have _class_name=VideoAutoencoder"
+        if isinstance(config["dims"], list):
+            config["dims"] = tuple(config["dims"])
+
+        assert config["dims"] in [2, 3, (2, 1)], "dims must be 2, 3 or (2, 1)"
+
+        double_z = config.get("double_z", True)
+        latent_log_var = config.get(
+            "latent_log_var", "per_channel" if double_z else "none"
+        )
+        use_quant_conv = config.get("use_quant_conv", True)
+
+        if use_quant_conv and latent_log_var == "uniform":
+            raise ValueError("uniform latent_log_var requires use_quant_conv=False")
+
+        encoder = Encoder(
+            dims=config["dims"],
+            in_channels=config.get("in_channels", 3),
+            out_channels=config["latent_channels"],
+            block_out_channels=config["block_out_channels"],
+            patch_size=config.get("patch_size", 1),
+            latent_log_var=latent_log_var,
+            norm_layer=config.get("norm_layer", "group_norm"),
+            patch_size_t=config.get("patch_size_t", config.get("patch_size", 1)),
+            add_channel_padding=config.get("add_channel_padding", False),
+        )
+
+        decoder = Decoder(
+            dims=config["dims"],
+            in_channels=config["latent_channels"],
+            out_channels=config.get("out_channels", 3),
+            block_out_channels=config["block_out_channels"],
+            patch_size=config.get("patch_size", 1),
+            norm_layer=config.get("norm_layer", "group_norm"),
+            patch_size_t=config.get("patch_size_t", config.get("patch_size", 1)),
+            add_channel_padding=config.get("add_channel_padding", False),
+        )
+
+        dims = config["dims"]
+        return VideoAutoencoder(
+            encoder=encoder,
+            decoder=decoder,
+            latent_channels=config["latent_channels"],
+            dims=dims,
+            use_quant_conv=use_quant_conv,
+        )
+
+    @property
+    def config(self):
+        """Config namespace rebuilt from the instantiated sub-modules.
+
+        ``scaling_factor`` is always reported as ``1.0``.
+        """
+        return SimpleNamespace(
+            _class_name="VideoAutoencoder",
+            dims=self.dims,
+            # conv_in/conv_out operate on patchified channels; divide the
+            # patch volume back out to recover the raw channel counts.
+            in_channels=self.encoder.conv_in.in_channels
+            // (self.encoder.patch_size_t * self.encoder.patch_size**2),
+            out_channels=self.decoder.conv_out.out_channels
+            // (self.decoder.patch_size_t * self.decoder.patch_size**2),
+            latent_channels=self.decoder.conv_in.in_channels,
+            block_out_channels=[
+                self.encoder.down_blocks[i].res_blocks[-1].conv1.out_channels
+                for i in range(len(self.encoder.down_blocks))
+            ],
+            scaling_factor=1.0,
+            norm_layer=self.encoder.norm_layer,
+            patch_size=self.encoder.patch_size,
+            latent_log_var=self.encoder.latent_log_var,
+            use_quant_conv=self.use_quant_conv,
+            patch_size_t=self.encoder.patch_size_t,
+            add_channel_padding=self.encoder.add_channel_padding,
+        )
+
+    @property
+    def is_video_supported(self):
+        """
+        Check if the model supports video inputs of shape (B, C, F, H, W). Otherwise, the model only supports 2D images.
+        """
+        return self.dims != 2
+
+    @property
+    def downscale_factor(self):
+        """Downscale factor reported by the encoder."""
+        # ``Encoder`` exposes this as ``downscale_factor``; it has no
+        # ``downsample_factor`` attribute.
+        return self.encoder.downscale_factor
+
+    def to_json_string(self) -> str:
+        """Serialize ``self.config`` to a JSON string."""
+        return json.dumps(self.config.__dict__)
+
+    def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True):
+        """Load weights after renaming keys to this module's naming scheme.
+
+        ``.resnets.``, ``downsamplers.0`` and ``upsamplers.0`` are rewritten to
+        ``.res_blocks.``, ``downsample`` and ``upsample``. Keys containing
+        ``"norm"`` that do not match a model parameter are dropped.
+
+        Returns:
+            Whatever the parent ``load_state_dict`` returns.
+        """
+        model_keys = set(name for name, _ in self.named_parameters())
+
+        key_mapping = {
+            ".resnets.": ".res_blocks.",
+            "downsamplers.0": "downsample",
+            "upsamplers.0": "upsample",
+        }
+
+        converted_state_dict = {}
+        for key, value in state_dict.items():
+            for k, v in key_mapping.items():
+                key = key.replace(k, v)
+
+            if "norm" in key and key not in model_keys:
+                logger.info(
+                    f"Removing key {key} from state_dict as it is not present in the model"
+                )
+                continue
+
+            converted_state_dict[key] = value
+
+        return super().load_state_dict(converted_state_dict, strict=strict)
+
+    def last_layer(self):
+        """Return the decoder's final layer.
+
+        This is ``decoder.conv_out`` (or its last element when it is an
+        ``nn.Sequential``), falling back to ``decoder.layers[-1]``.
+        """
+        if hasattr(self.decoder, "conv_out"):
+            if isinstance(self.decoder.conv_out, nn.Sequential):
+                last_layer = self.decoder.conv_out[-1]
+            else:
+                last_layer = self.decoder.conv_out
+        else:
+            last_layer = self.decoder.layers[-1]
+        return last_layer
+
+
+class Encoder(nn.Module):
+    r"""
+    The `Encoder` layer of a variational autoencoder that encodes its input into a latent representation.
+
+    Args:
+        dims (`int` or `Tuple[int, int]`, *optional*, defaults to 3):
+            Convolution dimensionality forwarded to `make_conv_nd` and to every sub-block.
+        in_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        out_channels (`int`, *optional*, defaults to 3):
+            The number of output channels.
+        block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
+            The number of output channels for each block.
+        layers_per_block (`int`, *optional*, defaults to 2):
+            The number of layers per block.
+        norm_num_groups (`int`, *optional*, defaults to 32):
+            The number of groups for normalization.
+        patch_size (`int`, *optional*, defaults to 1):
+            The patch size to use. Should be a power of 2.
+        norm_layer (`str`, *optional*, defaults to `group_norm`):
+            The normalization layer to use. Can be either `group_norm` or `pixel_norm`.
+        latent_log_var (`str`, *optional*, defaults to `per_channel`):
+            The number of channels for the log variance. Can be either `per_channel`, `uniform`, or `none`.
+        patch_size_t (`int`, *optional*):
+            Temporal patch size; falls back to `patch_size` when `None`.
+        add_channel_padding (`bool`, *optional*, defaults to `False`):
+            When set, `conv_in` is sized for `in_channels * patch_size**3` channels and the flag is
+            forwarded to `patchify`.
+    """
+
+    def __init__(
+        self,
+        dims: Union[int, Tuple[int, int]] = 3,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        block_out_channels: Tuple[int, ...] = (64,),
+        layers_per_block: int = 2,
+        norm_num_groups: int = 32,
+        patch_size: Union[int, Tuple[int]] = 1,
+        norm_layer: str = "group_norm",  # group_norm, pixel_norm
+        latent_log_var: str = "per_channel",
+        patch_size_t: Optional[int] = None,
+        add_channel_padding: Optional[bool] = False,
+    ):
+        super().__init__()
+        self.patch_size = patch_size
+        self.patch_size_t = patch_size_t if patch_size_t is not None else patch_size
+        self.add_channel_padding = add_channel_padding
+        self.layers_per_block = layers_per_block
+        self.norm_layer = norm_layer
+        self.latent_channels = out_channels
+        self.latent_log_var = latent_log_var
+        # conv_in sees patchified input, so its channel count grows with the
+        # patch volume.
+        if add_channel_padding:
+            in_channels = in_channels * self.patch_size**3
+        else:
+            in_channels = in_channels * self.patch_size_t * self.patch_size**2
+        self.in_channels = in_channels
+        output_channel = block_out_channels[0]
+
+        self.conv_in = make_conv_nd(
+            dims=dims,
+            in_channels=in_channels,
+            out_channels=output_channel,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+        )
+
+        self.down_blocks = nn.ModuleList([])
+
+        for i in range(len(block_out_channels)):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+
+            down_block = DownEncoderBlock3D(
+                dims=dims,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                num_layers=self.layers_per_block,
+                add_downsample=not is_final_block and 2**i >= patch_size,
+                resnet_eps=1e-6,
+                downsample_padding=0,
+                resnet_groups=norm_num_groups,
+                norm_layer=norm_layer,
+            )
+            self.down_blocks.append(down_block)
+
+        self.mid_block = UNetMidBlock3D(
+            dims=dims,
+            in_channels=block_out_channels[-1],
+            num_layers=self.layers_per_block,
+            resnet_eps=1e-6,
+            resnet_groups=norm_num_groups,
+            norm_layer=norm_layer,
+        )
+
+        # out
+        if norm_layer == "group_norm":
+            self.conv_norm_out = nn.GroupNorm(
+                num_channels=block_out_channels[-1],
+                num_groups=norm_num_groups,
+                eps=1e-6,
+            )
+        elif norm_layer == "pixel_norm":
+            self.conv_norm_out = PixelNorm()
+        self.conv_act = nn.SiLU()
+
+        conv_out_channels = out_channels
+        if latent_log_var == "per_channel":
+            conv_out_channels *= 2
+        elif latent_log_var == "uniform":
+            conv_out_channels += 1
+        elif latent_log_var != "none":
+            raise ValueError(f"Invalid latent_log_var: {latent_log_var}")
+        self.conv_out = make_conv_nd(
+            dims, block_out_channels[-1], conv_out_channels, 3, padding=1
+        )
+
+        self.gradient_checkpointing = False
+
+    @property
+    def downscale_factor(self):
+        """``2 ** (number of down blocks with a ``Downsample3D``) * patch_size``."""
+        return (
+            2
+            ** len(
+                [
+                    block
+                    for block in self.down_blocks
+                    if isinstance(block.downsample, Downsample3D)
+                ]
+            )
+            * self.patch_size
+        )
+
+    def forward(
+        self, sample: torch.FloatTensor, return_features=False
+    ) -> torch.FloatTensor:
+        r"""The forward method of the `Encoder` class.
+
+        Args:
+            sample: Input tensor. Temporal patching and downsampling are
+                disabled when ``sample.shape[2] == 1``.
+            return_features: When true, also return the output of every down
+                block plus the first ``latent_channels`` channels of the
+                final output.
+
+        Returns:
+            The encoded tensor, or ``(encoded, features)`` when
+            ``return_features`` is true.
+        """
+
+        downsample_in_time = sample.shape[2] != 1
+
+        # patchify
+        patch_size_t = self.patch_size_t if downsample_in_time else 1
+        sample = patchify(
+            sample,
+            patch_size_hw=self.patch_size,
+            patch_size_t=patch_size_t,
+            add_channel_padding=self.add_channel_padding,
+        )
+
+        sample = self.conv_in(sample)
+
+        use_checkpoint = self.gradient_checkpointing and self.training
+
+        def checkpoint_fn(module):
+            # ``checkpoint`` invokes its first argument immediately, so the
+            # module is bound here and the inputs are supplied by the caller
+            # of the returned callable.
+            if use_checkpoint:
+                return partial(
+                    torch.utils.checkpoint.checkpoint, module, use_reentrant=False
+                )
+            return module
+
+        if return_features:
+            features = []
+        for down_block in self.down_blocks:
+            sample = checkpoint_fn(down_block)(
+                sample, downsample_in_time=downsample_in_time
+            )
+            if return_features:
+                features.append(sample)
+
+        sample = checkpoint_fn(self.mid_block)(sample)
+
+        # post-process
+        sample = self.conv_norm_out(sample)
+        sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+
+        if self.latent_log_var == "uniform":
+            # conv_out produced latent_channels + 1 channels; repeating the
+            # last one (channels - 2) times brings the total to
+            # 2 * latent_channels.
+            last_channel = sample[:, -1:, ...]
+            num_dims = sample.dim()
+
+            if num_dims == 4:
+                # For shape (B, C, H, W)
+                repeated_last_channel = last_channel.repeat(
+                    1, sample.shape[1] - 2, 1, 1
+                )
+                sample = torch.cat([sample, repeated_last_channel], dim=1)
+            elif num_dims == 5:
+                # For shape (B, C, F, H, W)
+                repeated_last_channel = last_channel.repeat(
+                    1, sample.shape[1] - 2, 1, 1, 1
+                )
+                sample = torch.cat([sample, repeated_last_channel], dim=1)
+            else:
+                raise ValueError(f"Invalid input shape: {sample.shape}")
+
+        if return_features:
+            features.append(sample[:, : self.latent_channels, ...])
+            return sample, features
+        return sample
+
+
+class Decoder(nn.Module):
+    r"""
+    The `Decoder` layer of a variational autoencoder that decodes its latent representation into an output sample.
+
+    Args:
+        dims (`int` or `Tuple[int, int]`):
+            Convolution dimensionality forwarded to `make_conv_nd` and to every sub-block.
+        in_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        out_channels (`int`, *optional*, defaults to 3):
+            The number of output channels.
+        block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
+            The number of output channels for each block.
+        layers_per_block (`int`, *optional*, defaults to 2):
+            The number of layers per block.
+        norm_num_groups (`int`, *optional*, defaults to 32):
+            The number of groups for normalization.
+        patch_size (`int`, *optional*, defaults to 1):
+            The patch size to use. Should be a power of 2.
+        norm_layer (`str`, *optional*, defaults to `group_norm`):
+            The normalization layer to use. Can be either `group_norm` or `pixel_norm`.
+        patch_size_t (`int`, *optional*):
+            Temporal patch size; falls back to `patch_size` when `None`.
+        add_channel_padding (`bool`, *optional*, defaults to `False`):
+            When set, `conv_out` is sized for `out_channels * patch_size**3` channels and the flag is
+            forwarded to `unpatchify`.
+    """
+
+    def __init__(
+        self,
+        dims,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        block_out_channels: Tuple[int, ...] = (64,),
+        layers_per_block: int = 2,
+        norm_num_groups: int = 32,
+        patch_size: int = 1,
+        norm_layer: str = "group_norm",
+        patch_size_t: Optional[int] = None,
+        add_channel_padding: Optional[bool] = False,
+    ):
+        super().__init__()
+        self.patch_size = patch_size
+        self.patch_size_t = patch_size_t if patch_size_t is not None else patch_size
+        self.add_channel_padding = add_channel_padding
+        self.layers_per_block = layers_per_block
+        # conv_out emits patchified output, so its channel count grows with
+        # the patch volume.
+        if add_channel_padding:
+            out_channels = out_channels * self.patch_size**3
+        else:
+            out_channels = out_channels * self.patch_size_t * self.patch_size**2
+        self.out_channels = out_channels
+
+        self.conv_in = make_conv_nd(
+            dims,
+            in_channels,
+            block_out_channels[-1],
+            kernel_size=3,
+            stride=1,
+            padding=1,
+        )
+
+        # up_blocks is registered before mid_block to keep the sub-module
+        # order of the original implementation.
+        self.up_blocks = nn.ModuleList([])
+
+        self.mid_block = UNetMidBlock3D(
+            dims=dims,
+            in_channels=block_out_channels[-1],
+            num_layers=self.layers_per_block,
+            resnet_eps=1e-6,
+            resnet_groups=norm_num_groups,
+            norm_layer=norm_layer,
+        )
+
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        output_channel = reversed_block_out_channels[0]
+        for i in range(len(reversed_block_out_channels)):
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+
+            is_final_block = i == len(block_out_channels) - 1
+
+            up_block = UpDecoderBlock3D(
+                dims=dims,
+                num_layers=self.layers_per_block + 1,
+                in_channels=prev_output_channel,
+                out_channels=output_channel,
+                add_upsample=not is_final_block
+                and 2 ** (len(block_out_channels) - i - 1) > patch_size,
+                resnet_eps=1e-6,
+                resnet_groups=norm_num_groups,
+                norm_layer=norm_layer,
+            )
+            self.up_blocks.append(up_block)
+
+        if norm_layer == "group_norm":
+            self.conv_norm_out = nn.GroupNorm(
+                num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6
+            )
+        elif norm_layer == "pixel_norm":
+            self.conv_norm_out = PixelNorm()
+
+        self.conv_act = nn.SiLU()
+        self.conv_out = make_conv_nd(
+            dims, block_out_channels[0], out_channels, 3, padding=1
+        )
+
+        self.gradient_checkpointing = False
+
+    def forward(self, sample: torch.FloatTensor, target_shape) -> torch.FloatTensor:
+        r"""The forward method of the `Decoder` class.
+
+        Args:
+            sample: Latent tensor to decode.
+            target_shape: Shape of the tensor to reconstruct. Temporal
+                upsampling is enabled only when ``sample.shape[2]`` is smaller
+                than ``target_shape[2]``.
+
+        Returns:
+            The decoded, un-patchified tensor.
+        """
+        assert target_shape is not None, "target_shape must be provided"
+        upsample_in_time = sample.shape[2] < target_shape[2]
+
+        sample = self.conv_in(sample)
+
+        upscale_dtype = next(iter(self.up_blocks.parameters())).dtype
+
+        use_checkpoint = self.gradient_checkpointing and self.training
+
+        def checkpoint_fn(module):
+            # ``checkpoint`` invokes its first argument immediately, so the
+            # module is bound here and the inputs are supplied by the caller
+            # of the returned callable.
+            if use_checkpoint:
+                return partial(
+                    torch.utils.checkpoint.checkpoint, module, use_reentrant=False
+                )
+            return module
+
+        sample = checkpoint_fn(self.mid_block)(sample)
+        sample = sample.to(upscale_dtype)
+
+        for up_block in self.up_blocks:
+            sample = checkpoint_fn(up_block)(sample, upsample_in_time=upsample_in_time)
+
+        # post-process
+        sample = self.conv_norm_out(sample)
+        sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+
+        # un-patchify
+        patch_size_t = self.patch_size_t if upsample_in_time else 1
+        sample = unpatchify(
+            sample,
+            patch_size_hw=self.patch_size,
+            patch_size_t=patch_size_t,
+            add_channel_padding=self.add_channel_padding,
+        )
+
+        return sample
+
+
+class DownEncoderBlock3D(nn.Module):
+    """Stack of ``ResnetBlock3D`` layers followed by an optional ``Downsample3D``."""
+
+    def __init__(
+        self,
+        dims: Union[int, Tuple[int, int]],
+        in_channels: int,
+        out_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_groups: int = 32,
+        add_downsample: bool = True,
+        downsample_padding: int = 1,
+        norm_layer: str = "group_norm",
+    ):
+        super().__init__()
+
+        # Only the first layer sees ``in_channels``; later layers chain on
+        # ``out_channels``.
+        self.res_blocks = nn.ModuleList(
+            [
+                ResnetBlock3D(
+                    dims=dims,
+                    in_channels=in_channels if layer_idx == 0 else out_channels,
+                    out_channels=out_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    norm_layer=norm_layer,
+                )
+                for layer_idx in range(num_layers)
+            ]
+        )
+
+        self.downsample = (
+            Downsample3D(
+                dims,
+                out_channels,
+                out_channels=out_channels,
+                padding=downsample_padding,
+            )
+            if add_downsample
+            else Identity()
+        )
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, downsample_in_time
+    ) -> torch.FloatTensor:
+        """Apply every residual block, then the downsample stage."""
+        for block in self.res_blocks:
+            hidden_states = block(hidden_states)
+
+        return self.downsample(hidden_states, downsample_in_time=downsample_in_time)
+
+
+class UNetMidBlock3D(nn.Module):
+    """
+    Mid-block made of ``num_layers`` channel-preserving ``ResnetBlock3D`` layers.
+
+    Args:
+        dims (`int` or `Tuple[int, int]`): Forwarded to each `ResnetBlock3D`.
+        in_channels (`int`): The number of input (and output) channels.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout rate.
+        num_layers (`int`, *optional*, defaults to 1): The number of residual blocks.
+        resnet_eps (`float`, *optional*, 1e-6 ): The epsilon value for the resnet blocks.
+        resnet_groups (`int`, *optional*, defaults to 32):
+            The number of groups to use in the group normalization layers of the resnet blocks.
+            `None` selects `min(in_channels // 4, 32)`.
+        norm_layer (`str`, *optional*, defaults to `group_norm`): Forwarded to each `ResnetBlock3D`.
+
+    Returns:
+        `torch.FloatTensor`: The output of the last residual block.
+
+    """
+
+    def __init__(
+        self,
+        dims: Union[int, Tuple[int, int]],
+        in_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_groups: int = 32,
+        norm_layer: str = "group_norm",
+    ):
+        super().__init__()
+        if resnet_groups is None:
+            resnet_groups = min(in_channels // 4, 32)
+
+        layers = []
+        for _ in range(num_layers):
+            layers.append(
+                ResnetBlock3D(
+                    dims=dims,
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    norm_layer=norm_layer,
+                )
+            )
+        self.res_blocks = nn.ModuleList(layers)
+
+    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
+        """Run the input through every residual block in order."""
+        for block in self.res_blocks:
+            hidden_states = block(hidden_states)
+
+        return hidden_states
+
+
+class UpDecoderBlock3D(nn.Module):
+    """Stack of ``ResnetBlock3D`` layers followed by an optional ``Upsample3D``."""
+
+    def __init__(
+        self,
+        dims: Union[int, Tuple[int, int]],
+        in_channels: int,
+        out_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_groups: int = 32,
+        add_upsample: bool = True,
+        norm_layer: str = "group_norm",
+    ):
+        super().__init__()
+
+        # Only the first layer sees ``in_channels``; later layers chain on
+        # ``out_channels``.
+        self.res_blocks = nn.ModuleList(
+            [
+                ResnetBlock3D(
+                    dims=dims,
+                    in_channels=in_channels if layer_idx == 0 else out_channels,
+                    out_channels=out_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    norm_layer=norm_layer,
+                )
+                for layer_idx in range(num_layers)
+            ]
+        )
+
+        self.upsample = (
+            Upsample3D(dims=dims, channels=out_channels, out_channels=out_channels)
+            if add_upsample
+            else Identity()
+        )
+
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, upsample_in_time=True
+    ) -> torch.FloatTensor:
+        """Apply every residual block, then the upsample stage."""
+        for block in self.res_blocks:
+            hidden_states = block(hidden_states)
+
+        return self.upsample(hidden_states, upsample_in_time=upsample_in_time)
+
+
+class ResnetBlock3D(nn.Module):
+    r"""
+    Residual block: norm -> SiLU -> conv, twice, plus a shortcut connection.
+
+    Parameters:
+        dims (`int` or `Tuple[int, int]`): Forwarded to `make_conv_nd` / `make_linear_nd`.
+        in_channels (`int`): The number of channels in the input.
+        out_channels (`int`, *optional*, default to be `None`):
+            The number of output channels for the first conv layer. If None, same as `in_channels`.
+        conv_shortcut (`bool`, *optional*, defaults to `False`): Stored as `use_conv_shortcut`.
+        dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
+        groups (`int`, *optional*, default to `32`): The number of groups to use for the group normalization layers.
+        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
+        norm_layer (`str`, *optional*, defaults to `group_norm`): Either `group_norm` or `pixel_norm`.
+    """
+
+    def __init__(
+        self,
+        dims: Union[int, Tuple[int, int]],
+        in_channels: int,
+        out_channels: Optional[int] = None,
+        conv_shortcut: bool = False,
+        dropout: float = 0.0,
+        groups: int = 32,
+        eps: float = 1e-6,
+        norm_layer: str = "group_norm",
+    ):
+        super().__init__()
+        if out_channels is None:
+            out_channels = in_channels
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+
+        wants_group_norm = norm_layer == "group_norm"
+        wants_pixel_norm = norm_layer == "pixel_norm"
+
+        # First stage: norm -> activation -> conv.
+        if wants_group_norm:
+            self.norm1 = torch.nn.GroupNorm(
+                num_groups=groups, num_channels=in_channels, eps=eps, affine=True
+            )
+        elif wants_pixel_norm:
+            self.norm1 = PixelNorm()
+
+        self.non_linearity = nn.SiLU()
+
+        self.conv1 = make_conv_nd(
+            dims, in_channels, out_channels, kernel_size=3, stride=1, padding=1
+        )
+
+        # Second stage: norm -> activation -> dropout -> conv.
+        if wants_group_norm:
+            self.norm2 = torch.nn.GroupNorm(
+                num_groups=groups, num_channels=out_channels, eps=eps, affine=True
+            )
+        elif wants_pixel_norm:
+            self.norm2 = PixelNorm()
+
+        self.dropout = torch.nn.Dropout(dropout)
+
+        self.conv2 = make_conv_nd(
+            dims, out_channels, out_channels, kernel_size=3, stride=1, padding=1
+        )
+
+        # A projection is only needed when the channel count changes.
+        if in_channels != out_channels:
+            self.conv_shortcut = make_linear_nd(
+                dims=dims, in_channels=in_channels, out_channels=out_channels
+            )
+        else:
+            self.conv_shortcut = nn.Identity()
+
+    def forward(
+        self,
+        input_tensor: torch.FloatTensor,
+    ) -> torch.FloatTensor:
+        """Return ``shortcut(input) + residual(input)``."""
+        residual = self.conv1(self.non_linearity(self.norm1(input_tensor)))
+        residual = self.non_linearity(self.norm2(residual))
+        residual = self.conv2(self.dropout(residual))
+
+        shortcut = self.conv_shortcut(input_tensor)
+        return shortcut + residual
+
+
+class Downsample3D(nn.Module):
+    """Stride-2 convolution used to downsample feature maps."""
+
+    def __init__(
+        self,
+        dims,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int = 3,
+        padding: int = 1,
+    ):
+        super().__init__()
+        self.padding = padding
+        self.in_channels = in_channels
+        self.dims = dims
+        self.conv = make_conv_nd(
+            dims=dims,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=2,
+            padding=padding,
+        )
+
+    def forward(self, x, downsample_in_time=True):
+        """Downsample ``x``.
+
+        With ``padding == 0`` the input is first zero-padded by one element at
+        the end of each downsampled axis. For ``dims == (2, 1)`` the
+        convolution is asked to skip its time component when
+        ``downsample_in_time`` is false.
+        """
+        if self.padding == 0:
+            if self.dims == 2:
+                pad_spec = (0, 1, 0, 1)
+            else:
+                time_pad = 1 if downsample_in_time else 0
+                pad_spec = (0, 1, 0, 1, 0, time_pad)
+
+            x = functional.pad(x, pad_spec, mode="constant", value=0)
+
+        skip_time = self.dims == (2, 1) and not downsample_in_time
+        if skip_time:
+            return self.conv(x, skip_time_conv=True)
+
+        return self.conv(x)
+
+
+class Upsample3D(nn.Module):
+    """
+    An upsampling layer for 3D tensors of shape (B, C, D, H, W).
+
+    :param dims: convolution dimensionality forwarded to ``make_conv_nd``.
+    :param channels: channels in the inputs.
+    :param out_channels: channels in the outputs; defaults to ``channels``.
+    """
+
+    def __init__(self, dims, channels, out_channels=None):
+        super().__init__()
+        self.dims = dims
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        # Use the resolved value: the raw ``out_channels`` argument may be
+        # ``None``.
+        self.conv = make_conv_nd(
+            dims, channels, self.out_channels, kernel_size=3, padding=1, bias=True
+        )
+
+    def forward(self, x, upsample_in_time):
+        """Nearest-neighbour upsample ``x`` by 2 in H and W, then convolve.
+
+        For ``dims != 2`` the D axis is also doubled when ``upsample_in_time``
+        is true. For ``dims == (2, 1)`` without temporal upsampling the
+        convolution is asked to skip its time component.
+        """
+        if self.dims == 2:
+            x = functional.interpolate(
+                x, (x.shape[2] * 2, x.shape[3] * 2), mode="nearest"
+            )
+        else:
+            time_scale_factor = 2 if upsample_in_time else 1
+            b, c, d, h, w = x.shape
+            x = rearrange(x, "b c d h w -> (b d) c h w")
+            # height and width interpolate
+            x = functional.interpolate(
+                x, (x.shape[2] * 2, x.shape[3] * 2), mode="nearest"
+            )
+            _, _, h, w = x.shape
+
+            if not upsample_in_time and self.dims == (2, 1):
+                x = rearrange(x, "(b d) c h w -> b c d h w ", b=b, h=h, w=w)
+                return self.conv(x, skip_time_conv=True)
+
+            # Second ** upsampling ** which is essentially treated as a 1D convolution across the 'd' dimension
+            x = rearrange(x, "(b d) c h w -> (b h w) c 1 d", b=b)
+
+            # (b h w) c 1 d
+            new_d = x.shape[-1] * time_scale_factor
+            x = functional.interpolate(x, (1, new_d), mode="nearest")
+            # (b h w) c 1 new_d
+            x = rearrange(
+                x, "(b h w) c 1 new_d -> b c new_d h w", b=b, h=h, w=w, new_d=new_d
+            )
+            # b c d h w
+
+        return self.conv(x)
+
+
+def patchify(x, patch_size_hw, patch_size_t=1, add_channel_padding=False):
+    """Fold spatial (and temporal) patches of ``x`` into the channel axis.
+
+    Args:
+        x: 4-D ``(b, c, h, w)`` or 5-D ``(b, c, f, h, w)`` tensor.
+        patch_size_hw: Patch edge length along height and width.
+        patch_size_t: Patch length along the frame axis (5-D input only).
+        add_channel_padding: For 5-D input with ``patch_size_hw >
+            patch_size_t``, request zero channel padding even when
+            ``patch_size_t == 1``.
+
+    Returns:
+        ``x`` itself when both patch sizes are 1, otherwise the patchified
+        tensor, with zero channels prepended when padding applies.
+
+    Raises:
+        ValueError: If ``x`` is neither 4-D nor 5-D.
+    """
+    if patch_size_hw == 1 and patch_size_t == 1:
+        return x
+
+    rank = x.dim()
+    if rank == 4:
+        x = rearrange(
+            x, "b c (h q) (w r) -> b (c r q) h w", q=patch_size_hw, r=patch_size_hw
+        )
+    elif rank == 5:
+        x = rearrange(
+            x,
+            "b c (f p) (h q) (w r) -> b (c p r q) f h w",
+            p=patch_size_t,
+            q=patch_size_hw,
+            r=patch_size_hw,
+        )
+    else:
+        raise ValueError(f"Invalid input shape: {x.shape}")
+
+    needs_padding = (
+        rank == 5
+        and patch_size_hw > patch_size_t
+        and (patch_size_t > 1 or add_channel_padding)
+    )
+    if not needs_padding:
+        return x
+
+    # Grow the channel axis by a factor of patch_size_hw // patch_size_t,
+    # putting the zero channels first.
+    batch, channels, frames, height, width = x.shape
+    channels_to_pad = channels * (patch_size_hw // patch_size_t) - channels
+    padding_zeros = torch.zeros(
+        batch,
+        channels_to_pad,
+        frames,
+        height,
+        width,
+        device=x.device,
+        dtype=x.dtype,
+    )
+    return torch.cat([padding_zeros, x], dim=1)
+
+
+def unpatchify(x, patch_size_hw, patch_size_t=1, add_channel_padding=False):
+    """Unfold channel-packed patches of ``x`` back onto the spatial/temporal axes.
+
+    Args:
+        x: 4-D ``(b, c, h, w)`` or 5-D ``(b, c, f, h, w)`` tensor.
+        patch_size_hw: Patch edge length along height and width.
+        patch_size_t: Patch length along the frame axis (5-D input only).
+        add_channel_padding: For 5-D input with ``patch_size_hw >
+            patch_size_t``, drop trailing channels even when
+            ``patch_size_t == 1``.
+
+    Returns:
+        ``x`` itself when both patch sizes are 1, otherwise the un-patchified
+        tensor. Inputs that are neither 4-D nor 5-D are returned without
+        rearranging.
+    """
+    if patch_size_hw == 1 and patch_size_t == 1:
+        return x
+
+    rank = x.dim()
+    drops_channels = (
+        rank == 5
+        and patch_size_hw > patch_size_t
+        and (patch_size_t > 1 or add_channel_padding)
+    )
+    if drops_channels:
+        # Keep only the leading patch_size_t / patch_size_hw share of channels.
+        channels_to_keep = int(x.shape[1] * (patch_size_t / patch_size_hw))
+        x = x[:, :channels_to_keep, :, :, :]
+
+    if rank == 4:
+        return rearrange(
+            x, "b (c r q) h w -> b c (h q) (w r)", q=patch_size_hw, r=patch_size_hw
+        )
+    if rank == 5:
+        return rearrange(
+            x,
+            "b (c p r q) f h w -> b c (f p) (h q) (w r)",
+            p=patch_size_t,
+            q=patch_size_hw,
+            r=patch_size_hw,
+        )
+    return x
+
+
+def create_video_autoencoder_config(
+    latent_channels: int = 4,
+):
+    """Return the config dict for an unpatchified ``(2, 1)``-dims ``VideoAutoencoder``.
+
+    Args:
+        latent_channels: Number of channels in the latent representation.
+    """
+    # dims: 2 for Conv2, 3 for Conv3d, (2, 1) for Conv2d followed by Conv1d
+    dims = (2, 1)
+    # Output channels of each encoder / decoder inner block
+    block_out_channels = [128, 256, 512, 512]
+
+    return {
+        "_class_name": "VideoAutoencoder",
+        "dims": dims,
+        "in_channels": 3,  # Number of input color channels (e.g., RGB)
+        "out_channels": 3,  # Number of output color channels
+        "latent_channels": latent_channels,
+        "block_out_channels": block_out_channels,
+        "patch_size": 1,
+    }
+
+
+def create_video_autoencoder_pathify4x4x4_config(
+    latent_channels: int = 4,
+):
+    """Return the config dict for a patch-size-4 ``(2, 1)``-dims ``VideoAutoencoder``.
+
+    The config uses ``latent_log_var="uniform"``.
+
+    Args:
+        latent_channels: Number of channels in the latent representation.
+    """
+    # dims: 2 for Conv2, 3 for Conv3d, (2, 1) for Conv2d followed by Conv1d
+    dims = (2, 1)
+    # Output channels of each encoder / decoder inner block
+    block_out_channels = [512] * 4
+
+    return {
+        "_class_name": "VideoAutoencoder",
+        "dims": dims,
+        "in_channels": 3,  # Number of input color channels (e.g., RGB)
+        "out_channels": 3,  # Number of output color channels
+        "latent_channels": latent_channels,
+        "block_out_channels": block_out_channels,
+        "patch_size": 4,
+        "latent_log_var": "uniform",
+    }
+
+
+def create_video_autoencoder_pathify4x4_config(
+    latent_channels: int = 4,
+):
+    """Return the config dict for a patch-size-4, 2-D ``VideoAutoencoder``.
+
+    The config uses ``norm_layer="pixel_norm"``.
+
+    Args:
+        latent_channels: Number of channels in the latent representation.
+    """
+    # Output channels of each encoder / decoder inner block
+    block_out_channels = [512] * 4
+
+    return {
+        "_class_name": "VideoAutoencoder",
+        # 2 for Conv2, 3 for Conv3d, (2, 1) for Conv2d followed by Conv1d
+        "dims": 2,
+        "in_channels": 3,  # Number of input color channels (e.g., RGB)
+        "out_channels": 3,  # Number of output color channels
+        "latent_channels": latent_channels,
+        "block_out_channels": block_out_channels,
+        "patch_size": 4,
+        "norm_layer": "pixel_norm",
+    }
+
+
+def test_vae_patchify_unpatchify():
+    """Check that ``unpatchify`` inverts ``patchify`` for 4x4x4 patches."""
+    import torch
+
+    patch_kwargs = dict(patch_size_hw=4, patch_size_t=4)
+    original = torch.randn(2, 3, 8, 64, 64)
+    restored = unpatchify(patchify(original, **patch_kwargs), **patch_kwargs)
+    assert torch.allclose(original, restored)
+
+
+def demo_video_autoencoder_forward_backward():
+    """Run one encode/decode/backward pass on random data and print diagnostics."""
+    # Build the model from the 4x4x4 patchified configuration.
+    video_autoencoder = VideoAutoencoder.from_config(
+        create_video_autoencoder_pathify4x4x4_config()
+    )
+
+    print(video_autoencoder)
+
+    # Report the model size.
+    total_params = sum(param.numel() for param in video_autoencoder.parameters())
+    print(f"Total number of parameters in VideoAutoencoder: {total_params:,}")
+
+    # Mock batch of shape (batch_size, channels, depth, height, width):
+    # 2 videos, 3 color channels, 8 frames, 64x64 pixels per frame.
+    input_videos = torch.randn(2, 3, 8, 64, 64)
+
+    # Encode, taking the mode of the latent distribution.
+    latent = video_autoencoder.encode(input_videos).latent_dist.mode()
+    print(f"input shape={input_videos.shape}")
+    print(f"latent shape={latent.shape}")
+
+    # Decode back to the input shape.
+    decoded = video_autoencoder.decode(latent, target_shape=input_videos.shape)
+    reconstructed_videos = decoded.sample
+
+    print(f"reconstructed shape={reconstructed_videos.shape}")
+
+    # Mean squared reconstruction error, then backpropagate through the model.
+    loss = torch.nn.functional.mse_loss(input_videos, reconstructed_videos)
+    loss.backward()
+
+    print(f"Demo completed with loss: {loss.item()}")
+
+
+# Run the forward/backward demo when this module is executed as a script.
+if __name__ == "__main__":
+ demo_video_autoencoder_forward_backward()
diff --git a/ltx_video/models/transformers/__init__.py b/ltx_video/models/transformers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ltx_video/models/transformers/attention.py b/ltx_video/models/transformers/attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7b4555d683d23fa5d9e5cfbb3345fc7d4c68733
--- /dev/null
+++ b/ltx_video/models/transformers/attention.py
@@ -0,0 +1,1323 @@
+import inspect
+from importlib import import_module
+from typing import Any, Dict, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from diffusers.models.activations import GEGLU, GELU, ApproximateGELU
+from diffusers.models.attention import _chunked_feed_forward
+from diffusers.models.attention_processor import (
+ LoRAAttnAddedKVProcessor,
+ LoRAAttnProcessor,
+ LoRAAttnProcessor2_0,
+ LoRAXFormersAttnProcessor,
+ SpatialNorm,
+)
+from diffusers.models.lora import LoRACompatibleLinear
+from diffusers.models.normalization import RMSNorm
+from diffusers.utils import deprecate, logging
+from diffusers.utils.torch_utils import maybe_allow_in_graph
+from einops import rearrange
+from torch import nn
+from wan.modules.attention import pay_attention
+from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
+
+try:
+ from torch_xla.experimental.custom_kernel import flash_attention
+except ImportError:
+ # workaround for automatic tests. Currently this function is manually patched
+ # to the torch_xla lib on setup of container
+ pass
+
+# code adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py
+
+logger = logging.get_logger(__name__)
+
+def reshape_hidden_states(hidden_states, latent_frames):
+ return hidden_states.reshape(hidden_states.shape[0], latent_frames, -1, hidden_states.shape[-1] )
+
+
+def restore_hidden_states_shape(hidden_states):
+ return hidden_states.reshape(hidden_states.shape[0], -1, hidden_states.shape[-1] )
+
+
+@maybe_allow_in_graph
+class BasicTransformerBlock(nn.Module):
+ r"""
+ A basic Transformer block.
+
+ Parameters:
+ dim (`int`): The number of channels in the input and output.
+ num_attention_heads (`int`): The number of heads to use for multi-head attention.
+ attention_head_dim (`int`): The number of channels in each head.
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+ cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+ num_embeds_ada_norm (`int`, *optional*):
+ The number of diffusion steps used during training. See `Transformer2DModel`.
+ attention_bias (`bool`, *optional*, defaults to `False`):
+ Configure if the attentions should contain a bias parameter.
+ only_cross_attention (`bool`, *optional*):
+ Whether to use only cross-attention layers. In this case two cross attention layers are used.
+ double_self_attention (`bool`, *optional*):
+ Whether to use two self-attention layers. In this case no cross attention layers are used.
+ upcast_attention (`bool`, *optional*):
+ Whether to upcast the attention computation to float32. This is useful for mixed precision training.
+ norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
+ Whether to use learnable elementwise affine parameters for normalization.
+ qk_norm (`str`, *optional*, defaults to None):
+ Set to 'layer_norm' or `rms_norm` to perform query and key normalization.
+ adaptive_norm (`str`, *optional*, defaults to `"single_scale_shift"`):
+ The type of adaptive norm to use. Can be `"single_scale_shift"`, `"single_scale"` or "none".
+ standardization_norm (`str`, *optional*, defaults to `"layer_norm"`):
+ The type of pre-normalization to use. Can be `"layer_norm"` or `"rms_norm"`.
+ final_dropout (`bool` *optional*, defaults to False):
+ Whether to apply a final dropout after the last feed-forward layer.
+ attention_type (`str`, *optional*, defaults to `"default"`):
+ The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
+ positional_embeddings (`str`, *optional*, defaults to `None`):
+ The type of positional embeddings to apply to.
+ num_positional_embeddings (`int`, *optional*, defaults to `None`):
+ The maximum number of positional embeddings to apply.
+ """
+
+ def __init__(
+ self,
+ dim: int,
+ num_attention_heads: int,
+ attention_head_dim: int,
+ dropout=0.0,
+ cross_attention_dim: Optional[int] = None,
+ activation_fn: str = "geglu",
+ num_embeds_ada_norm: Optional[int] = None, # pylint: disable=unused-argument
+ attention_bias: bool = False,
+ only_cross_attention: bool = False,
+ double_self_attention: bool = False,
+ upcast_attention: bool = False,
+ norm_elementwise_affine: bool = True,
+ adaptive_norm: str = "single_scale_shift", # 'single_scale_shift', 'single_scale' or 'none'
+ standardization_norm: str = "layer_norm", # 'layer_norm' or 'rms_norm'
+ norm_eps: float = 1e-5,
+ qk_norm: Optional[str] = None,
+ final_dropout: bool = False,
+ attention_type: str = "default", # pylint: disable=unused-argument
+ ff_inner_dim: Optional[int] = None,
+ ff_bias: bool = True,
+ attention_out_bias: bool = True,
+ use_tpu_flash_attention: bool = False,
+ use_rope: bool = False,
+ ):
+ super().__init__()
+ self.only_cross_attention = only_cross_attention
+ self.use_tpu_flash_attention = use_tpu_flash_attention
+ self.adaptive_norm = adaptive_norm
+
+ assert standardization_norm in ["layer_norm", "rms_norm"]
+ assert adaptive_norm in ["single_scale_shift", "single_scale", "none"]
+
+ make_norm_layer = (
+ nn.LayerNorm if standardization_norm == "layer_norm" else RMSNorm
+ )
+
+ # Define 3 blocks. Each block has its own normalization layer.
+ # 1. Self-Attn
+ self.norm1 = make_norm_layer(
+ dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps
+ )
+
+ self.attn1 = Attention(
+ query_dim=dim,
+ heads=num_attention_heads,
+ dim_head=attention_head_dim,
+ dropout=dropout,
+ bias=attention_bias,
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
+ upcast_attention=upcast_attention,
+ out_bias=attention_out_bias,
+ use_tpu_flash_attention=use_tpu_flash_attention,
+ qk_norm=qk_norm,
+ use_rope=use_rope,
+ )
+
+ # 2. Cross-Attn
+ if cross_attention_dim is not None or double_self_attention:
+ self.attn2 = Attention(
+ query_dim=dim,
+ cross_attention_dim=(
+ cross_attention_dim if not double_self_attention else None
+ ),
+ heads=num_attention_heads,
+ dim_head=attention_head_dim,
+ dropout=dropout,
+ bias=attention_bias,
+ upcast_attention=upcast_attention,
+ out_bias=attention_out_bias,
+ use_tpu_flash_attention=use_tpu_flash_attention,
+ qk_norm=qk_norm,
+ use_rope=use_rope,
+ ) # is self-attn if encoder_hidden_states is none
+
+ if adaptive_norm == "none":
+ self.attn2_norm = make_norm_layer(
+ dim, norm_eps, norm_elementwise_affine
+ )
+ else:
+ self.attn2 = None
+ self.attn2_norm = None
+
+ self.norm2 = make_norm_layer(dim, norm_eps, norm_elementwise_affine)
+
+ # 3. Feed-forward
+ self.ff = FeedForward(
+ dim,
+ dropout=dropout,
+ activation_fn=activation_fn,
+ final_dropout=final_dropout,
+ inner_dim=ff_inner_dim,
+ bias=ff_bias,
+ )
+
+ # 4. Scale-shift for PixArt-Alpha.
+ if adaptive_norm != "none":
+ num_ada_params = 4 if adaptive_norm == "single_scale" else 6
+ self.scale_shift_table = nn.Parameter(
+ torch.randn(num_ada_params, dim) / dim**0.5
+ )
+
+ # let chunk size default to None
+ self._chunk_size = None
+ self._chunk_dim = 0
+
+ def set_use_tpu_flash_attention(self):
+ r"""
+ Function sets the flag in this object and propagates down the children. The flag will enforce the usage of TPU
+ attention kernel.
+ """
+ self.use_tpu_flash_attention = True
+ self.attn1.set_use_tpu_flash_attention()
+ self.attn2.set_use_tpu_flash_attention()
+
+ def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
+ # Sets chunk feed-forward
+ self._chunk_size = chunk_size
+ self._chunk_dim = dim
+
+ def forward(
+ self,
+ hidden_states: torch.FloatTensor,
+ freqs_cis: Optional[Tuple[torch.FloatTensor, torch.FloatTensor]] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ timestep: Optional[torch.LongTensor] = None,
+ cross_attention_kwargs: Dict[str, Any] = None,
+ class_labels: Optional[torch.LongTensor] = None,
+ added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+ skip_layer_mask: Optional[torch.Tensor] = None,
+ skip_layer_strategy: Optional[SkipLayerStrategy] = None,
+ ) -> torch.FloatTensor:
+ if cross_attention_kwargs is not None:
+ if cross_attention_kwargs.get("scale", None) is not None:
+ logger.warning(
+ "Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored."
+ )
+
+ # Notice that normalization is always applied before the real computation in the following blocks.
+ # 0. Self-Attention
+ batch_size = hidden_states.shape[0]
+ if skip_layer_mask != None and skip_layer_mask.flatten().min() == 1.0:
+ skip_layer_mask = None
+
+ original_hidden_states = hidden_states
+
+ norm_hidden_states = self.norm1(hidden_states)
+
+ # Apply ada_norm_single
+ if self.adaptive_norm in ["single_scale_shift", "single_scale"]:
+ assert timestep.ndim == 3 # [batch, 1 or num_tokens, embedding_dim]
+ num_ada_params = self.scale_shift_table.shape[0]
+ ada_values = self.scale_shift_table[None, None] + timestep.reshape(
+ batch_size, timestep.shape[1], num_ada_params, -1
+ )
+ if self.adaptive_norm == "single_scale_shift":
+ ada_values = ada_values.unsqueeze(-2)
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
+ ada_values.unbind(dim=2)
+ )
+ norm_hidden_states = reshape_hidden_states(norm_hidden_states, scale_msa.shape[1])
+ norm_hidden_states *= 1 + scale_msa
+ norm_hidden_states += shift_msa
+ # norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
+ norm_hidden_states = restore_hidden_states_shape(norm_hidden_states)
+
+ else:
+ scale_msa, gate_msa, scale_mlp, gate_mlp = ada_values.unbind(dim=2)
+ norm_hidden_states = norm_hidden_states * (1 + scale_msa)
+ elif self.adaptive_norm == "none":
+ scale_msa, gate_msa, scale_mlp, gate_mlp = None, None, None, None
+ else:
+ raise ValueError(f"Unknown adaptive norm type: {self.adaptive_norm}")
+
+ norm_hidden_states = norm_hidden_states.squeeze(
+ 1
+ ) # TODO: Check if this is needed
+
+ # 1. Copy cross_attention_kwargs (so the caller's dict is not shared) and run self-attention
+ cross_attention_kwargs = (
+ cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
+ )
+ norm_hidden_states_wrapper = [norm_hidden_states]
+ del norm_hidden_states
+ attn_output = self.attn1(
+ norm_hidden_states_wrapper,
+ freqs_cis=freqs_cis,
+ encoder_hidden_states=(
+ encoder_hidden_states if self.only_cross_attention else None
+ ),
+ attention_mask=attention_mask,
+ skip_layer_mask=skip_layer_mask,
+ skip_layer_strategy=skip_layer_strategy,
+ **cross_attention_kwargs,
+ )
+ if gate_msa is not None:
+ attn_output = reshape_hidden_states(attn_output, gate_msa.shape[1])
+ # attn_output = gate_msa * attn_output
+ attn_output *= gate_msa
+ attn_output = restore_hidden_states_shape(attn_output)
+
+ hidden_states += attn_output
+ del attn_output
+ if hidden_states.ndim == 4:
+ hidden_states = hidden_states.squeeze(1)
+
+ # 3. Cross-Attention
+ if self.attn2 is not None:
+ if self.adaptive_norm == "none":
+ attn_input = self.attn2_norm(hidden_states)
+ else:
+ attn_input = hidden_states
+
+ attn_input_wrapper = [attn_input]
+ del attn_input
+
+ attn_output = self.attn2(
+ attn_input_wrapper,
+ freqs_cis=freqs_cis,
+ encoder_hidden_states=encoder_hidden_states,
+ attention_mask=encoder_attention_mask,
+ **cross_attention_kwargs,
+ )
+ hidden_states += attn_output
+ del attn_output
+
+ # 4. Feed-forward
+ norm_hidden_states = self.norm2(hidden_states)
+ if self.adaptive_norm == "single_scale_shift":
+ norm_hidden_states = reshape_hidden_states(norm_hidden_states, scale_mlp.shape[1])
+ # norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
+ norm_hidden_states *= 1 + scale_mlp
+ norm_hidden_states += shift_mlp
+ norm_hidden_states = restore_hidden_states_shape(norm_hidden_states)
+ elif self.adaptive_norm == "single_scale":
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp)
+ elif self.adaptive_norm == "none":
+ pass
+ else:
+ raise ValueError(f"Unknown adaptive norm type: {self.adaptive_norm}")
+
+ if self._chunk_size is not None:
+ # "feed_forward_chunk_size" can be used to save memory
+ ff_output = _chunked_feed_forward(
+ self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size
+ )
+ else:
+ h_shape = norm_hidden_states.shape
+ norm_hidden_states = norm_hidden_states.view(-1, h_shape[-1])
+ chunk_size = int(norm_hidden_states.shape[0]/4)
+ chunks =torch.split(norm_hidden_states, chunk_size)
+ for h_chunk in chunks:
+ mlp_chunk = self.ff.net[0](h_chunk)
+ h_chunk[...] = self.ff.net[2](mlp_chunk)
+ del mlp_chunk
+ ff_output = norm_hidden_states.view(h_shape)
+ del norm_hidden_states
+
+ if gate_mlp is not None:
+ ff_output = reshape_hidden_states(ff_output, gate_mlp.shape[1])
+ # ff_output = gate_mlp * ff_output
+ ff_output *= gate_mlp
+ ff_output = restore_hidden_states_shape(ff_output)
+
+ hidden_states = ff_output + hidden_states
+ if hidden_states.ndim == 4:
+ hidden_states = hidden_states.squeeze(1)
+
+ if (
+ skip_layer_mask is not None
+ and skip_layer_strategy == SkipLayerStrategy.TransformerBlock
+ ):
+ skip_layer_mask = skip_layer_mask.view(-1, 1, 1)
+ hidden_states = hidden_states * skip_layer_mask + original_hidden_states * (
+ 1.0 - skip_layer_mask
+ )
+
+ return hidden_states
+
+
+@maybe_allow_in_graph
+class Attention(nn.Module):
+ r"""
+ A cross attention layer.
+
+ Parameters:
+ query_dim (`int`):
+ The number of channels in the query.
+ cross_attention_dim (`int`, *optional*):
+ The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`.
+ heads (`int`, *optional*, defaults to 8):
+ The number of heads to use for multi-head attention.
+ dim_head (`int`, *optional*, defaults to 64):
+ The number of channels in each head.
+ dropout (`float`, *optional*, defaults to 0.0):
+ The dropout probability to use.
+ bias (`bool`, *optional*, defaults to False):
+ Set to `True` for the query, key, and value linear layers to contain a bias parameter.
+ upcast_attention (`bool`, *optional*, defaults to False):
+ Set to `True` to upcast the attention computation to `float32`.
+ upcast_softmax (`bool`, *optional*, defaults to False):
+ Set to `True` to upcast the softmax computation to `float32`.
+ cross_attention_norm (`str`, *optional*, defaults to `None`):
+ The type of normalization to use for the cross attention. Can be `None`, `layer_norm`, or `group_norm`.
+ cross_attention_norm_num_groups (`int`, *optional*, defaults to 32):
+ The number of groups to use for the group norm in the cross attention.
+ added_kv_proj_dim (`int`, *optional*, defaults to `None`):
+ The number of channels to use for the added key and value projections. If `None`, no projection is used.
+ norm_num_groups (`int`, *optional*, defaults to `None`):
+ The number of groups to use for the group norm in the attention.
+ spatial_norm_dim (`int`, *optional*, defaults to `None`):
+ The number of channels to use for the spatial normalization.
+ out_bias (`bool`, *optional*, defaults to `True`):
+ Set to `True` to use a bias in the output linear layer.
+ scale_qk (`bool`, *optional*, defaults to `True`):
+ Set to `True` to scale the query and key by `1 / sqrt(dim_head)`.
+ qk_norm (`str`, *optional*, defaults to None):
+ Set to 'layer_norm' or `rms_norm` to perform query and key normalization.
+ only_cross_attention (`bool`, *optional*, defaults to `False`):
+ Set to `True` to only use cross attention and not added_kv_proj_dim. Can only be set to `True` if
+ `added_kv_proj_dim` is not `None`.
+ eps (`float`, *optional*, defaults to 1e-5):
+ An additional value added to the denominator in group normalization that is used for numerical stability.
+ rescale_output_factor (`float`, *optional*, defaults to 1.0):
+ A factor to rescale the output by dividing it with this value.
+ residual_connection (`bool`, *optional*, defaults to `False`):
+ Set to `True` to add the residual connection to the output.
+ _from_deprecated_attn_block (`bool`, *optional*, defaults to `False`):
+ Set to `True` if the attention block is loaded from a deprecated state dict.
+ processor (`AttnProcessor`, *optional*, defaults to `None`):
+ The attention processor to use. If `None`, defaults to `AttnProcessor2_0` if `torch 2.x` is used and
+ `AttnProcessor` otherwise.
+ """
+
+ def __init__(
+ self,
+ query_dim: int,
+ cross_attention_dim: Optional[int] = None,
+ heads: int = 8,
+ dim_head: int = 64,
+ dropout: float = 0.0,
+ bias: bool = False,
+ upcast_attention: bool = False,
+ upcast_softmax: bool = False,
+ cross_attention_norm: Optional[str] = None,
+ cross_attention_norm_num_groups: int = 32,
+ added_kv_proj_dim: Optional[int] = None,
+ norm_num_groups: Optional[int] = None,
+ spatial_norm_dim: Optional[int] = None,
+ out_bias: bool = True,
+ scale_qk: bool = True,
+ qk_norm: Optional[str] = None,
+ only_cross_attention: bool = False,
+ eps: float = 1e-5,
+ rescale_output_factor: float = 1.0,
+ residual_connection: bool = False,
+ _from_deprecated_attn_block: bool = False,
+ processor: Optional["AttnProcessor"] = None,
+ out_dim: int = None,
+ use_tpu_flash_attention: bool = False,
+ use_rope: bool = False,
+ ):
+ super().__init__()
+ self.inner_dim = out_dim if out_dim is not None else dim_head * heads
+ self.query_dim = query_dim
+ self.use_bias = bias
+ self.is_cross_attention = cross_attention_dim is not None
+ self.cross_attention_dim = (
+ cross_attention_dim if cross_attention_dim is not None else query_dim
+ )
+ self.upcast_attention = upcast_attention
+ self.upcast_softmax = upcast_softmax
+ self.rescale_output_factor = rescale_output_factor
+ self.residual_connection = residual_connection
+ self.dropout = dropout
+ self.fused_projections = False
+ self.out_dim = out_dim if out_dim is not None else query_dim
+ self.use_tpu_flash_attention = use_tpu_flash_attention
+ self.use_rope = use_rope
+
+ # we make use of this private variable to know whether this class is loaded
+ # with a deprecated state dict so that we can convert it on the fly
+ self._from_deprecated_attn_block = _from_deprecated_attn_block
+
+ self.scale_qk = scale_qk
+ self.scale = dim_head**-0.5 if self.scale_qk else 1.0
+
+ if qk_norm is None:
+ self.q_norm = nn.Identity()
+ self.k_norm = nn.Identity()
+ elif qk_norm == "rms_norm":
+ self.q_norm = RMSNorm(dim_head * heads, eps=1e-5)
+ self.k_norm = RMSNorm(dim_head * heads, eps=1e-5)
+ elif qk_norm == "layer_norm":
+ self.q_norm = nn.LayerNorm(dim_head * heads, eps=1e-5)
+ self.k_norm = nn.LayerNorm(dim_head * heads, eps=1e-5)
+ else:
+ raise ValueError(f"Unsupported qk_norm method: {qk_norm}")
+
+ self.heads = out_dim // dim_head if out_dim is not None else heads
+ # for slice_size > 0 the attention score computation
+ # is split across the batch axis to save memory
+ # You can set slice_size with `set_attention_slice`
+ self.sliceable_head_dim = heads
+
+ self.added_kv_proj_dim = added_kv_proj_dim
+ self.only_cross_attention = only_cross_attention
+
+ if self.added_kv_proj_dim is None and self.only_cross_attention:
+ raise ValueError(
+ "`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`."
+ )
+
+ if norm_num_groups is not None:
+ self.group_norm = nn.GroupNorm(
+ num_channels=query_dim, num_groups=norm_num_groups, eps=eps, affine=True
+ )
+ else:
+ self.group_norm = None
+
+ if spatial_norm_dim is not None:
+ self.spatial_norm = SpatialNorm(
+ f_channels=query_dim, zq_channels=spatial_norm_dim
+ )
+ else:
+ self.spatial_norm = None
+
+ if cross_attention_norm is None:
+ self.norm_cross = None
+ elif cross_attention_norm == "layer_norm":
+ self.norm_cross = nn.LayerNorm(self.cross_attention_dim)
+ elif cross_attention_norm == "group_norm":
+ if self.added_kv_proj_dim is not None:
+ # The given `encoder_hidden_states` are initially of shape
+ # (batch_size, seq_len, added_kv_proj_dim) before being projected
+ # to (batch_size, seq_len, cross_attention_dim). The norm is applied
+ # before the projection, so we need to use `added_kv_proj_dim` as
+ # the number of channels for the group norm.
+ norm_cross_num_channels = added_kv_proj_dim
+ else:
+ norm_cross_num_channels = self.cross_attention_dim
+
+ self.norm_cross = nn.GroupNorm(
+ num_channels=norm_cross_num_channels,
+ num_groups=cross_attention_norm_num_groups,
+ eps=1e-5,
+ affine=True,
+ )
+ else:
+ raise ValueError(
+ f"unknown cross_attention_norm: {cross_attention_norm}. Should be None, 'layer_norm' or 'group_norm'"
+ )
+
+ linear_cls = nn.Linear
+
+ self.linear_cls = linear_cls
+ self.to_q = linear_cls(query_dim, self.inner_dim, bias=bias)
+
+ if not self.only_cross_attention:
+ # only relevant for the `AddedKVProcessor` classes
+ self.to_k = linear_cls(self.cross_attention_dim, self.inner_dim, bias=bias)
+ self.to_v = linear_cls(self.cross_attention_dim, self.inner_dim, bias=bias)
+ else:
+ self.to_k = None
+ self.to_v = None
+
+ if self.added_kv_proj_dim is not None:
+ self.add_k_proj = linear_cls(added_kv_proj_dim, self.inner_dim)
+ self.add_v_proj = linear_cls(added_kv_proj_dim, self.inner_dim)
+
+ self.to_out = nn.ModuleList([])
+ self.to_out.append(linear_cls(self.inner_dim, self.out_dim, bias=out_bias))
+ self.to_out.append(nn.Dropout(dropout))
+
+ # set attention processor
+ # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
+ # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
+ # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
+ if processor is None:
+ processor = AttnProcessor2_0()
+ self.set_processor(processor)
+
+ def set_use_tpu_flash_attention(self):
+ r"""
+ Function sets the flag in this object. The flag will enforce the usage of TPU attention kernel.
+ """
+ self.use_tpu_flash_attention = True
+
+ def set_processor(self, processor: "AttnProcessor") -> None:
+ r"""
+ Set the attention processor to use.
+
+ Args:
+ processor (`AttnProcessor`):
+ The attention processor to use.
+ """
+ # if current processor is in `self._modules` and if passed `processor` is not, we need to
+ # pop `processor` from `self._modules`
+ if (
+ hasattr(self, "processor")
+ and isinstance(self.processor, torch.nn.Module)
+ and not isinstance(processor, torch.nn.Module)
+ ):
+ logger.info(
+ f"You are removing possibly trained weights of {self.processor} with {processor}"
+ )
+ self._modules.pop("processor")
+
+ self.processor = processor
+
+ def get_processor(
+ self, return_deprecated_lora: bool = False
+ ) -> "AttentionProcessor": # noqa: F821
+ r"""
+ Get the attention processor in use.
+
+ Args:
+ return_deprecated_lora (`bool`, *optional*, defaults to `False`):
+ Set to `True` to return the deprecated LoRA attention processor.
+
+ Returns:
+ "AttentionProcessor": The attention processor in use.
+ """
+ if not return_deprecated_lora:
+ return self.processor
+
+ # TODO(Sayak, Patrick). The rest of the function is needed to ensure backwards compatible
+ # serialization format for LoRA Attention Processors. It should be deleted once the integration
+ # with PEFT is completed.
+ is_lora_activated = {
+ name: module.lora_layer is not None
+ for name, module in self.named_modules()
+ if hasattr(module, "lora_layer")
+ }
+
+ # 1. if no layer has a LoRA activated we can return the processor as usual
+ if not any(is_lora_activated.values()):
+ return self.processor
+
+ # LoRA may not be applied to `add_k_proj` or `add_v_proj`, so exclude them from the check below
+ is_lora_activated.pop("add_k_proj", None)
+ is_lora_activated.pop("add_v_proj", None)
+ # 2. else it is not possible that only some layers have LoRA activated
+ if not all(is_lora_activated.values()):
+ raise ValueError(
+ f"Make sure that either all layers or no layers have LoRA activated, but have {is_lora_activated}"
+ )
+
+ # 3. And we need to merge the current LoRA layers into the corresponding LoRA attention processor
+ non_lora_processor_cls_name = self.processor.__class__.__name__
+ lora_processor_cls = getattr(
+ import_module(__name__), "LoRA" + non_lora_processor_cls_name
+ )
+
+ hidden_size = self.inner_dim
+
+ # now create a LoRA attention processor from the LoRA layers
+ if lora_processor_cls in [
+ LoRAAttnProcessor,
+ LoRAAttnProcessor2_0,
+ LoRAXFormersAttnProcessor,
+ ]:
+ kwargs = {
+ "cross_attention_dim": self.cross_attention_dim,
+ "rank": self.to_q.lora_layer.rank,
+ "network_alpha": self.to_q.lora_layer.network_alpha,
+ "q_rank": self.to_q.lora_layer.rank,
+ "q_hidden_size": self.to_q.lora_layer.out_features,
+ "k_rank": self.to_k.lora_layer.rank,
+ "k_hidden_size": self.to_k.lora_layer.out_features,
+ "v_rank": self.to_v.lora_layer.rank,
+ "v_hidden_size": self.to_v.lora_layer.out_features,
+ "out_rank": self.to_out[0].lora_layer.rank,
+ "out_hidden_size": self.to_out[0].lora_layer.out_features,
+ }
+
+ if hasattr(self.processor, "attention_op"):
+ kwargs["attention_op"] = self.processor.attention_op
+
+ lora_processor = lora_processor_cls(hidden_size, **kwargs)
+ lora_processor.to_q_lora.load_state_dict(self.to_q.lora_layer.state_dict())
+ lora_processor.to_k_lora.load_state_dict(self.to_k.lora_layer.state_dict())
+ lora_processor.to_v_lora.load_state_dict(self.to_v.lora_layer.state_dict())
+ lora_processor.to_out_lora.load_state_dict(
+ self.to_out[0].lora_layer.state_dict()
+ )
+ elif lora_processor_cls == LoRAAttnAddedKVProcessor:
+ lora_processor = lora_processor_cls(
+ hidden_size,
+ cross_attention_dim=self.add_k_proj.weight.shape[0],
+ rank=self.to_q.lora_layer.rank,
+ network_alpha=self.to_q.lora_layer.network_alpha,
+ )
+ lora_processor.to_q_lora.load_state_dict(self.to_q.lora_layer.state_dict())
+ lora_processor.to_k_lora.load_state_dict(self.to_k.lora_layer.state_dict())
+ lora_processor.to_v_lora.load_state_dict(self.to_v.lora_layer.state_dict())
+ lora_processor.to_out_lora.load_state_dict(
+ self.to_out[0].lora_layer.state_dict()
+ )
+
+ # only save if used
+ if self.add_k_proj.lora_layer is not None:
+ lora_processor.add_k_proj_lora.load_state_dict(
+ self.add_k_proj.lora_layer.state_dict()
+ )
+ lora_processor.add_v_proj_lora.load_state_dict(
+ self.add_v_proj.lora_layer.state_dict()
+ )
+ else:
+ lora_processor.add_k_proj_lora = None
+ lora_processor.add_v_proj_lora = None
+ else:
+ raise ValueError(f"{lora_processor_cls} does not exist.")
+
+ return lora_processor
+
+ def forward(
+ self,
+ hidden_states: torch.FloatTensor,
+ freqs_cis: Optional[Tuple[torch.FloatTensor, torch.FloatTensor]] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ skip_layer_mask: Optional[torch.Tensor] = None,
+ skip_layer_strategy: Optional[SkipLayerStrategy] = None,
+ **cross_attention_kwargs,
+ ) -> torch.Tensor:
+ r"""
+ The forward method of the `Attention` class.
+
+ Args:
+ hidden_states (`torch.Tensor`):
+ The hidden states of the query.
+ encoder_hidden_states (`torch.Tensor`, *optional*):
+ The hidden states of the encoder.
+ attention_mask (`torch.Tensor`, *optional*):
+ The attention mask to use. If `None`, no mask is applied.
+ skip_layer_mask (`torch.Tensor`, *optional*):
+ The skip layer mask to use. If `None`, no mask is applied.
+ skip_layer_strategy (`SkipLayerStrategy`, *optional*, defaults to `None`):
+ Controls which layers to skip for spatiotemporal guidance.
+ **cross_attention_kwargs:
+ Additional keyword arguments to pass along to the cross attention.
+
+ Returns:
+ `torch.Tensor`: The output of the attention layer.
+ """
+ # The `Attention` class can call different attention processors / attention functions
+ # here we simply pass along all tensors to the selected processor class
+ # For standard processors that are defined here, `**cross_attention_kwargs` is empty
+
+ attn_parameters = set(
+ inspect.signature(self.processor.__call__).parameters.keys()
+ )
+ unused_kwargs = [
+ k for k, _ in cross_attention_kwargs.items() if k not in attn_parameters
+ ]
+ if len(unused_kwargs) > 0:
+ logger.warning(
+ f"cross_attention_kwargs {unused_kwargs} are not expected by"
+ f" {self.processor.__class__.__name__} and will be ignored."
+ )
+ cross_attention_kwargs = {
+ k: w for k, w in cross_attention_kwargs.items() if k in attn_parameters
+ }
+
+ return self.processor(
+ self,
+ hidden_states,
+ freqs_cis=freqs_cis,
+ encoder_hidden_states=encoder_hidden_states,
+ attention_mask=attention_mask,
+ skip_layer_mask=skip_layer_mask,
+ skip_layer_strategy=skip_layer_strategy,
+ **cross_attention_kwargs,
+ )
+
+ def batch_to_head_dim(self, tensor: torch.Tensor) -> torch.Tensor:
+ r"""
+ Reshape the tensor from `[batch_size, seq_len, dim]` to `[batch_size // heads, seq_len, dim * heads]`. `heads`
+ is the number of heads initialized while constructing the `Attention` class.
+
+ Args:
+ tensor (`torch.Tensor`): The tensor to reshape.
+
+ Returns:
+ `torch.Tensor`: The reshaped tensor.
+ """
+ head_size = self.heads
+ batch_size, seq_len, dim = tensor.shape
+ tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
+ tensor = tensor.permute(0, 2, 1, 3).reshape(
+ batch_size // head_size, seq_len, dim * head_size
+ )
+ return tensor
+
+ def head_to_batch_dim(self, tensor: torch.Tensor, out_dim: int = 3) -> torch.Tensor:
+ r"""
+ Reshape the tensor from `[batch_size, seq_len, dim]` to `[batch_size, heads, seq_len, dim // heads]`. `heads` is
+ the number of heads initialized while constructing the `Attention` class.
+
+ Args:
+ tensor (`torch.Tensor`): The tensor to reshape.
+ out_dim (`int`, *optional*, defaults to `3`): The output dimension of the tensor. If `3`, the tensor is
+ reshaped to `[batch_size * heads, seq_len, dim // heads]`.
+
+ Returns:
+ `torch.Tensor`: The reshaped tensor.
+ """
+
+ head_size = self.heads
+ if tensor.ndim == 3:
+ batch_size, seq_len, dim = tensor.shape
+ extra_dim = 1
+ else:
+ batch_size, extra_dim, seq_len, dim = tensor.shape
+ tensor = tensor.reshape(
+ batch_size, seq_len * extra_dim, head_size, dim // head_size
+ )
+ tensor = tensor.permute(0, 2, 1, 3)
+
+ if out_dim == 3:
+ tensor = tensor.reshape(
+ batch_size * head_size, seq_len * extra_dim, dim // head_size
+ )
+
+ return tensor
+
+ def get_attention_scores(
+ self,
+ query: torch.Tensor,
+ key: torch.Tensor,
+ attention_mask: torch.Tensor = None,
+ ) -> torch.Tensor:
+ r"""
+ Compute the attention scores.
+
+ Args:
+ query (`torch.Tensor`): The query tensor.
+ key (`torch.Tensor`): The key tensor.
+ attention_mask (`torch.Tensor`, *optional*): The attention mask to use. If `None`, no mask is applied.
+
+ Returns:
+ `torch.Tensor`: The attention probabilities/scores.
+ """
+ dtype = query.dtype
+ if self.upcast_attention:
+ query = query.float()
+ key = key.float()
+
+ if attention_mask is None:
+ baddbmm_input = torch.empty(
+ query.shape[0],
+ query.shape[1],
+ key.shape[1],
+ dtype=query.dtype,
+ device=query.device,
+ )
+ beta = 0
+ else:
+ baddbmm_input = attention_mask
+ beta = 1
+
+ attention_scores = torch.baddbmm(
+ baddbmm_input,
+ query,
+ key.transpose(-1, -2),
+ beta=beta,
+ alpha=self.scale,
+ )
+ del baddbmm_input
+
+ if self.upcast_softmax:
+ attention_scores = attention_scores.float()
+
+ attention_probs = attention_scores.softmax(dim=-1)
+ del attention_scores
+
+ attention_probs = attention_probs.to(dtype)
+
+ return attention_probs
+
+    def prepare_attention_mask(
+        self,
+        attention_mask: torch.Tensor,
+        target_length: int,
+        batch_size: int,
+        out_dim: int = 3,
+    ) -> torch.Tensor:
+        r"""
+        Pad and replicate an attention mask so it can be used with multi-head attention.
+
+        Args:
+            attention_mask (`torch.Tensor`):
+                The mask to prepare. `None` is returned unchanged.
+            target_length (`int`):
+                Compared against the mask's last dimension. When they differ, `target_length` zeros are
+                appended to that dimension (so the result is `current_length + target_length` long).
+            batch_size (`int`):
+                Batch size; with `out_dim == 3` the mask is repeated per head only while its first
+                dimension is smaller than `batch_size * self.heads`.
+            out_dim (`int`, *optional*, defaults to `3`):
+                `3` repeats the mask along dim 0; `4` inserts a head dimension at dim 1 and repeats there.
+                Any other value leaves the (possibly padded) mask as is.
+
+        Returns:
+            `torch.Tensor`: The prepared attention mask.
+        """
+        num_heads = self.heads
+        if attention_mask is None:
+            return attention_mask
+
+        mask_length: int = attention_mask.shape[-1]
+        if mask_length != target_length:
+            if attention_mask.device.type != "mps":
+                # TODO: for pipelines such as stable-diffusion, padding cross-attn mask:
+                #       we want to instead pad by (0, remaining_length), where remaining_length is:
+                #       remaining_length: int = target_length - current_length
+                # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding
+                attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
+            else:
+                # HACK: MPS: Does not support padding by greater than dimension of input tensor.
+                # Instead, we can manually construct the padding tensor.
+                zeros = torch.zeros(
+                    (attention_mask.shape[0], attention_mask.shape[1], target_length),
+                    dtype=attention_mask.dtype,
+                    device=attention_mask.device,
+                )
+                attention_mask = torch.cat([attention_mask, zeros], dim=2)
+
+        if out_dim == 3:
+            needs_head_copies = attention_mask.shape[0] < batch_size * num_heads
+            if needs_head_copies:
+                attention_mask = attention_mask.repeat_interleave(num_heads, dim=0)
+        elif out_dim == 4:
+            attention_mask = attention_mask.unsqueeze(1).repeat_interleave(
+                num_heads, dim=1
+            )
+
+        return attention_mask
+
+    def norm_encoder_hidden_states(
+        self, encoder_hidden_states: torch.Tensor
+    ) -> torch.Tensor:
+        r"""
+        Apply `self.norm_cross` to the encoder hidden states. `self.norm_cross` must have been set when
+        the `Attention` module was constructed.
+
+        Args:
+            encoder_hidden_states (`torch.Tensor`): Hidden states of the encoder.
+
+        Returns:
+            `torch.Tensor`: The normalized encoder hidden states.
+        """
+        assert (
+            self.norm_cross is not None
+        ), "self.norm_cross must be defined to call self.norm_encoder_hidden_states"
+
+        if isinstance(self.norm_cross, nn.LayerNorm):
+            return self.norm_cross(encoder_hidden_states)
+
+        if isinstance(self.norm_cross, nn.GroupNorm):
+            # GroupNorm normalizes over the channel dimension of an (N, C, *) input, so the hidden
+            # dimension is moved to position 1 for the call and moved back afterwards:
+            # (batch_size, sequence_length, hidden_size) <-> (batch_size, hidden_size, sequence_length)
+            channels_first = encoder_hidden_states.transpose(1, 2)
+            return self.norm_cross(channels_first).transpose(1, 2)
+
+        # Only LayerNorm and GroupNorm are supported.
+        assert False
+
+        return encoder_hidden_states
+
+    @staticmethod
+    def apply_rotary_emb(
+        input_tensor: torch.Tensor,
+        freqs_cis: Tuple[torch.FloatTensor, torch.FloatTensor],
+    ) -> torch.Tensor:
+        r"""
+        Apply a rotary position embedding to `input_tensor`.
+
+        Adjacent pairs `(x0, x1)` along the last dimension are mapped to `(-x1, x0)` to form the rotated
+        tensor, and the result is `input_tensor * cos + rotated * sin`.
+
+        Args:
+            input_tensor (`torch.Tensor`): Tensor whose last dimension has even length.
+            freqs_cis (`Tuple[torch.FloatTensor, torch.FloatTensor]`):
+                `(cos, sin)` tensors, each broadcastable against `input_tensor`.
+
+        Returns:
+            `torch.Tensor`: A single tensor with the same shape as the broadcast of the inputs.
+        """
+        cos_freqs, sin_freqs = freqs_cis[0], freqs_cis[1]
+
+        # Split the last dimension into (pairs, 2), rotate each pair, then merge it back.
+        first, second = input_tensor.unflatten(-1, (-1, 2)).unbind(dim=-1)
+        input_tensor_rot = torch.stack((-second, first), dim=-1).flatten(-2)
+
+        return input_tensor * cos_freqs + input_tensor_rot * sin_freqs
+
+
+class AttnProcessor2_0:
+ r"""
+ Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
+
+ Depending on `attn.use_tpu_flash_attention`, the attention itself is computed either by `flash_attention`
+ or by `pay_attention`; a skip-layer mask can replace or blend the attention output.
+ """
+
+ def __init__(self):
+ pass
+
+ def __call__(
+ self,
+ attn: Attention,
+ hidden_states_wrapper: torch.FloatTensor,
+ freqs_cis: Tuple[torch.FloatTensor, torch.FloatTensor],
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ temb: Optional[torch.FloatTensor] = None,
+ skip_layer_mask: Optional[torch.FloatTensor] = None,
+ skip_layer_strategy: Optional[SkipLayerStrategy] = None,
+ *args,
+ **kwargs,
+ ) -> torch.FloatTensor:
+ r"""
+ Run one attention layer.
+
+ Args:
+ attn: The `Attention` module providing projections, norms and flags.
+ hidden_states_wrapper: Indexable container whose element 0 is the hidden states. It is emptied
+ with `.clear()` right after being read, so it is a mutable sequence rather than a tensor.
+ freqs_cis: Passed to `attn.apply_rotary_emb` for query and key when `attn.use_rope` is set.
+ encoder_hidden_states: Source for key/value. If `None`, key/value come from the hidden states.
+ attention_mask: Optional mask; prepared with `attn.prepare_attention_mask` unless the TPU path is used.
+ temb: Forwarded to `attn.spatial_norm` when that module exists.
+ skip_layer_mask: Optional mask, reshaped to `(batch_size, 1, 1)`.
+ skip_layer_strategy: Selects how `skip_layer_mask` is applied (`AttentionSkip`, `AttentionValues`
+ or `Residual` are handled here).
+
+ Returns:
+ The attention output after the output projection, dropout, optional residual and rescaling.
+ """
+ if len(args) > 0 or kwargs.get("scale", None) is not None:
+ deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+ deprecate("scale", "1.0.0", deprecation_message)
+ # Unpack the tensor and empty the wrapper; presumably this lets the input be freed early (see the
+ # `del hidden_states` below) -- confirm with the caller.
+ hidden_states = hidden_states_wrapper[0]
+ hidden_states_wrapper.clear()
+ residual = hidden_states
+ if attn.spatial_norm is not None:
+ hidden_states = attn.spatial_norm(hidden_states, temb)
+
+ input_ndim = hidden_states.ndim
+
+ if input_ndim == 4:
+ batch_size, channel, height, width = hidden_states.shape
+ hidden_states = hidden_states.view(
+ batch_size, channel, height * width
+ ).transpose(1, 2)
+
+ batch_size, sequence_length, _ = (
+ hidden_states.shape
+ if encoder_hidden_states is None
+ else encoder_hidden_states.shape
+ )
+
+ if skip_layer_mask is not None:
+ skip_layer_mask = skip_layer_mask.reshape(batch_size, 1, 1)
+
+ if (attention_mask is not None) and (not attn.use_tpu_flash_attention):
+ attention_mask = attn.prepare_attention_mask(
+ attention_mask, sequence_length, batch_size
+ )
+ # scaled_dot_product_attention expects attention_mask shape to be
+ # (batch, heads, source_length, target_length)
+ attention_mask = attention_mask.view(
+ batch_size, attn.heads, -1, attention_mask.shape[-1]
+ )
+
+ if attn.group_norm is not None:
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
+ 1, 2
+ )
+
+ query = attn.to_q(hidden_states)
+ query = attn.q_norm(query)
+ if encoder_hidden_states is not None:
+ if attn.norm_cross:
+ encoder_hidden_states = attn.norm_encoder_hidden_states(
+ encoder_hidden_states
+ )
+ key = attn.to_k(encoder_hidden_states)
+ key = attn.k_norm(key)
+ else: # if no context provided do self-attention
+ encoder_hidden_states = hidden_states
+ key = attn.to_k(hidden_states)
+ key = attn.k_norm(key)
+ if attn.use_rope:
+ key = attn.apply_rotary_emb(key, freqs_cis)
+ query = attn.apply_rotary_emb(query, freqs_cis)
+ # The input hidden states are only needed later by the AttentionSkip blend.
+ if not (skip_layer_mask is not None and skip_layer_strategy == SkipLayerStrategy.AttentionSkip):
+ del hidden_states
+ skip_attention = False
+ value = attn.to_v(encoder_hidden_states)
+ if skip_layer_mask is not None and skip_layer_strategy == SkipLayerStrategy.AttentionValues:
+ # A single-sample batch whose mask is 0 does not need the attention result at all.
+ skip_attention = skip_layer_mask.shape[0] == 1 and skip_layer_mask[0].item() == 0
+ value_for_stg = value
+
+ del encoder_hidden_states
+
+
+ inner_dim = key.shape[-1]
+ head_dim = inner_dim // attn.heads
+
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+ dtype = query.dtype
+
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
+ if skip_attention:
+ hidden_states = value_for_stg
+ hidden_states_a = None
+ value_for_stg = None
+ elif attn.use_tpu_flash_attention: # use tpu attention offload 'flash attention'
+ q_segment_indexes = None
+ if (
+ attention_mask is not None
+ ): # if mask is required need to tune both segmenIds fields
+ # attention_mask = torch.squeeze(attention_mask).to(torch.float32)
+ attention_mask = attention_mask.to(torch.float32)
+ q_segment_indexes = torch.ones(
+ batch_size, query.shape[2], device=query.device, dtype=torch.float32
+ )
+ assert (
+ attention_mask.shape[1] == key.shape[2]
+ ), f"ERROR: KEY SHAPE must be same as attention mask [{key.shape[2]}, {attention_mask.shape[1]}]"
+
+ assert (
+ query.shape[2] % 128 == 0
+ ), f"ERROR: QUERY SHAPE must be divisible by 128 (TPU limitation) [{query.shape[2]}]"
+ assert (
+ key.shape[2] % 128 == 0
+ ), f"ERROR: KEY SHAPE must be divisible by 128 (TPU limitation) [{key.shape[2]}]"
+
+ # run the TPU kernel implemented in jax with pallas
+ hidden_states_a = flash_attention(
+ q=query,
+ k=key,
+ v=value,
+ q_segment_ids=q_segment_indexes,
+ kv_segment_ids=attention_mask,
+ sm_scale=attn.scale,
+ )
+ del query, key, value
+ else:
+ # Undo the head transpose: pay_attention receives (batch, seq_len, num_heads, head_dim) tensors.
+ query = query.transpose(1,2)
+ key = key.transpose(1,2)
+ value = value.transpose(1,2)
+ if attention_mask is not None:
+ attention_mask = attention_mask.transpose(1,2)
+ # After the `del`, the list holds the only local references to q/k/v.
+ qkv_list = [query, key, value]
+ del query, key, value
+ hidden_states_a = pay_attention(qkv_list, attention_mask =attention_mask)
+ hidden_states_a = hidden_states_a.transpose(1,2)
+ if hidden_states_a is not None:
+ hidden_states_a = hidden_states_a.transpose(1, 2).reshape(
+ batch_size, -1, attn.heads * head_dim
+ )
+ hidden_states_a = hidden_states_a.to(dtype)
+
+ if (
+ skip_layer_mask is not None
+ and skip_layer_strategy == SkipLayerStrategy.AttentionSkip
+ ):
+ hidden_states = hidden_states_a * skip_layer_mask + hidden_states * (
+ 1.0 - skip_layer_mask
+ )
+ elif (
+ skip_layer_mask is not None
+ and skip_layer_strategy == SkipLayerStrategy.AttentionValues
+ ):
+ # In-place blend of the attention output with the value projection.
+ hidden_states_a *= skip_layer_mask
+ value_for_stg *= 1.0 - skip_layer_mask
+ hidden_states_a += value_for_stg
+ hidden_states = hidden_states_a
+ del value_for_stg
+ else:
+ hidden_states = hidden_states_a
+ hidden_states_a = None
+ # linear proj
+ hidden_states = attn.to_out[0](hidden_states)
+ # dropout
+ hidden_states = attn.to_out[1](hidden_states)
+
+ if input_ndim == 4:
+ hidden_states = hidden_states.transpose(-1, -2).reshape(
+ batch_size, channel, height, width
+ )
+ if (
+ skip_layer_mask is not None
+ and skip_layer_strategy == SkipLayerStrategy.Residual
+ ):
+ skip_layer_mask = skip_layer_mask.reshape(batch_size, 1, 1, 1)
+
+ if attn.residual_connection:
+ if (
+ skip_layer_mask is not None
+ and skip_layer_strategy == SkipLayerStrategy.Residual
+ ):
+ hidden_states = hidden_states + residual * skip_layer_mask
+ else:
+ hidden_states = hidden_states + residual
+
+ if attn.rescale_output_factor != 1.0:
+ hidden_states /= attn.rescale_output_factor
+
+ return hidden_states
+
+
+class AttnProcessor:
+    r"""
+    Baseline attention processor: explicit `softmax(QK^T)V` via `attn.get_attention_scores` and `torch.bmm`.
+    """
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        temb: Optional[torch.FloatTensor] = None,
+        *args,
+        **kwargs,
+    ) -> torch.Tensor:
+        r"""
+        Run one attention layer on `hidden_states`.
+
+        When `encoder_hidden_states` is `None` the layer attends to `hidden_states` itself; otherwise
+        keys and values are projected from `encoder_hidden_states` (normalized first if `attn.norm_cross`
+        is set). 4-D inputs are flattened to a sequence and restored to their original shape at the end.
+        """
+        if len(args) > 0 or kwargs.get("scale", None) is not None:
+            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+            deprecate("scale", "1.0.0", deprecation_message)
+
+        skip_input = hidden_states
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        is_4d = hidden_states.ndim == 4
+        if is_4d:
+            batch_size, channel, height, width = hidden_states.shape
+            flat = hidden_states.view(batch_size, channel, height * width)
+            hidden_states = flat.transpose(1, 2)
+
+        if encoder_hidden_states is None:
+            context_shape = hidden_states.shape
+        else:
+            context_shape = encoder_hidden_states.shape
+        batch_size, sequence_length, _ = context_shape
+
+        attention_mask = attn.prepare_attention_mask(
+            attention_mask, sequence_length, batch_size
+        )
+
+        if attn.group_norm is not None:
+            normed = attn.group_norm(hidden_states.transpose(1, 2))
+            hidden_states = normed.transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+
+        # Pick the tensor keys and values are projected from.
+        if encoder_hidden_states is None:
+            context = hidden_states
+        elif attn.norm_cross:
+            context = attn.norm_encoder_hidden_states(encoder_hidden_states)
+        else:
+            context = encoder_hidden_states
+
+        key = attn.to_k(context)
+        value = attn.to_v(context)
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        query = attn.q_norm(query)
+        key = attn.k_norm(key)
+
+        probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = attn.batch_to_head_dim(torch.bmm(probs, value))
+
+        # to_out[0] is the linear projection, to_out[1] the dropout.
+        projection, dropout = attn.to_out[0], attn.to_out[1]
+        hidden_states = dropout(projection(hidden_states))
+
+        if is_4d:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(
+                batch_size, channel, height, width
+            )
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + skip_input
+
+        return hidden_states / attn.rescale_output_factor
+
+
+class FeedForward(nn.Module):
+    r"""
+    Feed-forward block: activation projection, dropout, output projection and an optional final dropout.
+
+    Parameters:
+        dim (`int`): The number of channels in the input.
+        dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
+        mult (`int`, *optional*, defaults to 4): Multiplier for the hidden dimension when `inner_dim` is not given.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`):
+            One of `"gelu"`, `"gelu-approximate"`, `"geglu"` or `"geglu-approximate"`.
+        final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
+        inner_dim (`int`, *optional*): Hidden dimension. If not given, `int(dim * mult)` is used.
+        bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        dim_out: Optional[int] = None,
+        mult: int = 4,
+        dropout: float = 0.0,
+        activation_fn: str = "geglu",
+        final_dropout: bool = False,
+        inner_dim=None,
+        bias: bool = True,
+    ):
+        super().__init__()
+        hidden_dim = int(dim * mult) if inner_dim is None else inner_dim
+        out_features = dim if dim_out is None else dim_out
+
+        # Each entry builds the "project in" module for one supported activation name.
+        activation_builders = {
+            "gelu": lambda: GELU(dim, hidden_dim, bias=bias),
+            "gelu-approximate": lambda: GELU(
+                dim, hidden_dim, approximate="tanh", bias=bias
+            ),
+            "geglu": lambda: GEGLU(dim, hidden_dim, bias=bias),
+            "geglu-approximate": lambda: ApproximateGELU(dim, hidden_dim, bias=bias),
+        }
+        if activation_fn not in activation_builders:
+            raise ValueError(f"Unsupported activation function: {activation_fn}")
+
+        # Order: project in, dropout, project out.
+        layers = [activation_builders[activation_fn]()]
+        layers.append(nn.Dropout(dropout))
+        layers.append(nn.Linear(hidden_dim, out_features, bias=bias))
+        # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
+        if final_dropout:
+            layers.append(nn.Dropout(dropout))
+
+        self.net = nn.ModuleList(layers)
+
+    def forward(self, hidden_states: torch.Tensor, scale: float = 1.0) -> torch.Tensor:
+        """Apply the layers in order; `scale` is passed only to GEGLU / LoRACompatibleLinear layers."""
+        scale_aware = (GEGLU, LoRACompatibleLinear)
+        for layer in self.net:
+            takes_scale = isinstance(layer, scale_aware)
+            hidden_states = (
+                layer(hidden_states, scale) if takes_scale else layer(hidden_states)
+            )
+        return hidden_states
diff --git a/ltx_video/models/transformers/embeddings.py b/ltx_video/models/transformers/embeddings.py
new file mode 100644
index 0000000000000000000000000000000000000000..a30d6be16b4f3fe709cf24465e06eb798889ba66
--- /dev/null
+++ b/ltx_video/models/transformers/embeddings.py
@@ -0,0 +1,129 @@
+# Adapted from: https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/embeddings.py
+import math
+
+import numpy as np
+import torch
+from einops import rearrange
+from torch import nn
+
+
+def get_timestep_embedding(
+    timesteps: torch.Tensor,
+    embedding_dim: int,
+    flip_sin_to_cos: bool = False,
+    downscale_freq_shift: float = 1,
+    scale: float = 1,
+    max_period: int = 10000,
+):
+    """
+    Create sinusoidal timestep embeddings (as in Denoising Diffusion Probabilistic Models).
+
+    :param timesteps: a 1-D Tensor of N indices, one per batch element. These may be fractional.
+    :param embedding_dim: the dimension of the output.
+    :param flip_sin_to_cos: if True the cosine half comes first, otherwise the sine half does.
+    :param downscale_freq_shift: subtracted from `embedding_dim // 2` in the frequency denominator.
+    :param scale: multiplier applied to the angles before taking sine and cosine.
+    :param max_period: controls the minimum frequency of the embeddings.
+    :return: an [N x embedding_dim] Tensor of positional embeddings (zero-padded by one column when
+        `embedding_dim` is odd).
+    """
+    assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
+
+    num_freqs = embedding_dim // 2
+    steps = torch.arange(
+        start=0, end=num_freqs, dtype=torch.float32, device=timesteps.device
+    )
+    log_freqs = (-math.log(max_period) * steps) / (num_freqs - downscale_freq_shift)
+
+    angles = timesteps[:, None].float() * torch.exp(log_freqs)[None, :]
+    angles = scale * angles
+
+    sin_part = torch.sin(angles)
+    cos_part = torch.cos(angles)
+    ordered = [cos_part, sin_part] if flip_sin_to_cos else [sin_part, cos_part]
+    emb = torch.cat(ordered, dim=-1)
+
+    # An odd embedding_dim leaves one column unfilled; pad it with zeros.
+    if embedding_dim % 2 == 1:
+        emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
+    return emb
+
+
+def get_3d_sincos_pos_embed(embed_dim, grid, w, h, f):
+    """
+    Build sin/cos positional embeddings for a flattened 3-channel coordinate grid.
+
+    grid: array of shape (c, f*h*w) whose second axis is laid out in (f h w) order; it is reshaped to
+    (3, 1, w, h, f) before encoding, so c is expected to be 3.
+    return: pos_embed rearranged to [(f h w), c] -- presumably c == embed_dim; confirm against callers.
+    """
+    # Equivalent to unpacking "(f h w)" and then moving f last.
+    channels_last_f = rearrange(grid, "c (f h w) -> c h w f", h=h, w=w)
+    coords = channels_last_f.reshape([3, 1, w, h, f])
+    embedding = get_3d_sincos_pos_embed_from_grid(embed_dim, coords)
+    embedding = embedding.transpose(1, 0, 2, 3)
+    return rearrange(embedding, "h w f c -> (f h w) c")
+
+
+def get_3d_sincos_pos_embed_from_grid(embed_dim, grid):
+    """
+    Encode the three coordinate planes of `grid` and concatenate them along the last axis.
+
+    Each of grid[0], grid[1], grid[2] gets `embed_dim // 3` channels; the output order is
+    (grid[1], grid[2], grid[0]).
+
+    Raises:
+        ValueError: if `embed_dim` is not divisible by 3.
+    """
+    if embed_dim % 3 != 0:
+        raise ValueError("embed_dim must be divisible by 3")
+
+    # one third of the dimensions per axis
+    per_axis_dim = embed_dim // 3
+    emb_f, emb_h, emb_w = (
+        get_1d_sincos_pos_embed_from_grid(per_axis_dim, grid[axis])
+        for axis in range(3)
+    )
+
+    return np.concatenate([emb_h, emb_w, emb_f], axis=-1)
+
+
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """
+    Sin/cos encode the positions in `pos` with `embed_dim // 2` frequencies.
+
+    embed_dim: output dimension for each position (must be even).
+    pos: array of positions; only the slice at index 0 of its leading axis is encoded.
+    out: array of shape (*pos.shape[1:], embed_dim), sine half first.
+    """
+    if embed_dim % 2 != 0:
+        raise ValueError("embed_dim must be divisible by 2")
+
+    half = embed_dim // 2
+    exponents = np.arange(half, dtype=np.float64) / (embed_dim / 2.0)
+    freqs = 1.0 / 10000**exponents  # (D/2,)
+
+    original_shape = pos.shape
+    flat_pos = pos.reshape(-1)
+    # outer product: (M,) x (D/2,) -> (M, D/2)
+    angles = flat_pos[:, None] * freqs[None, :]
+    angles = angles.reshape([*original_shape, -1])[0]
+
+    return np.concatenate([np.sin(angles), np.cos(angles)], axis=-1)
+
+
+class SinusoidalPositionalEmbedding(nn.Module):
+    """Apply positional information to a sequence of embeddings.
+
+    Takes in a sequence of embeddings with shape (batch_size, seq_length, embed_dim) and adds positional embeddings to
+    them
+
+    Args:
+        embed_dim: (int): Dimension of the positional embedding. Odd values are supported; the last
+            (even-indexed) channel then has a sine entry without a cosine partner.
+        max_seq_length: Maximum sequence length to apply positional embeddings
+
+    """
+
+    def __init__(self, embed_dim: int, max_seq_length: int = 32):
+        super().__init__()
+        position = torch.arange(max_seq_length).unsqueeze(1)
+        div_term = torch.exp(
+            torch.arange(0, embed_dim, 2) * (-math.log(10000.0) / embed_dim)
+        )
+        angles = position * div_term
+        pe = torch.zeros(1, max_seq_length, embed_dim)
+        pe[0, :, 0::2] = torch.sin(angles)
+        # There are only embed_dim // 2 odd-indexed channels, one fewer than the number of
+        # frequencies when embed_dim is odd, so trim the cosine table to fit.
+        pe[0, :, 1::2] = torch.cos(angles)[:, : embed_dim // 2]
+        # Registered as a buffer: it follows the module across devices but is not a parameter.
+        self.register_buffer("pe", pe)
+
+    def forward(self, x):
+        """Return `x` plus the first `seq_length` rows of the positional table.
+
+        `x` must be 3-D; the addition fails if its sequence length exceeds `max_seq_length`.
+        """
+        _, seq_length, _ = x.shape
+        x = x + self.pe[:, :seq_length]
+        return x
diff --git a/ltx_video/models/transformers/symmetric_patchifier.py b/ltx_video/models/transformers/symmetric_patchifier.py
new file mode 100644
index 0000000000000000000000000000000000000000..2eca32033eef03c0dbffd7a25cca993bbda57ded
--- /dev/null
+++ b/ltx_video/models/transformers/symmetric_patchifier.py
@@ -0,0 +1,84 @@
+from abc import ABC, abstractmethod
+from typing import Tuple
+
+import torch
+from diffusers.configuration_utils import ConfigMixin
+from einops import rearrange
+from torch import Tensor
+
+
+class Patchifier(ConfigMixin, ABC):
+    """Abstract base for converting latents to patch tokens and back.
+
+    The patch size is stored as `(1, patch_size, patch_size)`, i.e. no patching along the frame axis.
+    """
+
+    def __init__(self, patch_size: int):
+        super().__init__()
+        self._patch_size = (1, patch_size, patch_size)
+
+    @abstractmethod
+    def patchify(self, latents: Tensor) -> Tuple[Tensor, Tensor]:
+        """Convert latents into patch tokens; returns `(tokens, latent_coords)`."""
+        raise NotImplementedError("Patchify method not implemented")
+
+    @abstractmethod
+    def unpatchify(
+        self,
+        latents: Tensor,
+        output_height: int,
+        output_width: int,
+        out_channels: int,
+    ) -> Tensor:
+        """Convert patch tokens back into a latent tensor."""
+        pass
+
+    @property
+    def patch_size(self):
+        """The `(frames, height, width)` patch size tuple."""
+        return self._patch_size
+
+    def get_latent_coords(
+        self, latent_num_frames, latent_height, latent_width, batch_size, device
+    ):
+        """
+        Return a tensor of shape [batch_size, 3, num_patches] containing the
+        top-left corner latent coordinates of each latent patch.
+        The tensor is repeated for each batch element.
+        """
+        # "ij" is the historical default of torch.meshgrid; passing it explicitly keeps the
+        # (frame, height, width) ordering and avoids the deprecation warning.
+        latent_sample_coords = torch.meshgrid(
+            torch.arange(0, latent_num_frames, self._patch_size[0], device=device),
+            torch.arange(0, latent_height, self._patch_size[1], device=device),
+            torch.arange(0, latent_width, self._patch_size[2], device=device),
+            indexing="ij",
+        )
+        latent_sample_coords = torch.stack(latent_sample_coords, dim=0)
+        latent_coords = latent_sample_coords.unsqueeze(0).repeat(batch_size, 1, 1, 1, 1)
+        latent_coords = rearrange(
+            latent_coords, "b c f h w -> b c (f h w)", b=batch_size
+        )
+        return latent_coords
+
+
+class SymmetricPatchifier(Patchifier):
+    """Patchifier that folds each spatial patch into the channel axis of a token (and back)."""
+
+    def patchify(self, latents: Tensor) -> Tuple[Tensor, Tensor]:
+        """Flatten 5-D latents `(b, c, f, h, w)` into tokens.
+
+        Returns:
+            `(tokens, latent_coords)` where tokens have shape `(b, num_patches, c * p1 * p2 * p3)` and
+            `latent_coords` comes from `get_latent_coords` called with the input's f, h, w sizes.
+        """
+        b, _, f, h, w = latents.shape
+        latent_coords = self.get_latent_coords(f, h, w, b, latents.device)
+        latents = rearrange(
+            latents,
+            "b c (f p1) (h p2) (w p3) -> b (f h w) (c p1 p2 p3)",
+            p1=self._patch_size[0],
+            p2=self._patch_size[1],
+            p3=self._patch_size[2],
+        )
+        return latents, latent_coords
+
+    def unpatchify(
+        self,
+        latents: Tensor,
+        output_height: int,
+        output_width: int,
+        out_channels: int,
+    ) -> Tensor:
+        """Rebuild a `(b, c, f, h, w)` latent tensor from tokens.
+
+        `output_height` / `output_width` are the un-patched sizes; they are divided by the patch size
+        to obtain the token grid. `out_channels` is accepted for interface compatibility but is not
+        used here. Returns a single tensor.
+        """
+        output_height = output_height // self._patch_size[1]
+        output_width = output_width // self._patch_size[2]
+        latents = rearrange(
+            latents,
+            "b (f h w) (c p q) -> b c f (h p) (w q)",
+            h=output_height,
+            w=output_width,
+            p=self._patch_size[1],
+            q=self._patch_size[2],
+        )
+        return latents
diff --git a/ltx_video/models/transformers/transformer3d.py b/ltx_video/models/transformers/transformer3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..e182f21d00bd9773af61016291685eea5c2a1d14
--- /dev/null
+++ b/ltx_video/models/transformers/transformer3d.py
@@ -0,0 +1,507 @@
+# Adapted from: https://github.com/huggingface/diffusers/blob/v0.26.3/src/diffusers/models/transformers/transformer_2d.py
+import math
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Union
+import os
+import json
+import glob
+from pathlib import Path
+
+import torch
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.embeddings import PixArtAlphaTextProjection
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.models.normalization import AdaLayerNormSingle
+from diffusers.utils import BaseOutput, is_torch_version
+from diffusers.utils import logging
+from torch import nn
+from safetensors import safe_open
+from ltx_video.models.transformers.attention import BasicTransformerBlock, reshape_hidden_states, restore_hidden_states_shape
+from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
+
+from ltx_video.utils.diffusers_config_mapping import (
+ diffusers_and_ours_config_mapping,
+ make_hashable_key,
+ TRANSFORMER_KEYS_RENAME_DICT,
+)
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+class Transformer3DModelOutput(BaseOutput):
+    """
+    Output container with a single `sample` field; presumably what `Transformer3DModel.forward` returns
+    when `return_dict` is set -- confirm against the forward implementation.
+
+    Args:
+        sample (`torch.FloatTensor`):
+            The model output tensor. NOTE(review): the previous docstring was copied from diffusers'
+            `Transformer2DModel` and described a `(batch_size, num_channels, height, width)` layout;
+            the actual shape for this 3D model is not established here.
+    """
+
+    sample: torch.FloatTensor
+
+
+class Transformer3DModel(ModelMixin, ConfigMixin):
+ _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        num_attention_heads: int = 16,
+        attention_head_dim: int = 88,
+        in_channels: Optional[int] = None,
+        out_channels: Optional[int] = None,
+        num_layers: int = 1,
+        dropout: float = 0.0,
+        norm_num_groups: int = 32,
+        cross_attention_dim: Optional[int] = None,
+        attention_bias: bool = False,
+        num_vector_embeds: Optional[int] = None,
+        activation_fn: str = "geglu",
+        num_embeds_ada_norm: Optional[int] = None,
+        use_linear_projection: bool = False,
+        only_cross_attention: bool = False,
+        double_self_attention: bool = False,
+        upcast_attention: bool = False,
+        adaptive_norm: str = "single_scale_shift",  # 'single_scale_shift' or 'single_scale'
+        standardization_norm: str = "layer_norm",  # 'layer_norm' or 'rms_norm'
+        norm_elementwise_affine: bool = True,
+        norm_eps: float = 1e-5,
+        attention_type: str = "default",
+        caption_channels: int = None,
+        use_tpu_flash_attention: bool = False,  # if True uses the TPU attention offload ('flash attention')
+        qk_norm: Optional[str] = None,
+        positional_embedding_type: str = "rope",
+        positional_embedding_theta: Optional[float] = None,
+        positional_embedding_max_pos: Optional[List[int]] = None,
+        timestep_scale_multiplier: Optional[float] = None,
+        causal_temporal_positioning: bool = False,  # For backward compatibility, will be deprecated
+    ):
+        """
+        Build the input projection, the stack of `num_layers` transformer blocks and the output layers.
+
+        The hidden width is `num_attention_heads * attention_head_dim`. `in_channels` is passed straight
+        to `nn.Linear`, so although it defaults to `None` it has to be set. `out_channels` falls back to
+        `in_channels`. `norm_num_groups`, `num_vector_embeds` and `causal_temporal_positioning` are not
+        referenced in this constructor body.
+
+        Raises:
+            ValueError: if `positional_embedding_type` is `"absolute"`, or if it is `"rope"` and either
+                `positional_embedding_theta` or `positional_embedding_max_pos` is `None`.
+        """
+        super().__init__()
+        self.use_tpu_flash_attention = (
+            use_tpu_flash_attention  # FIXME: push config down to the attention modules
+        )
+        self.use_linear_projection = use_linear_projection
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        inner_dim = num_attention_heads * attention_head_dim
+        self.inner_dim = inner_dim
+        # Projects patch tokens (in_channels features) to the transformer width.
+        self.patchify_proj = nn.Linear(in_channels, inner_dim, bias=True)
+        self.positional_embedding_type = positional_embedding_type
+        self.positional_embedding_theta = positional_embedding_theta
+        self.positional_embedding_max_pos = positional_embedding_max_pos
+        self.use_rope = self.positional_embedding_type == "rope"
+        self.timestep_scale_multiplier = timestep_scale_multiplier
+
+        if self.positional_embedding_type == "absolute":
+            raise ValueError("Absolute positional embedding is no longer supported")
+        elif self.positional_embedding_type == "rope":
+            if positional_embedding_theta is None:
+                raise ValueError(
+                    "If `positional_embedding_type` type is rope, `positional_embedding_theta` must also be defined"
+                )
+            if positional_embedding_max_pos is None:
+                raise ValueError(
+                    "If `positional_embedding_type` type is rope, `positional_embedding_max_pos` must also be defined"
+                )
+
+        # 3. Define transformers blocks
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    inner_dim,
+                    num_attention_heads,
+                    attention_head_dim,
+                    dropout=dropout,
+                    cross_attention_dim=cross_attention_dim,
+                    activation_fn=activation_fn,
+                    num_embeds_ada_norm=num_embeds_ada_norm,
+                    attention_bias=attention_bias,
+                    only_cross_attention=only_cross_attention,
+                    double_self_attention=double_self_attention,
+                    upcast_attention=upcast_attention,
+                    adaptive_norm=adaptive_norm,
+                    standardization_norm=standardization_norm,
+                    norm_elementwise_affine=norm_elementwise_affine,
+                    norm_eps=norm_eps,
+                    attention_type=attention_type,
+                    use_tpu_flash_attention=use_tpu_flash_attention,
+                    qk_norm=qk_norm,
+                    use_rope=self.use_rope,
+                )
+                for d in range(num_layers)
+            ]
+        )
+
+        # 4. Define output layers
+        self.out_channels = in_channels if out_channels is None else out_channels
+        self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
+        # Two learned rows of width inner_dim, initialised with std 1/sqrt(inner_dim).
+        self.scale_shift_table = nn.Parameter(
+            torch.randn(2, inner_dim) / inner_dim**0.5
+        )
+        self.proj_out = nn.Linear(inner_dim, self.out_channels)
+
+        self.adaln_single = AdaLayerNormSingle(
+            inner_dim, use_additional_conditions=False
+        )
+        # "single_scale" swaps the AdaLN projection for one with 4 * inner_dim outputs.
+        if adaptive_norm == "single_scale":
+            self.adaln_single.linear = nn.Linear(inner_dim, 4 * inner_dim, bias=True)
+
+        # The caption projection exists only when caption_channels is provided.
+        self.caption_projection = None
+        if caption_channels is not None:
+            self.caption_projection = PixArtAlphaTextProjection(
+                in_features=caption_channels, hidden_size=inner_dim
+            )
+
+        self.gradient_checkpointing = False
+
+    def set_use_tpu_flash_attention(self):
+        r"""
+        Turn on the TPU flash-attention flag for this model and for every transformer block it owns.
+        """
+        logger.info("ENABLE TPU FLASH ATTENTION -> TRUE")
+        self.use_tpu_flash_attention = True
+        # Each block forwards the setting to its own attention modules.
+        for transformer_block in self.transformer_blocks:
+            transformer_block.set_use_tpu_flash_attention()
+
+    def create_skip_layer_mask(
+        self,
+        batch_size: int,
+        num_conds: int,
+        ptb_index: int,
+        skip_block_list: Optional[List[int]] = None,
+    ):
+        """
+        Build a `(num_layers, batch_size * num_conds)` mask of ones with zeros marking skipped blocks.
+
+        For every block index in `skip_block_list`, the columns `ptb_index, ptb_index + num_conds, ...`
+        of that block's row are set to 0. Returns `None` when `skip_block_list` is `None` or empty.
+        """
+        nothing_to_skip = skip_block_list is None or len(skip_block_list) == 0
+        if nothing_to_skip:
+            return None
+
+        mask_shape = (len(self.transformer_blocks), batch_size * num_conds)
+        mask = torch.ones(mask_shape, device=self.device, dtype=self.dtype)
+        for layer_index in skip_block_list:
+            mask[layer_index, ptb_index::num_conds] = 0
+        return mask
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        """Set `module.gradient_checkpointing` to `value` if the module has that attribute."""
+        if not hasattr(module, "gradient_checkpointing"):
+            return
+        module.gradient_checkpointing = value
+
+    def get_fractional_positions(self, indices_grid):
+        """
+        Divide each of the three coordinate channels (dim 1 of `indices_grid`) by the matching entry of
+        `self.positional_embedding_max_pos` and stack the results on a new last dimension.
+        """
+        max_positions = self.positional_embedding_max_pos
+        scaled_axes = []
+        for axis in range(3):
+            scaled_axes.append(indices_grid[:, axis] / max_positions[axis])
+        return torch.stack(scaled_axes, dim=-1)
+
+    def precompute_freqs_cis(self, indices_grid, spacing="exp"):
+        """
+        Compute the rotary-embedding cosine and sine tables for the given position grid.
+
+        Args:
+            indices_grid: Position indices; channel `i` (dim 1) is divided by
+                `self.positional_embedding_max_pos[i]` via `get_fractional_positions`.
+            spacing: How the `inner_dim // 6` frequencies are spread between 1 and
+                `self.positional_embedding_theta`: `"exp"`, `"exp_2"`, `"linear"` or `"sqrt"`.
+
+        Returns:
+            `(cos_freq, sin_freq)`, both cast to `self.dtype`. When `inner_dim` is not a multiple of 6,
+            the tables are left-padded with ones (cos) / zeros (sin) for the remaining channels.
+
+        Raises:
+            ValueError: if `spacing` is not one of the supported values.
+        """
+        supported_spacings = ("exp", "exp_2", "linear", "sqrt")
+        if spacing not in supported_spacings:
+            raise ValueError(
+                f"Unsupported spacing {spacing!r}; expected one of {supported_spacings}"
+            )
+
+        dtype = torch.float32  # We need full precision in the freqs_cis computation.
+        dim = self.inner_dim
+        theta = self.positional_embedding_theta
+
+        fractional_positions = self.get_fractional_positions(indices_grid)
+
+        start = 1
+        end = theta
+        device = fractional_positions.device
+        if spacing == "exp":
+            indices = theta ** (
+                torch.linspace(
+                    math.log(start, theta),
+                    math.log(end, theta),
+                    dim // 6,
+                    device=device,
+                    dtype=dtype,
+                )
+            )
+            indices = indices.to(dtype=dtype)
+        elif spacing == "exp_2":
+            indices = 1.0 / theta ** (torch.arange(0, dim, 6, device=device) / dim)
+            indices = indices.to(dtype=dtype)
+        elif spacing == "linear":
+            indices = torch.linspace(start, end, dim // 6, device=device, dtype=dtype)
+        else:  # spacing == "sqrt"
+            indices = torch.linspace(
+                start**2, end**2, dim // 6, device=device, dtype=dtype
+            ).sqrt()
+
+        indices = indices * math.pi / 2
+
+        if spacing == "exp_2":
+            freqs = (
+                (indices * fractional_positions.unsqueeze(-1))
+                .transpose(-1, -2)
+                .flatten(2)
+            )
+        else:
+            # Other spacings first remap the fractional positions with `p * 2 - 1`.
+            freqs = (
+                (indices * (fractional_positions.unsqueeze(-1) * 2 - 1))
+                .transpose(-1, -2)
+                .flatten(2)
+            )
+
+        # Every frequency is duplicated so it covers one (even, odd) channel pair.
+        cos_freq = freqs.cos().repeat_interleave(2, dim=-1)
+        sin_freq = freqs.sin().repeat_interleave(2, dim=-1)
+        if dim % 6 != 0:
+            # cos = 1 / sin = 0 leaves the leftover channels unrotated.
+            cos_padding = torch.ones_like(cos_freq[:, :, : dim % 6])
+            sin_padding = torch.zeros_like(cos_freq[:, :, : dim % 6])
+            cos_freq = torch.cat([cos_padding, cos_freq], dim=-1)
+            sin_freq = torch.cat([sin_padding, sin_freq], dim=-1)
+        return cos_freq.to(self.dtype), sin_freq.to(self.dtype)
+
+    def load_state_dict(
+        self,
+        state_dict: Dict,
+        *args,
+        **kwargs,
+    ):
+        """
+        Load weights, accepting checkpoints whose keys carry a `model.diffusion_model.` prefix.
+
+        If any key starts with that prefix, only the prefixed entries are kept and the prefix is
+        stripped from the front of each key; otherwise `state_dict` is used as is. All remaining
+        positional and keyword arguments (e.g. `strict`, `assign`) are forwarded to the parent
+        implementation, whose result is returned.
+        """
+        prefix = "model.diffusion_model."
+        if any(key.startswith(prefix) for key in state_dict):
+            state_dict = {
+                key[len(prefix):]: value
+                for key, value in state_dict.items()
+                if key.startswith(prefix)
+            }
+        return super().load_state_dict(state_dict, *args, **kwargs)
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_path: Optional[Union[str, os.PathLike]],
+        *args,
+        **kwargs,
+    ):
+        """
+        Build a model from either a diffusers-style directory or a single `.safetensors` file.
+
+        * Directory: reads `<dir>/transformer/config.json`, maps it through
+          `diffusers_and_ours_config_mapping`, merges every
+          `<dir>/transformer/diffusion_pytorch_model*.safetensors` shard, and renames keys with
+          `TRANSFORMER_KEYS_RENAME_DICT` before a strict load.
+        * Single file: takes the transformer config from the `"config"` entry of the file's metadata.
+
+        In both cases the model is instantiated on the `meta` device and the loaded tensors are
+        assigned into it. Extra `*args` / `**kwargs` are accepted but not used.
+
+        Raises:
+            ValueError: if the path is neither a directory nor an existing `.safetensors` file, or if
+                a single-file checkpoint has no `"config"` metadata entry.
+        """
+        pretrained_model_path = Path(pretrained_model_path)
+        if pretrained_model_path.is_dir():
+            config_path = pretrained_model_path / "transformer" / "config.json"
+            with open(config_path, "r") as f:
+                config = make_hashable_key(json.load(f))
+
+            assert config in diffusers_and_ours_config_mapping, (
+                "Provided diffusers checkpoint config for transformer is not suppported. "
+                "We only support diffusers configs found in Lightricks/LTX-Video."
+            )
+
+            config = diffusers_and_ours_config_mapping[config]
+            state_dict = {}
+            ckpt_paths = (
+                pretrained_model_path
+                / "transformer"
+                / "diffusion_pytorch_model*.safetensors"
+            )
+            dict_list = glob.glob(str(ckpt_paths))
+            for dict_path in dict_list:
+                part_dict = {}
+                with safe_open(dict_path, framework="pt", device="cpu") as f:
+                    for k in f.keys():
+                        part_dict[k] = f.get_tensor(k)
+                state_dict.update(part_dict)
+
+            # Iterate over a snapshot of the keys because entries are re-inserted under new names.
+            for key in list(state_dict.keys()):
+                new_key = key
+                for replace_key, rename_key in TRANSFORMER_KEYS_RENAME_DICT.items():
+                    new_key = new_key.replace(replace_key, rename_key)
+                state_dict[new_key] = state_dict.pop(key)
+
+            with torch.device("meta"):
+                transformer = cls.from_config(config)
+            transformer.load_state_dict(state_dict, assign=True, strict=True)
+        elif pretrained_model_path.is_file() and str(pretrained_model_path).endswith(
+            ".safetensors"
+        ):
+            comfy_single_file_state_dict = {}
+            with safe_open(pretrained_model_path, framework="pt", device="cpu") as f:
+                metadata = f.metadata()
+                for k in f.keys():
+                    comfy_single_file_state_dict[k] = f.get_tensor(k)
+            if metadata is None or "config" not in metadata:
+                raise ValueError(
+                    f"Checkpoint {pretrained_model_path} has no 'config' entry in its safetensors metadata"
+                )
+            configs = json.loads(metadata["config"])
+            transformer_config = configs["transformer"]
+            with torch.device("meta"):
+                transformer = Transformer3DModel.from_config(transformer_config)
+            transformer.load_state_dict(comfy_single_file_state_dict, assign=True)
+        else:
+            # Previously this fell through to `return transformer` with the name unbound.
+            raise ValueError(
+                f"Unsupported checkpoint path {pretrained_model_path}: expected a directory "
+                "or an existing .safetensors file"
+            )
+        return transformer
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ freqs_cis: list,
+ encoder_hidden_states: Optional[torch.Tensor] = None,
+ timestep: Optional[torch.LongTensor] = None,
+ class_labels: Optional[torch.LongTensor] = None,
+ cross_attention_kwargs: Dict[str, Any] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ encoder_attention_mask: Optional[torch.Tensor] = None,
+ skip_layer_mask: Optional[torch.Tensor] = None,
+ skip_layer_strategy: Optional[SkipLayerStrategy] = None,
+ latent_shape = None,
+ joint_pass = True,
+ ltxv_model = None,
+ mixed = False,
+ return_dict: bool = True,
+ ):
+ """
+ Run the transformer: project the input, apply every transformer block, then apply the final
+ scale/shift modulation and the output projection.
+
+ Args:
+ hidden_states (`torch.Tensor`):
+ Input `hidden_states`. They are passed through `self.patchify_proj` first.
+ freqs_cis (`list`):
+ Forwarded unchanged to every transformer block as `freqs_cis` (presumably the precomputed
+ rotary-embedding cos/sin tensors - TODO confirm).
+ encoder_hidden_states (`torch.Tensor`, *optional*):
+ Conditional embeddings for the cross attention layers. When `self.caption_projection` is set
+ they are passed through it and reshaped to `(batch, -1, hidden_states.shape[-1])`.
+ timestep (`torch.Tensor`, *optional*):
+ Used to indicate the denoising step. It is multiplied by `self.timestep_scale_multiplier` when
+ that attribute is truthy. If its last dimension is larger than 1 it is regrouped using
+ `latent_shape` (see below). NOTE(review): although annotated optional, the code reads
+ `timestep.shape`, so `None` is not usable here.
+ class_labels (`torch.LongTensor`, *optional*):
+ Forwarded to every transformer block.
+ cross_attention_kwargs (`Dict[str, Any]`, *optional*):
+ Forwarded to every transformer block.
+ attention_mask (`torch.Tensor`, *optional*):
+ If 2-D it is interpreted as a mask (1 = keep, 0 = discard) and converted into an additive bias
+ (keep = 0, discard = -10000.0) with a singleton dimension inserted at position 1. The conversion
+ is skipped when `self.use_tpu_flash_attention` is set.
+ encoder_attention_mask (`torch.Tensor`, *optional*):
+ Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
+
+ * Mask `(batch, sequence_length)` True = keep, False = discard.
+ * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
+
+ If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
+ above.
+ skip_layer_mask (`torch.Tensor`, *optional*):
+ Indexed with `[block_idx]` in the joint pass and with `[block_idx, i]` (block, batch item) in the
+ per-item pass; the selected slice is handed to the block.
+ skip_layer_strategy (`SkipLayerStrategy`, *optional*, defaults to `None`):
+ Forwarded to every transformer block.
+ latent_shape (*optional*):
+ Only its last two entries are read: their product is the group size used to reshape a
+ multi-valued `timestep`, of which the first entry of each group is kept.
+ joint_pass (`bool`, defaults to `True`):
+ When true each block is called once on the whole batch. Otherwise each block is called once per
+ batch item and the result is written back in place into `hidden_states`.
+ ltxv_model (*optional*):
+ Object whose `_interrupt` attribute is read while looping over the blocks; a truthy value aborts
+ the forward pass. NOTE(review): the default `None` has no `_interrupt` attribute, so callers
+ appear to be required to pass this object - confirm.
+ mixed (`bool`, defaults to `False`):
+ When true, `timestep`, `embedded_timestep` and `hidden_states` are cast with `.float()` before the
+ blocks run.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether to return a `Transformer3DModelOutput` instead of a plain tuple.
+
+ Returns:
+ `[None]` if `ltxv_model._interrupt` was truthy during the block loop. Otherwise, if `return_dict` is
+ True, a `Transformer3DModelOutput` whose `sample` is the output tensor, else a `tuple` whose first
+ element is the output tensor.
+ """
+ # for tpu attention offload 2d token masks are used. No need to transform.
+ if not self.use_tpu_flash_attention:
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
+ # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
+ # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
+ # expects mask of shape:
+ # [batch, key_tokens]
+ # adds singleton query_tokens dimension:
+ # [batch, 1, key_tokens]
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+ # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+ # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+ if attention_mask is not None and attention_mask.ndim == 2:
+ # assume that mask is expressed as:
+ # (1 = keep, 0 = discard)
+ # convert mask into a bias that can be added to attention scores:
+ # (keep = +0, discard = -10000.0)
+ attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
+ attention_mask = attention_mask.unsqueeze(1)
+
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
+ if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
+ encoder_attention_mask = (
+ 1 - encoder_attention_mask.to(hidden_states.dtype)
+ ) * -10000.0
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
+ # 1. Input
+ hidden_states = self.patchify_proj(hidden_states)
+
+ if self.timestep_scale_multiplier:
+ timestep = self.timestep_scale_multiplier * timestep
+
+ # A multi-valued timestep is regrouped by latent_shape[-2] * latent_shape[-1] entries and only the
+ # first entry of each group is kept.
+ if timestep.shape[-1] > 1:
+ timestep = timestep.reshape(timestep.shape[0], -1, latent_shape[-2] * latent_shape[-1] )
+ timestep = timestep[:, :, 0]
+
+ batch_size = hidden_states.shape[0]
+ timestep, embedded_timestep = self.adaln_single(
+ timestep.flatten(),
+ {"resolution": None, "aspect_ratio": None},
+ batch_size=batch_size,
+ hidden_dtype=hidden_states.dtype,
+ )
+ # Second dimension is 1 or number of tokens (if timestep_per_token)
+ timestep = timestep.view(batch_size, -1, timestep.shape[-1])
+ embedded_timestep = embedded_timestep.view(
+ batch_size, -1, embedded_timestep.shape[-1]
+ )
+ # Optional cast of the block inputs via `.float()`.
+ if mixed:
+ timestep = timestep.float()
+ embedded_timestep = embedded_timestep.float()
+ hidden_states = hidden_states.float()
+
+
+ # 2. Blocks
+ if self.caption_projection is not None:
+ batch_size = hidden_states.shape[0]
+ encoder_hidden_states = self.caption_projection(encoder_hidden_states)
+ encoder_hidden_states = encoder_hidden_states.view(
+ batch_size, -1, hidden_states.shape[-1]
+ )
+
+
+ # Joint pass: every block processes the whole batch in a single call.
+ if joint_pass:
+ for block_idx, block in enumerate(self.transformer_blocks):
+ hidden_states = block(
+ hidden_states,
+ freqs_cis=freqs_cis,
+ attention_mask=attention_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ timestep=timestep,
+ cross_attention_kwargs=cross_attention_kwargs,
+ class_labels=class_labels,
+ skip_layer_mask= None if skip_layer_mask is None else skip_layer_mask[block_idx],
+ skip_layer_strategy=skip_layer_strategy,
+ )
+ # NOTE(review): raises AttributeError when ltxv_model is left at its default None.
+ if ltxv_model._interrupt:
+ return [None]
+
+ else:
+ # Per-item pass: each batch item goes through the block on its own and the result is written
+ # back in place into hidden_states.
+ # NOTE(review): zip() iterates encoder_hidden_states / encoder_attention_mask, so both must be
+ # non-None on this path - confirm callers guarantee that.
+ for block_idx, block in enumerate(self.transformer_blocks):
+ for i, (one_hidden_states, one_encoder_hidden_states, one_encoder_attention_mask,one_timestep) in enumerate(zip(hidden_states, encoder_hidden_states,encoder_attention_mask,timestep)):
+ hidden_states[i][...] = block(
+ one_hidden_states.unsqueeze(0),
+ freqs_cis=freqs_cis,
+ attention_mask=attention_mask,
+ encoder_hidden_states=one_encoder_hidden_states.unsqueeze(0),
+ encoder_attention_mask=one_encoder_attention_mask.unsqueeze(0),
+ timestep=one_timestep.unsqueeze(0),
+ cross_attention_kwargs=cross_attention_kwargs,
+ class_labels=class_labels,
+ skip_layer_mask= None if skip_layer_mask is None else skip_layer_mask[block_idx, i],
+ skip_layer_strategy=skip_layer_strategy,
+ )
+ # NOTE(review): same None hazard as in the joint pass above.
+ if ltxv_model._interrupt:
+ return [None]
+
+ # 3. Output
+ # Index 0 of the combined table is used as the shift and index 1 as the scale.
+ scale_shift_values = (
+ self.scale_shift_table[None, None] + embedded_timestep[:, :, None]
+ )
+ shift, scale = scale_shift_values[:, :, 0].unsqueeze(-2), scale_shift_values[:, :, 1].unsqueeze(-2)
+ hidden_states = self.norm_out(hidden_states)
+ # Modulation
+
+
+ hidden_states = reshape_hidden_states(hidden_states, scale.shape[1])
+ # hidden_states = hidden_states * (1 + scale)
+ # The modulation is applied in place (`*=` / `+=`) rather than via new tensors.
+ hidden_states *= 1 + scale
+ hidden_states += shift
+ hidden_states = restore_hidden_states_shape(hidden_states)
+ hidden_states = self.proj_out(hidden_states)
+ if not return_dict:
+ return (hidden_states,)
+
+ return Transformer3DModelOutput(sample=hidden_states)
diff --git a/ltx_video/pipelines/__init__.py b/ltx_video/pipelines/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ltx_video/pipelines/crf_compressor.py b/ltx_video/pipelines/crf_compressor.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b9380afb7f92e0a2379c9db4cf5ce9f5a20942c
--- /dev/null
+++ b/ltx_video/pipelines/crf_compressor.py
@@ -0,0 +1,50 @@
+import av
+import torch
+import io
+import numpy as np
+
+
+def _encode_single_frame(output_file, image_array: np.ndarray, crf):
+    """
+    Encode `image_array` as a one-frame libx264 MP4 written to `output_file`.
+
+    Args:
+        output_file: Destination handed to `av.open` in write mode.
+        image_array: Image interpreted as `rgb24`; `shape[0]` becomes the stream height and
+            `shape[1]` the stream width.
+        crf: Quality setting, passed to the encoder as the string option `crf`.
+    """
+    mp4 = av.open(output_file, "w", format="mp4")
+    try:
+        encoder_options = {"crf": str(crf), "preset": "veryfast"}
+        video = mp4.add_stream("libx264", rate=1, options=encoder_options)
+        video.height = image_array.shape[0]
+        video.width = image_array.shape[1]
+
+        rgb_frame = av.VideoFrame.from_ndarray(image_array, format="rgb24")
+        yuv_frame = rgb_frame.reformat(format="yuv420p")
+
+        frame_packets = video.encode(yuv_frame)
+        mp4.mux(frame_packets)
+        # Calling encode() without a frame flushes whatever the encoder still buffers.
+        flushed_packets = video.encode()
+        mp4.mux(flushed_packets)
+    finally:
+        mp4.close()
+
+
+def _decode_single_frame(video_file):
+    """
+    Decode the first frame of the first video stream in `video_file`.
+
+    Args:
+        video_file: Source handed to `av.open`.
+
+    Returns:
+        The decoded frame converted with `to_ndarray(format="rgb24")`.
+    """
+    reader = av.open(video_file)
+    try:
+        video_streams = (s for s in reader.streams if s.type == "video")
+        first_stream = next(video_streams)
+        decoded = next(reader.decode(first_stream))
+    finally:
+        reader.close()
+    return decoded.to_ndarray(format="rgb24")
+
+
+def compress(image: torch.Tensor, crf=29):
+    """
+    Round-trip an image through a single-frame H.264 encode/decode.
+
+    Args:
+        image: Image tensor whose first two dimensions are cropped to even sizes before encoding.
+            Assumes a `(height, width, 3)` layout with values in `[0, 1]` - TODO confirm against
+            callers; values outside that range are clamped before the 8-bit conversion.
+        crf: Quality setting forwarded to the encoder. `0` disables compression and returns
+            `image` itself.
+
+    Returns:
+        `image` unchanged when `crf == 0`; otherwise a new tensor with the dtype and device of
+        `image` holding the decoded frame divided by 255. Its first two dimensions are the even
+        cropped sizes, so they can be one smaller than the input's.
+    """
+    if crf == 0:
+        return image
+
+    # Crop to even height/width (presumably required by the yuv420p encode - TODO confirm).
+    even_height = (image.shape[0] // 2) * 2
+    even_width = (image.shape[1] // 2) * 2
+    scaled = image[:even_height, :even_width] * 255.0
+    # Clamp before the uint8 cast so out-of-range values saturate instead of wrapping around.
+    image_array = scaled.clamp(0.0, 255.0).byte().cpu().numpy()
+
+    with io.BytesIO() as output_file:
+        _encode_single_frame(output_file, image_array, crf)
+        video_bytes = output_file.getvalue()
+    with io.BytesIO(video_bytes) as video_file:
+        image_array = _decode_single_frame(video_file)
+    tensor = torch.tensor(image_array, dtype=image.dtype, device=image.device) / 255.0
+    return tensor
diff --git a/ltx_video/pipelines/pipeline_ltx_video.py b/ltx_video/pipelines/pipeline_ltx_video.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff610724693dbb41bdde803ec1b78aac6a8924ad
--- /dev/null
+++ b/ltx_video/pipelines/pipeline_ltx_video.py
@@ -0,0 +1,1903 @@
+# Adapted from: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py
+import copy
+import inspect
+import math
+import re
+from contextlib import nullcontext
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.models import AutoencoderKL
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from diffusers.schedulers import DPMSolverMultistepScheduler
+from diffusers.utils import deprecate, logging
+from diffusers.utils.torch_utils import randn_tensor
+from einops import rearrange
+from transformers import (
+ T5EncoderModel,
+ T5Tokenizer,
+ AutoModelForCausalLM,
+ AutoProcessor,
+ AutoTokenizer,
+)
+
+from ltx_video.models.autoencoders.causal_video_autoencoder import (
+ CausalVideoAutoencoder,
+)
+from ltx_video.models.autoencoders.vae_encode import (
+ get_vae_size_scale_factor,
+ latent_to_pixel_coords,
+ vae_decode,
+ vae_encode,
+)
+from ltx_video.models.transformers.symmetric_patchifier import Patchifier
+from ltx_video.models.transformers.transformer3d import Transformer3DModel
+from ltx_video.schedulers.rf import TimestepShifter
+from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
+from ltx_video.utils.prompt_enhance_utils import generate_cinematic_prompt
+from ltx_video.models.autoencoders.latent_upsampler import LatentUpsampler
+from ltx_video.models.autoencoders.vae_encode import (
+ un_normalize_latents,
+ normalize_latents,
+)
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+# Resolution lookup table whose square entry ("1.0") is 1024 x 1024.
+# Each key is an aspect ratio written as a string; it approximately equals the first list entry
+# divided by the second (e.g. "0.25" -> 512 / 2048).
+# NOTE(review): entries are presumably [height, width] as in the diffusers PixArt pipeline this file
+# is adapted from - confirm before relying on the order.
+ASPECT_RATIO_1024_BIN = {
+ "0.25": [512.0, 2048.0],
+ "0.28": [512.0, 1856.0],
+ "0.32": [576.0, 1792.0],
+ "0.33": [576.0, 1728.0],
+ "0.35": [576.0, 1664.0],
+ "0.4": [640.0, 1600.0],
+ "0.42": [640.0, 1536.0],
+ "0.48": [704.0, 1472.0],
+ "0.5": [704.0, 1408.0],
+ "0.52": [704.0, 1344.0],
+ "0.57": [768.0, 1344.0],
+ "0.6": [768.0, 1280.0],
+ "0.68": [832.0, 1216.0],
+ "0.72": [832.0, 1152.0],
+ "0.78": [896.0, 1152.0],
+ "0.82": [896.0, 1088.0],
+ "0.88": [960.0, 1088.0],
+ "0.94": [960.0, 1024.0],
+ "1.0": [1024.0, 1024.0],
+ "1.07": [1024.0, 960.0],
+ "1.13": [1088.0, 960.0],
+ "1.21": [1088.0, 896.0],
+ "1.29": [1152.0, 896.0],
+ "1.38": [1152.0, 832.0],
+ "1.46": [1216.0, 832.0],
+ "1.67": [1280.0, 768.0],
+ "1.75": [1344.0, 768.0],
+ "2.0": [1408.0, 704.0],
+ "2.09": [1472.0, 704.0],
+ "2.4": [1536.0, 640.0],
+ "2.5": [1600.0, 640.0],
+ "3.0": [1728.0, 576.0],
+ "4.0": [2048.0, 512.0],
+}
+
+# Same table at half the size: every entry is half of the matching ASPECT_RATIO_1024_BIN entry
+# (square entry "1.0" is 512 x 512).
+ASPECT_RATIO_512_BIN = {
+ "0.25": [256.0, 1024.0],
+ "0.28": [256.0, 928.0],
+ "0.32": [288.0, 896.0],
+ "0.33": [288.0, 864.0],
+ "0.35": [288.0, 832.0],
+ "0.4": [320.0, 800.0],
+ "0.42": [320.0, 768.0],
+ "0.48": [352.0, 736.0],
+ "0.5": [352.0, 704.0],
+ "0.52": [352.0, 672.0],
+ "0.57": [384.0, 672.0],
+ "0.6": [384.0, 640.0],
+ "0.68": [416.0, 608.0],
+ "0.72": [416.0, 576.0],
+ "0.78": [448.0, 576.0],
+ "0.82": [448.0, 544.0],
+ "0.88": [480.0, 544.0],
+ "0.94": [480.0, 512.0],
+ "1.0": [512.0, 512.0],
+ "1.07": [512.0, 480.0],
+ "1.13": [544.0, 480.0],
+ "1.21": [544.0, 448.0],
+ "1.29": [576.0, 448.0],
+ "1.38": [576.0, 416.0],
+ "1.46": [608.0, 416.0],
+ "1.67": [640.0, 384.0],
+ "1.75": [672.0, 384.0],
+ "2.0": [704.0, 352.0],
+ "2.09": [736.0, 352.0],
+ "2.4": [768.0, 320.0],
+ "2.5": [800.0, 320.0],
+ "3.0": [864.0, 288.0],
+ "4.0": [1024.0, 256.0],
+}
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+ scheduler,
+ num_inference_steps: Optional[int] = None,
+ device: Optional[Union[str, torch.device]] = None,
+ timesteps: Optional[List[int]] = None,
+ max_timestep: Optional[float] = 1.0,
+ skip_initial_inference_steps: int = 0,
+ skip_final_inference_steps: int = 0,
+ **kwargs,
+):
+ """
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+ custom timesteps, optional skipping of leading/trailing steps and an optional upper bound on the timestep.
+ Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+ Args:
+ scheduler (`SchedulerMixin`):
+ The scheduler to get timesteps from.
+ num_inference_steps (`int`):
+ The number of diffusion steps used when generating samples with a pre-trained model. It is overwritten
+ with `len(timesteps)` when custom `timesteps` are passed.
+ device (`str` or `torch.device`, *optional*):
+ Passed to `scheduler.set_timesteps` as `device`.
+ timesteps (`List[int]`, *optional*):
+ Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+ timestep spacing strategy of the scheduler is used.
+ max_timestep (`float`, *optional*, defaults to 1.0):
+ The initial noising level for image-to-image/video-to-video. When below 1.0, only the timesteps that
+ are less than or equal to this value are kept and the scheduler is configured again with the
+ truncated list.
+ skip_initial_inference_steps (`int`, defaults to 0):
+ Number of timesteps dropped from the start of the schedule.
+ skip_final_inference_steps (`int`, defaults to 0):
+ Number of timesteps dropped from the end of the schedule.
+
+ Returns:
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule and the
+ second element is the number of inference steps.
+
+ Raises:
+ ValueError: If custom `timesteps` are passed to a scheduler whose `set_timesteps` has no `timesteps`
+ parameter, if the skip values are negative or together not smaller than `num_inference_steps`, or
+ if `max_timestep` is below the smallest remaining timestep.
+ """
+ if timesteps is not None:
+ # Custom schedules only work with schedulers whose set_timesteps exposes a `timesteps` parameter.
+ accepts_timesteps = "timesteps" in set(
+ inspect.signature(scheduler.set_timesteps).parameters.keys()
+ )
+ if not accepts_timesteps:
+ raise ValueError(
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+ f" timestep schedules. Please check whether you are using the correct scheduler."
+ )
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+ timesteps = scheduler.timesteps
+ num_inference_steps = len(timesteps)
+ else:
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+ timesteps = scheduler.timesteps
+
+ # The skip counts must be non-negative and leave at least one step.
+ if (
+ skip_initial_inference_steps < 0
+ or skip_final_inference_steps < 0
+ or skip_initial_inference_steps + skip_final_inference_steps
+ >= num_inference_steps
+ ):
+ # NOTE(review): the message below starts with unrelated `max_timestep` text (it duplicates the
+ # message of the max_timestep check further down) - looks like a merge leftover to be removed.
+ raise ValueError(
+ f"max_timestep {max_timestep} is smaller than the minimum timestep {timesteps.min()}"
+ "invalid skip inference step values: must be non-negative and the sum of skip_initial_inference_steps and skip_final_inference_steps must be less than the number of inference steps"
+ )
+
+ # NOTE(review): after this slice neither `num_inference_steps` nor the scheduler is updated unless the
+ # `max_timestep < 1.0` section below runs - confirm this is intended.
+ timesteps = timesteps[
+ skip_initial_inference_steps : len(timesteps) - skip_final_inference_steps
+ ]
+
+ if max_timestep < 1.0:
+ if max_timestep < timesteps.min():
+ raise ValueError(
+ f"max_timestep {max_timestep} is smaller than the minimum timestep {timesteps.min()}"
+ )
+ # Keep only the timesteps at or below the requested maximum and hand them back to the scheduler.
+ timesteps = timesteps[timesteps <= max_timestep]
+ num_inference_steps = len(timesteps)
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+
+ return timesteps, num_inference_steps
+
+
+@dataclass
+class ConditioningItem:
+    """
+    One frame-conditioning entry: either a single frame or a run of consecutive frames.
+
+    Attributes:
+        media_item (torch.Tensor): The media to condition on, shape=(b, 3, f, h, w).
+        media_frame_number (int): Frame number in the generated video at which the media starts.
+        conditioning_strength (float): How strongly to condition (1.0 = full conditioning).
+        media_x (Optional[int]): Left x coordinate of the media inside the generated frame, if set.
+        media_y (Optional[int]): Top y coordinate of the media inside the generated frame, if set.
+    """
+
+    # Required fields.
+    media_item: torch.Tensor
+    media_frame_number: int
+    conditioning_strength: float
+    # Optional placement of the media inside the generated frame.
+    media_x: Optional[int] = None
+    media_y: Optional[int] = None
+
+
+class LTXVideoPipeline(DiffusionPipeline):
+ r"""
+ Pipeline for generation with LTX-Video.
+
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+ Args:
+ vae ([`AutoencoderKL`]):
+ Variational Auto-Encoder (VAE) Model to encode and decode media to and from latent representations.
+ text_encoder ([`T5EncoderModel`]):
+ Frozen text-encoder. This uses
+ [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
+ [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
+ tokenizer (`T5Tokenizer`):
+ Tokenizer of class
+ [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
+ transformer ([`Transformer3DModel`]):
+ A text conditioned `Transformer3DModel` to denoise the encoded latents.
+ scheduler ([`SchedulerMixin`]):
+ A scheduler to be used in combination with `transformer` to denoise the encoded latents.
+ patchifier ([`Patchifier`]):
+ Registered as a pipeline module alongside the components above.
+ prompt_enhancer_image_caption_model, prompt_enhancer_image_caption_processor,
+ prompt_enhancer_llm_model, prompt_enhancer_llm_tokenizer:
+ Optional prompt-enhancer components; `check_inputs` requires all four when `enhance_prompt` is True.
+ """
+
+ # Matches one or more consecutive characters taken from a fixed set of punctuation/symbol characters.
+ bad_punct_regex = re.compile(
+ r"["
+ + "#®•©™&@·º½¾¿¡§~"
+ + r"\)"
+ + r"\("
+ + r"\]"
+ + r"\["
+ + r"\}"
+ + r"\{"
+ + r"\|"
+ + "\\"
+ + r"\/"
+ + r"\*"
+ + r"]{1,}"
+ ) # noqa
+
+ # Names of the components declared optional for this pipeline.
+ _optional_components = [
+ "tokenizer",
+ "text_encoder",
+ "prompt_enhancer_image_caption_model",
+ "prompt_enhancer_image_caption_processor",
+ "prompt_enhancer_llm_model",
+ "prompt_enhancer_llm_tokenizer",
+ ]
+ # Component order joined by "->" (presumably consumed by the DiffusionPipeline CPU-offload logic - TODO confirm).
+ model_cpu_offload_seq = "prompt_enhancer_image_caption_model->prompt_enhancer_llm_model->text_encoder->transformer->vae"
+
+    def __init__(
+        self,
+        tokenizer: T5Tokenizer,
+        text_encoder: T5EncoderModel,
+        vae: AutoencoderKL,
+        transformer: Transformer3DModel,
+        scheduler: DPMSolverMultistepScheduler,
+        patchifier: Patchifier,
+        prompt_enhancer_image_caption_model: AutoModelForCausalLM,
+        prompt_enhancer_image_caption_processor: AutoProcessor,
+        prompt_enhancer_llm_model: AutoModelForCausalLM,
+        prompt_enhancer_llm_tokenizer: AutoTokenizer,
+        allowed_inference_steps: Optional[List[float]] = None,
+    ):
+        """
+        Register the pipeline components and derive the VAE scale factors.
+
+        Every component argument is registered through `register_modules` under its own name.
+        `video_scale_factor` and `vae_scale_factor` are taken from the first two values returned
+        by `get_vae_size_scale_factor(self.vae)`, and a `VaeImageProcessor` is created with the
+        latter. `allowed_inference_steps` is stored as given.
+        """
+        super().__init__()
+
+        components = dict(
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            vae=vae,
+            transformer=transformer,
+            scheduler=scheduler,
+            patchifier=patchifier,
+            prompt_enhancer_image_caption_model=prompt_enhancer_image_caption_model,
+            prompt_enhancer_image_caption_processor=prompt_enhancer_image_caption_processor,
+            prompt_enhancer_llm_model=prompt_enhancer_llm_model,
+            prompt_enhancer_llm_tokenizer=prompt_enhancer_llm_tokenizer,
+        )
+        self.register_modules(**components)
+
+        # The third returned value is not needed here.
+        scale_factors = get_vae_size_scale_factor(self.vae)
+        self.video_scale_factor, self.vae_scale_factor, _ = scale_factors
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+        self.allowed_inference_steps = allowed_inference_steps
+
+    def mask_text_embeddings(self, emb, mask):
+        """
+        Restrict text embeddings to the tokens selected by `mask`.
+
+        For a batch of one, the embeddings are truncated along dimension 2 to the first
+        `mask.sum()` positions and that count is returned with them. For larger batches the
+        embeddings are multiplied by the mask (broadcast as `mask[:, None, :, None]`) and the
+        full size of dimension 2 is returned.
+
+        Returns:
+            A `(embeddings, length)` tuple.
+        """
+        if emb.shape[0] != 1:
+            return emb * mask[:, None, :, None], emb.shape[2]
+
+        kept_tokens = mask.sum().item()
+        return emb[:, :, :kept_tokens, :], kept_tokens
+
+ # Adapted from diffusers.pipelines.deepfloyd_if.pipeline_if.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: str = "",
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_attention_mask: Optional[torch.FloatTensor] = None,
+        negative_prompt_attention_mask: Optional[torch.FloatTensor] = None,
+        text_encoder_max_tokens: int = 256,
+        **kwargs,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str`, *optional*, defaults to `""`):
+                The prompt not to guide the generation. Only used when `do_classifier_free_guidance`
+                is true and `negative_prompt_embeds` is not given.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                number of generations per prompt; embeddings and masks are repeated accordingly
+            device: (`torch.device`, *optional*):
+                torch device to place the resulting embeddings on. Defaults to
+                `self._execution_device`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings.
+            prompt_attention_mask (`torch.FloatTensor`, *optional*):
+                Attention mask matching `prompt_embeds`; replaced by the tokenizer's mask when the
+                prompt is encoded here.
+            negative_prompt_attention_mask (`torch.FloatTensor`, *optional*):
+                Attention mask matching `negative_prompt_embeds`; replaced by the tokenizer's mask
+                when the negative prompt is encoded here.
+            text_encoder_max_tokens (`int`, *optional*, defaults to 256):
+                Length the prompt is padded/truncated to by the tokenizer.
+
+        Returns:
+            A tuple `(prompt_embeds, prompt_attention_mask, negative_prompt_embeds,
+            negative_prompt_attention_mask)`. The two negative entries are `None` when
+            `do_classifier_free_guidance` is false.
+
+        Raises:
+            ValueError: If negative embeddings have to be computed but no text encoder is available.
+        """
+
+        if "mask_feature" in kwargs:
+            deprecation_message = "The use of `mask_feature` is deprecated. It is no longer used in any computation and that doesn't affect the end results. It will be removed in a future version."
+            deprecate("mask_feature", "1.0.0", deprecation_message, standard_warn=False)
+
+        if device is None:
+            device = self._execution_device
+
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # See Section 3.1. of the paper.
+        max_length = (
+            text_encoder_max_tokens  # TPU supports only lengths multiple of 128
+        )
+        if prompt_embeds is None:
+            assert (
+                self.text_encoder is not None
+            ), "You should provide either prompt_embeds or self.text_encoder should not be None,"
+            text_enc_device = next(self.text_encoder.parameters()).device
+            prompt = self._text_preprocessing(prompt)
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(
+                prompt, padding="longest", return_tensors="pt"
+            ).input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[
+                -1
+            ] and not torch.equal(text_input_ids, untruncated_ids):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {max_length} tokens: {removed_text}"
+                )
+
+            # Feed the encoder a mask that lives on the same device as the token ids (as the
+            # negative branch below does), then move the mask to the target device.
+            prompt_attention_mask = text_inputs.attention_mask.to(text_enc_device)
+
+            prompt_embeds = self.text_encoder(
+                text_input_ids.to(text_enc_device), attention_mask=prompt_attention_mask
+            )
+            prompt_embeds = prompt_embeds[0]
+            prompt_attention_mask = prompt_attention_mask.to(device)
+
+        if self.text_encoder is not None:
+            dtype = self.text_encoder.dtype
+        elif self.transformer is not None:
+            dtype = self.transformer.dtype
+        else:
+            dtype = None
+
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(
+            bs_embed * num_images_per_prompt, seq_len, -1
+        )
+        prompt_attention_mask = prompt_attention_mask.repeat(1, num_images_per_prompt)
+        prompt_attention_mask = prompt_attention_mask.view(
+            bs_embed * num_images_per_prompt, -1
+        )
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            if self.text_encoder is None:
+                raise ValueError(
+                    "`negative_prompt_embeds` must be provided when classifier free guidance is"
+                    " enabled and no text encoder is available."
+                )
+            # Resolved here as well: it used to be defined only when the prompt itself was
+            # encoded above, which broke the `prompt_embeds`-provided path.
+            text_enc_device = next(self.text_encoder.parameters()).device
+            uncond_tokens = self._text_preprocessing(negative_prompt)
+            uncond_tokens = uncond_tokens * batch_size
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_attention_mask=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            negative_prompt_attention_mask = uncond_input.attention_mask
+            negative_prompt_attention_mask = negative_prompt_attention_mask.to(
+                text_enc_device
+            )
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(text_enc_device),
+                attention_mask=negative_prompt_attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(
+                dtype=dtype, device=device
+            )
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(
+                1, num_images_per_prompt, 1
+            )
+            negative_prompt_embeds = negative_prompt_embeds.view(
+                batch_size * num_images_per_prompt, seq_len, -1
+            )
+
+            negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(
+                1, num_images_per_prompt
+            )
+            negative_prompt_attention_mask = negative_prompt_attention_mask.view(
+                bs_embed * num_images_per_prompt, -1
+            )
+            # Keep the negative mask on the same device as the negative embeddings.
+            negative_prompt_attention_mask = negative_prompt_attention_mask.to(device)
+        else:
+            negative_prompt_embeds = None
+            negative_prompt_attention_mask = None
+
+        return (
+            prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_embeds,
+            negative_prompt_attention_mask,
+        )
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        """
+        Collect the optional keyword arguments that `self.scheduler.step` accepts.
+
+        Not all schedulers share the same `step` signature, so `eta` and `generator` are only
+        included when the scheduler's `step` declares a parameter of that name. eta (η) is only
+        used with the DDIMScheduler; it corresponds to η in the DDIM paper
+        (https://arxiv.org/abs/2010.02502) and should be between [0, 1].
+
+        Returns:
+            A dict containing `"eta"` and/or `"generator"`, or an empty dict.
+        """
+        step_parameters = inspect.signature(self.scheduler.step).parameters
+        candidates = (("eta", eta), ("generator", generator))
+        return {name: value for name, value in candidates if name in step_parameters}
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        negative_prompt,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        prompt_attention_mask=None,
+        negative_prompt_attention_mask=None,
+        enhance_prompt=False,
+    ):
+        """
+        Validate the generation arguments; returns nothing when they are consistent.
+
+        Raises:
+            ValueError: If `height`/`width` are not multiples of 8, if `prompt` and the embedding
+                arguments are combined inconsistently, if an attention mask is missing for given
+                embeddings, or if positive and negative embeddings/masks differ in shape.
+            AssertionError: If `enhance_prompt` is true and one of the prompt-enhancer components
+                is `None`.
+        """
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(
+                f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
+            )
+
+        has_prompt = prompt is not None
+        has_embeds = prompt_embeds is not None
+        has_negative_embeds = negative_prompt_embeds is not None
+
+        # Exactly one of `prompt` / `prompt_embeds` must be given, and `prompt` must be str or list.
+        if has_prompt and has_embeds:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        if not has_prompt and not has_embeds:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        if has_prompt and not isinstance(prompt, (str, list)):
+            raise ValueError(
+                f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
+            )
+
+        if has_prompt and has_negative_embeds:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if negative_prompt is not None and has_negative_embeds:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        # Pre-computed embeddings must come with their attention masks.
+        if has_embeds and prompt_attention_mask is None:
+            raise ValueError(
+                "Must provide `prompt_attention_mask` when specifying `prompt_embeds`."
+            )
+
+        if has_negative_embeds and negative_prompt_attention_mask is None:
+            raise ValueError(
+                "Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`."
+            )
+
+        if has_embeds and has_negative_embeds:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+            if prompt_attention_mask.shape != negative_prompt_attention_mask.shape:
+                raise ValueError(
+                    "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but"
+                    f" got: `prompt_attention_mask` {prompt_attention_mask.shape} != `negative_prompt_attention_mask`"
+                    f" {negative_prompt_attention_mask.shape}."
+                )
+
+        if enhance_prompt:
+            # Checked in this order so the first missing component determines the message.
+            required_components = (
+                (
+                    "prompt_enhancer_image_caption_model",
+                    "Image caption model must be initialized if enhance_prompt is True",
+                ),
+                (
+                    "prompt_enhancer_image_caption_processor",
+                    "Image caption processor must be initialized if enhance_prompt is True",
+                ),
+                (
+                    "prompt_enhancer_llm_model",
+                    "Text prompt enhancer model must be initialized if enhance_prompt is True",
+                ),
+                (
+                    "prompt_enhancer_llm_tokenizer",
+                    "Text prompt enhancer tokenizer must be initialized if enhance_prompt is True",
+                ),
+            )
+            for attribute_name, message in required_components:
+                assert getattr(self, attribute_name) is not None, message
+
+    def _text_preprocessing(self, text):
+        """
+        Normalize prompt text into a list of stripped strings.
+
+        A single value is wrapped in a one-element list; tuples and lists are processed element
+        by element. Each entry has leading and trailing whitespace removed.
+        """
+        entries = text if isinstance(text, (tuple, list)) else [text]
+        return [entry.strip() for entry in entries]
+
+    @staticmethod
+    def add_noise_to_image_conditioning_latents(
+        t: float,
+        init_latents: torch.Tensor,
+        latents: torch.Tensor,
+        noise_scale: float,
+        conditioning_mask: torch.Tensor,
+        generator,
+        eps=1e-6,
+    ):
+        """
+        Re-noise the hard-conditioning latents as a function of the timestep.
+
+        Gaussian noise scaled by `noise_scale * t**2` is added to `init_latents`; the result replaces
+        `latents` wherever `conditioning_mask` exceeds `1.0 - eps`. Every other position keeps its
+        current value. This helps with motion continuity, especially when conditioned on a single frame.
+        """
+        # Positions whose conditioning strength is (numerically) 1.0 count as hard-conditioned.
+        hard_conditioned = (conditioning_mask > 1.0 - eps).unsqueeze(-1)
+        fresh_noise = randn_tensor(
+            latents.shape,
+            generator=generator,
+            device=latents.device,
+            dtype=latents.dtype,
+        )
+        perturbed = init_latents + noise_scale * fresh_noise * (t**2)
+        return torch.where(hard_conditioned, perturbed, latents)
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(
+        self,
+        latents: torch.Tensor | None,
+        media_items: torch.Tensor | None,
+        timestep: float,
+        latent_shape: torch.Size | Tuple[Any, ...],
+        dtype: torch.dtype,
+        device: torch.device,
+        generator: torch.Generator | List[torch.Generator],
+        vae_per_channel_normalize: bool = True,
+    ):
+        """
+        Build the latent tensor the denoising loop starts from.
+
+        The result is either pure (scheduler-scaled) noise, or a blend
+        `timestep * noise + (1 - timestep) * latents` of that noise with the supplied latents /
+        VAE-encoded media items.
+
+        Args:
+            latents (`torch.Tensor` or `None`):
+                User-provided latents, or `None`. Mutually exclusive with `media_items`.
+            media_items (`torch.Tensor` or `None`):
+                Media to encode with the VAE and then noise, or `None`.
+            timestep (`float`):
+                Noise level of the first denoising step; must be below 1.0 when `latents` or
+                `media_items` is given.
+            latent_shape (`torch.Size` or tuple):
+                Target latent shape, unpacked as `(b, c, f, h, w)`.
+            dtype (`torch.dtype`):
+                The target dtype.
+            device (`torch.device`):
+                The target device.
+            generator (`torch.Generator` or `List[torch.Generator]`):
+                Generator(s) used to draw the noise; a list must have `latent_shape[0]` entries.
+            vae_per_channel_normalize (`bool`):
+                Forwarded to `vae_encode` when `media_items` is encoded.
+
+        Returns:
+            `torch.Tensor`: The initial latents, of shape `latent_shape`, i.e. `(b, c, f, h, w)`.
+
+        Raises:
+            ValueError: If `generator` is a list whose length differs from the batch size.
+            AssertionError: If both `latents` and `media_items` are given, if either is given with
+                `timestep >= 1.0`, or if the (encoded) latents do not match `latent_shape`.
+        """
+        if isinstance(generator, list) and len(generator) != latent_shape[0]:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {latent_shape[0]}. Make sure the batch size matches the length of the generators."
+            )
+
+        # At most one source of initial content may be supplied.
+        assert (
+            latents is None or media_items is None
+        ), "Cannot provide both latents and media_items. Please provide only one of the two."
+
+        # With timestep >= 1.0 the supplied content would be fully overwritten by noise.
+        assert (
+            latents is None and media_items is None
+        ) or timestep < 1.0, "Input media_item or latents are provided, but they will be replaced with noise."
+
+        if media_items is not None:
+            encoder_input = media_items.to(dtype=self.vae.dtype, device=self.vae.device)
+            latents = vae_encode(
+                encoder_input,
+                self.vae,
+                vae_per_channel_normalize=vae_per_channel_normalize,
+            )
+        if latents is not None:
+            assert (
+                latents.shape == latent_shape
+            ), f"Latents have to be of shape {latent_shape} but are {latents.shape}."
+            latents = latents.to(device=device, dtype=dtype)
+
+        # For backward compatibility the noise is drawn in the "patchified" (b, f*h*w, c) layout
+        # and only then rearranged to (b, c, f, h, w).
+        batch, channels, frames, rows, cols = latent_shape
+        noise = randn_tensor(
+            (batch, frames * rows * cols, channels),
+            generator=generator,
+            device=device,
+            dtype=dtype,
+        )
+        noise = rearrange(
+            noise, "b (f h w) c -> b c f h w", f=frames, h=rows, w=cols
+        )
+
+        # Scale the initial noise by the standard deviation required by the scheduler.
+        noise = noise * self.scheduler.init_noise_sigma
+
+        if latents is None:
+            return noise
+
+        # Noise the provided latents to the required (first) timestep.
+        return timestep * noise + (1 - timestep) * latents
+
+    @staticmethod
+    def classify_height_width_bin(
+        height: int, width: int, ratios: dict
+    ) -> Tuple[int, int]:
+        """Return the `(height, width)` entry of `ratios` whose key is closest to `height / width`."""
+        aspect = float(height / width)
+
+        def distance(key):
+            # Keys are converted with float() before being compared to the aspect ratio.
+            return abs(float(key) - aspect)
+
+        binned = ratios[min(ratios.keys(), key=distance)]
+        return int(binned[0]), int(binned[1])
+
+    @staticmethod
+    def resize_and_crop_tensor(
+        samples: torch.Tensor, new_width: int, new_height: int
+    ) -> torch.Tensor:
+        """
+        Resize `samples` so that it covers the target size, then center-crop to exactly that size.
+
+        The last two dimensions of `samples` are treated as (height, width). When they already equal
+        the target, `samples` is returned untouched. Otherwise both dimensions are scaled by the same
+        factor (the larger of the two per-axis ratios, so the aspect ratio is kept), the tensor is
+        resized through `LTXVideoPipeline.resize_tensor`, and the centered `new_height x new_width`
+        window is returned.
+
+        Args:
+            samples (`torch.Tensor`): Tensor whose trailing dimensions are (height, width).
+            new_width (`int`): Target width.
+            new_height (`int`): Target height.
+
+        Returns:
+            `torch.Tensor`: The resized and cropped tensor (or `samples` itself when no resize is needed).
+        """
+        orig_height, orig_width = samples.shape[-2:]
+
+        # Nothing to do when the size already matches.
+        if orig_height == new_height and orig_width == new_width:
+            return samples
+
+        ratio = max(new_height / orig_height, new_width / orig_width)
+        # Float round-off can make `int(orig * ratio)` land one pixel short of the target
+        # (e.g. `int(100 * (29 / 100)) == 28`), which would produce a negative crop offset and a
+        # wrongly sized result. Clamp so the resized tensor always covers the target.
+        resized_width = max(int(orig_width * ratio), new_width)
+        resized_height = max(int(orig_height * ratio), new_height)
+
+        # Resize
+        samples = LTXVideoPipeline.resize_tensor(samples, resized_height, resized_width)
+
+        # Center crop
+        start_x = (resized_width - new_width) // 2
+        start_y = (resized_height - new_height) // 2
+        return samples[
+            ..., start_y : start_y + new_height, start_x : start_x + new_width
+        ]
+
+    @staticmethod
+    def resize_tensor(media_items, height, width):
+        """
+        Bilinearly resize every frame of a `(b, c, n, h, w)` tensor to `(height, width)`.
+
+        The frame axis is folded into the batch axis for `F.interpolate` and unfolded afterwards.
+        When the trailing two dimensions already equal `(height, width)` the input is returned as is.
+        """
+        frame_count = media_items.shape[2]
+        if media_items.shape[-2:] == (height, width):
+            return media_items
+
+        frames_as_batch = rearrange(media_items, "b c n h w -> (b n) c h w")
+        resized = F.interpolate(
+            frames_as_batch,
+            size=(height, width),
+            mode="bilinear",
+            align_corners=False,
+        )
+        return rearrange(resized, "(b n) c h w -> b c n h w", n=frame_count)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        height: int,
+        width: int,
+        num_frames: int,
+        frame_rate: float,
+        prompt: Union[str, List[str]] = None,
+        negative_prompt: str = None,
+        num_inference_steps: int = 20,
+        timesteps: List[int] = None,
+        guidance_scale: Union[float, List[float]] = 4.5,
+        skip_layer_strategy: Optional[SkipLayerStrategy] = None,
+        skip_block_list: Optional[Union[List[List[int]], List[int]]] = None,
+        stg_scale: Union[float, List[float]] = 1.0,
+        rescaling_scale: Union[float, List[float]] = 0.7,
+        guidance_timesteps: Optional[List[int]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_attention_mask: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_attention_mask: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        conditioning_items: Optional[List[ConditioningItem]] = None,
+        decode_timestep: Union[List[float], float] = 0.0,
+        decode_noise_scale: Optional[List[float]] = None,
+        mixed_precision: bool = False,
+        offload_to_cpu: bool = False,
+        enhance_prompt: bool = False,
+        text_encoder_max_tokens: int = 256,
+        stochastic_sampling: bool = False,
+        media_items: Optional[torch.Tensor] = None,
+        strength: Optional[float] = 1.0,
+        skip_initial_inference_steps: int = 0,
+        skip_final_inference_steps: int = 0,
+        joint_pass: bool = False,
+        pass_no: int = -1,
+        ltxv_model = None,
+        callback=None,
+        **kwargs,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Runs the denoising loop and, unless `output_type == "latent"`, decodes the result with the VAE.
+
+        NOTE: inside this method the text inputs are consumed as pre-computed embeddings
+        (`prompt_embeds` / `negative_prompt_embeds` and their attention masks). `prompt` and
+        `negative_prompt` are only passed to `check_inputs`, and `prompt` is used to derive the batch size.
+        `offload_to_cpu`, `enhance_prompt` and `text_encoder_max_tokens` are not read by this method.
+
+        Args:
+            height (`int`):
+                The height in pixels of the generated frames. Divided by `self.vae_scale_factor` to get the
+                latent height.
+            width (`int`):
+                The width in pixels of the generated frames. Divided by `self.vae_scale_factor` to get the
+                latent width.
+            num_frames (`int`):
+                The number of frames to generate. Divided by `self.video_scale_factor` to get the number of
+                latent frames.
+            frame_rate (`float`):
+                The temporal coordinate of every token is multiplied by `1 / frame_rate` before
+                `precompute_freqs_cis` is called.
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the generation. If not defined, one has to pass `prompt_embeds`
+                instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead.
+            num_inference_steps (`int`, *optional*, defaults to 20):
+                The number of denoising steps. More denoising steps usually lead to a higher quality result at the
+                expense of slower inference. If `timesteps` is provided, this parameter is ignored.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps`
+                timesteps are used. Must be in descending order.
+            guidance_scale (`float` or `List[float]`, *optional*, defaults to 4.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance is enabled by setting `guidance_scale >
+                1`; values that are not greater than 1 are replaced by 0 (no guidance at that step). A list is
+                indexed per timestep through `guidance_timesteps`.
+            skip_layer_strategy (`SkipLayerStrategy`, *optional*):
+                Forwarded to the transformer together with the per-step skip-layer mask.
+            skip_block_list (`List[int]` or `List[List[int]]`, *optional*):
+                Blocks used to build the skip-layer masks when spatio-temporal guidance is active. A flat list is
+                used for every timestep; a list of lists is indexed per timestep through `guidance_timesteps`.
+            stg_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                Spatio-temporal guidance scale; it is active when any value is greater than 0.
+            rescaling_scale (`float` or `List[float]`, *optional*, defaults to 0.7):
+                Weight of the std-based rescaling applied after spatio-temporal guidance; a value of 1.0
+                everywhere disables the rescaling.
+            guidance_timesteps (`List[int]`, *optional*):
+                Thresholds used to pick, for every denoising timestep, the entry of the list-valued guidance
+                arguments: the first index whose threshold is `<=` the timestep, or the last index if none is.
+                Required when any of those arguments is given as a list.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of outputs to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated latents to start from. If not provided, a latents tensor will be generated by
+                sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings.
+            prompt_attention_mask (`torch.FloatTensor`, *optional*): Pre-generated attention mask for text embeddings.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings; used when classifier-free guidance is active.
+            negative_prompt_attention_mask (`torch.FloatTensor`, *optional*):
+                Pre-generated attention mask for negative text embeddings.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format, forwarded to `self.image_processor.postprocess`. With `"latent"` the
+                unpatchified latents are returned without VAE decoding.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                If `False`, the result is wrapped in a 1-tuple; otherwise it is returned directly.
+            callback_on_step_end (`Callable`, *optional*):
+                Called at the end of each denoising step as `callback_on_step_end(self, step, timestep, {})`.
+            conditioning_items (`List[ConditioningItem]`, *optional*):
+                Conditioning media, forwarded to `prepare_conditioning`.
+            decode_timestep (`float` or `List[float]`, *optional*, defaults to 0.0):
+                Timestep passed to the VAE decoder; only used when `self.vae.decoder.timestep_conditioning` is set.
+            decode_noise_scale (`float` or `List[float]`, *optional*):
+                Amount of noise blended into the latents before decoding (same condition as above). Defaults to
+                `decode_timestep`.
+            mixed_precision (`bool`, *optional*, defaults to `False`):
+                If `True`, the latents are kept in float32 and the transformer is run under `torch.autocast`.
+            stochastic_sampling (`bool`, *optional*, defaults to `False`):
+                Forwarded to the scheduler step: stochastic if `True`, deterministic if `False`.
+            media_items (`torch.Tensor`, *optional*):
+                The input media item used for image-to-image / video-to-video.
+                When provided, they will be noised according to `strength` and then fully denoised.
+            strength (`float`, *optional*, defaults to 1.0):
+                The editing level in image-to-image / video-to-video. The provided input will be noised
+                to this level. Values other than 1.0 require `latents` or `media_items`.
+            skip_initial_inference_steps (`int`, *optional*, defaults to 0):
+                Forwarded to `retrieve_timesteps`.
+            skip_final_inference_steps (`int`, *optional*, defaults to 0):
+                Forwarded to `retrieve_timesteps`.
+            joint_pass, ltxv_model:
+                Forwarded unchanged to the transformer call.
+            pass_no (`int`, *optional*, defaults to -1):
+                Forwarded unchanged to `callback`.
+            callback (`Callable`, *optional*):
+                Progress hook. Called once before the loop as
+                `callback(-1, None, True, override_num_inference_steps=..., pass_no=...)` and after every step as
+                `callback(step, preview_latents, False, pass_no=...)`.
+            **kwargs:
+                `is_video` (default `False`), `vae_per_channel_normalize` (default `True`) and
+                `image_cond_noise_scale` (default `0.0`) are read from here. `mask_feature` is deprecated.
+
+        Returns:
+            `None` if the transformer call returns `None`. Otherwise the post-processed decoded output (or the
+            latents when `output_type == "latent"`), wrapped in a 1-tuple when `return_dict` is `False`.
+        """
+        if "mask_feature" in kwargs:
+            deprecation_message = "The use of `mask_feature` is deprecated. It is no longer used in any computation and that doesn't affect the end results. It will be removed in a future version."
+            deprecate("mask_feature", "1.0.0", deprecation_message, standard_warn=False)
+
+        is_video = kwargs.get("is_video", False)
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_attention_mask,
+        )
+
+        # 2. Default height and width to transformer
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        self.video_scale_factor = self.video_scale_factor if is_video else 1
+        vae_per_channel_normalize = kwargs.get("vae_per_channel_normalize", True)
+        image_cond_noise_scale = kwargs.get("image_cond_noise_scale", 0.0)
+
+        latent_height = height // self.vae_scale_factor
+        latent_width = width // self.vae_scale_factor
+        latent_num_frames = num_frames // self.video_scale_factor
+        if isinstance(self.vae, CausalVideoAutoencoder) and is_video:
+            latent_num_frames += 1
+        latent_shape = (
+            batch_size * num_images_per_prompt,
+            self.transformer.config.in_channels,
+            latent_num_frames,
+            latent_height,
+            latent_width,
+        )
+
+        # Prepare the list of denoising time-steps
+
+        retrieve_timesteps_kwargs = {}
+        if isinstance(self.scheduler, TimestepShifter):
+            retrieve_timesteps_kwargs["samples_shape"] = latent_shape
+
+        assert strength == 1.0 or latents is not None or media_items is not None, (
+            "strength < 1 is used for image-to-image/video-to-video - "
+            "media_item or latents should be provided."
+        )
+
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler,
+            num_inference_steps,
+            device,
+            timesteps,
+            max_timestep=strength,
+            skip_initial_inference_steps=skip_initial_inference_steps,
+            skip_final_inference_steps=skip_final_inference_steps,
+            **retrieve_timesteps_kwargs,
+        )
+        if self.allowed_inference_steps is not None:
+            for timestep in [round(x, 4) for x in timesteps.tolist()]:
+                assert (
+                    timestep in self.allowed_inference_steps
+                ), f"Invalid inference timestep {timestep}. Allowed timesteps are {self.allowed_inference_steps}."
+
+        if guidance_timesteps:
+            # For every denoising timestep pick the first guidance entry whose threshold is <= it,
+            # falling back to the last entry when none qualifies.
+            guidance_mapping = []
+            for timestep in timesteps:
+                indices = [
+                    i for i, val in enumerate(guidance_timesteps) if val <= timestep
+                ]
+                # assert len(indices) > 0, f"No guidance timestep found for {timestep}"
+                guidance_mapping.append(
+                    indices[0] if len(indices) > 0 else (len(guidance_timesteps) - 1)
+                )
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        if not isinstance(guidance_scale, List):
+            guidance_scale = [guidance_scale] * len(timesteps)
+        else:
+            guidance_scale = [
+                guidance_scale[guidance_mapping[i]] for i in range(len(timesteps))
+            ]
+
+        # For simplicity, we are using a constant num_conds for all timesteps, so we need to zero
+        # out cases where the guidance scale should not be applied.
+        guidance_scale = [x if x > 1.0 else 0.0 for x in guidance_scale]
+
+        if not isinstance(stg_scale, List):
+            stg_scale = [stg_scale] * len(timesteps)
+        else:
+            stg_scale = [stg_scale[guidance_mapping[i]] for i in range(len(timesteps))]
+
+        if not isinstance(rescaling_scale, List):
+            rescaling_scale = [rescaling_scale] * len(timesteps)
+        else:
+            rescaling_scale = [
+                rescaling_scale[guidance_mapping[i]] for i in range(len(timesteps))
+            ]
+
+        do_classifier_free_guidance = any(x > 1.0 for x in guidance_scale)
+        do_spatio_temporal_guidance = any(x > 0.0 for x in stg_scale)
+        do_rescaling = any(x != 1.0 for x in rescaling_scale)
+
+        # One forward "condition" for the prompt, plus one each for CFG and STG when active.
+        num_conds = 1
+        if do_classifier_free_guidance:
+            num_conds += 1
+        if do_spatio_temporal_guidance:
+            num_conds += 1
+
+        # Normalize skip_block_list to always be None or a list of lists matching timesteps
+        if skip_block_list is not None:
+            # Convert single list to list of lists if needed
+            if len(skip_block_list) == 0 or not isinstance(skip_block_list[0], list):
+                skip_block_list = [skip_block_list] * len(timesteps)
+            else:
+                new_skip_block_list = []
+                for i, timestep in enumerate(timesteps):
+                    new_skip_block_list.append(skip_block_list[guidance_mapping[i]])
+                skip_block_list = new_skip_block_list
+
+        # Prepare skip layer masks
+        skip_layer_masks: Optional[List[torch.Tensor]] = None
+        if do_spatio_temporal_guidance:
+            if skip_block_list is not None:
+                skip_layer_masks = [
+                    self.transformer.create_skip_layer_mask(
+                        batch_size, num_conds, num_conds - 1, skip_blocks
+                    )
+                    for skip_blocks in skip_block_list
+                ]
+
+
+        # if offload_to_cpu and self.text_encoder is not None:
+        #     self.text_encoder = self.text_encoder.cpu()
+
+        # self.transformer = self.transformer.to(self._execution_device)
+
+        # Batch layout along dim 0: [negative (if CFG), positive, positive again for STG (if STG)].
+        prompt_embeds_batch = prompt_embeds
+        prompt_attention_mask_batch = prompt_attention_mask
+        if do_classifier_free_guidance:
+            prompt_embeds_batch = torch.cat(
+                [negative_prompt_embeds, prompt_embeds], dim=0
+            )
+            # Move the negative mask next to the positive one instead of assuming a CUDA device.
+            prompt_attention_mask_batch = torch.cat(
+                [
+                    negative_prompt_attention_mask.to(prompt_attention_mask.device),
+                    prompt_attention_mask,
+                ],
+                dim=0,
+            )
+        if do_spatio_temporal_guidance:
+            prompt_embeds_batch = torch.cat([prompt_embeds_batch, prompt_embeds], dim=0)
+            prompt_attention_mask_batch = torch.cat(
+                [
+                    prompt_attention_mask_batch,
+                    prompt_attention_mask,
+                ],
+                dim=0,
+            )
+
+        # 4. Prepare the initial latents using the provided media and conditioning items
+
+        # Prepare the initial latents tensor, shape = (b, c, f, h, w)
+        latents = self.prepare_latents(
+            latents=latents,
+            media_items=media_items,
+            timestep=timesteps[0],
+            latent_shape=latent_shape,
+            dtype=torch.float32 if mixed_precision else prompt_embeds_batch.dtype,
+            device=device,
+            generator=generator,
+            vae_per_channel_normalize=vae_per_channel_normalize,
+        )
+
+        # Update the latents with the conditioning items and patchify them into (b, n, c)
+        latents, pixel_coords, conditioning_mask, num_cond_latents = (
+            self.prepare_conditioning(
+                conditioning_items=conditioning_items,
+                init_latents=latents,
+                num_frames=num_frames,
+                height=height,
+                width=width,
+                vae_per_channel_normalize=vae_per_channel_normalize,
+                generator=generator,
+            )
+        )
+        init_latents = latents.clone()  # Used for image_cond_noise_update
+
+        # pixel_coords = torch.cat([pixel_coords] * num_conds)
+        orig_conditioning_mask = conditioning_mask
+        if conditioning_mask is not None and is_video:
+            assert num_images_per_prompt == 1
+            conditioning_mask = torch.cat([conditioning_mask] * num_conds)
+        fractional_coords = pixel_coords.to(torch.float32)
+        fractional_coords[:, 0] = fractional_coords[:, 0] * (1.0 / frame_rate)
+        freqs_cis = self.transformer.precompute_freqs_cis(fractional_coords)
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Denoising loop
+        num_warmup_steps = max(
+            len(timesteps) - num_inference_steps * self.scheduler.order, 0
+        )
+        cfg_star_rescale = True
+
+
+        if callback is not None:
+            callback(-1, None, True, override_num_inference_steps = num_inference_steps, pass_no =pass_no)
+
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if conditioning_mask is not None and image_cond_noise_scale > 0.0:
+                    latents = self.add_noise_to_image_conditioning_latents(
+                        t,
+                        init_latents,
+                        latents,
+                        image_cond_noise_scale,
+                        orig_conditioning_mask,
+                        generator,
+                    )
+
+                latent_model_input = (
+                    torch.cat([latents] * num_conds) if num_conds > 1 else latents
+                )
+                latent_model_input = self.scheduler.scale_model_input(
+                    latent_model_input, t
+                )
+
+                current_timestep = t
+                if not torch.is_tensor(current_timestep):
+                    # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+                    # This would be a good case for the `match` statement (Python 3.10+)
+                    is_mps = latent_model_input.device.type == "mps"
+                    if isinstance(current_timestep, float):
+                        dtype = torch.float32 if is_mps else torch.float64
+                    else:
+                        dtype = torch.int32 if is_mps else torch.int64
+                    current_timestep = torch.tensor(
+                        [current_timestep],
+                        dtype=dtype,
+                        device=latent_model_input.device,
+                    )
+                elif len(current_timestep.shape) == 0:
+                    current_timestep = current_timestep[None].to(
+                        latent_model_input.device
+                    )
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                current_timestep = current_timestep.expand(
+                    latent_model_input.shape[0]
+                ).unsqueeze(-1)
+
+                if conditioning_mask is not None:
+                    # Conditioning latents have an initial timestep and noising level of (1.0 - conditioning_mask)
+                    # and will start to be denoised when the current timestep is lower than their conditioning timestep.
+                    current_timestep = torch.min(
+                        current_timestep, 1.0 - conditioning_mask
+                    )
+
+                # Choose the appropriate context manager based on `mixed_precision`
+                if mixed_precision:
+                    context_manager = torch.autocast(device.type, dtype=self.transformer.dtype)
+                else:
+                    context_manager = nullcontext()  # Dummy context manager
+
+                # predict noise model_output
+                with context_manager:
+                    noise_pred = self.transformer(
+                        latent_model_input.to(self.transformer.dtype),
+                        freqs_cis=freqs_cis,
+                        encoder_hidden_states=prompt_embeds_batch.to(
+                            self.transformer.dtype
+                        ),
+                        encoder_attention_mask=prompt_attention_mask_batch,
+                        timestep=current_timestep,
+                        skip_layer_mask=(
+                            skip_layer_masks[i]
+                            if skip_layer_masks is not None
+                            else None
+                        ),
+                        skip_layer_strategy=skip_layer_strategy,
+                        latent_shape = latent_shape[2:],
+                        joint_pass = joint_pass,
+                        ltxv_model = ltxv_model,
+                        mixed = mixed_precision,
+                        return_dict=False,
+                    )[0]
+                # The transformer hands back None instead of a prediction; stop and propagate it.
+                if noise_pred is None:
+                    return None
+                # perform guidance
+                if do_spatio_temporal_guidance:
+                    noise_pred_text, noise_pred_text_perturb = noise_pred.chunk(
+                        num_conds
+                    )[-2:]
+                if do_classifier_free_guidance and guidance_scale[i] != 0 and guidance_scale[i] != 1:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(num_conds)[:2]
+                    if cfg_star_rescale:
+                        batch_size = noise_pred_text.shape[0]
+
+                        # Scale the unconditional prediction by its projection coefficient
+                        # onto the conditional one (per sample).
+                        positive_flat = noise_pred_text.view(batch_size, -1)
+                        negative_flat = noise_pred_uncond.view(batch_size, -1)
+                        dot_product = torch.sum(
+                            positive_flat * negative_flat, dim=1, keepdim=True
+                        )
+                        squared_norm = torch.sum(negative_flat**2, dim=1, keepdim=True) + 1e-8
+                        alpha = dot_product / squared_norm
+                        noise_pred_uncond = alpha * noise_pred_uncond
+
+
+                    noise_pred = noise_pred_uncond + guidance_scale[i] * (
+                        noise_pred_text - noise_pred_uncond
+                    )
+                elif do_spatio_temporal_guidance:
+                    noise_pred = noise_pred_text
+                if do_spatio_temporal_guidance:
+                    noise_pred = noise_pred + stg_scale[i] * (
+                        noise_pred_text - noise_pred_text_perturb
+                    )
+                    if do_rescaling and stg_scale[i] > 0.0:
+                        noise_pred_text_std = noise_pred_text.view(batch_size, -1).std(
+                            dim=1, keepdim=True
+                        )
+                        noise_pred_std = noise_pred.view(batch_size, -1).std(
+                            dim=1, keepdim=True
+                        )
+
+                        factor = noise_pred_text_std / noise_pred_std
+                        factor = rescaling_scale[i] * factor + (1 - rescaling_scale[i])
+
+                        noise_pred = noise_pred * factor.view(batch_size, 1, 1)
+
+                current_timestep = current_timestep[:1]
+                # learned sigma
+                if (
+                    self.transformer.config.out_channels // 2
+                    == self.transformer.config.in_channels
+                ):
+                    noise_pred = noise_pred.chunk(2, dim=1)[0]
+
+                # compute previous image: x_t -> x_t-1
+                latents = self.denoising_step(
+                    latents,
+                    noise_pred,
+                    current_timestep,
+                    orig_conditioning_mask,
+                    t,
+                    extra_step_kwargs,
+                    stochastic_sampling=stochastic_sampling,
+                )
+
+                if callback is not None:
+                    # callback(i, None, False, pass_no =pass_no)
+                    preview_latents= latents.squeeze(0).transpose(0, 1)
+                    preview_latents= preview_latents.reshape(preview_latents.shape[0], latent_num_frames, latent_height, latent_width)
+                    callback(i, preview_latents, False, pass_no =pass_no)
+                    preview_latents = None
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or (
+                    (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
+                ):
+                    progress_bar.update()
+
+                if callback_on_step_end is not None:
+                    callback_on_step_end(self, i, t, {})
+
+
+        # Remove the added conditioning latents
+        latents = latents[:, num_cond_latents:]
+
+        latents = self.patchifier.unpatchify(
+            latents=latents,
+            output_height=latent_height,
+            output_width=latent_width,
+            out_channels=self.transformer.in_channels
+            // math.prod(self.patchifier.patch_size),
+        )
+        if output_type != "latent":
+            if self.vae.decoder.timestep_conditioning:
+                noise = torch.randn_like(latents)
+                if not isinstance(decode_timestep, list):
+                    decode_timestep = [decode_timestep] * latents.shape[0]
+                if decode_noise_scale is None:
+                    decode_noise_scale = decode_timestep
+                elif not isinstance(decode_noise_scale, list):
+                    decode_noise_scale = [decode_noise_scale] * latents.shape[0]
+
+                decode_timestep = torch.tensor(decode_timestep).to(latents.device)
+                decode_noise_scale = torch.tensor(decode_noise_scale).to(
+                    latents.device
+                )[:, None, None, None, None]
+                latents = (
+                    latents * (1 - decode_noise_scale) + noise * decode_noise_scale
+                )
+            else:
+                decode_timestep = None
+            # Reuse the defaulted local: indexing `kwargs` directly raised KeyError
+            # whenever the caller did not pass `vae_per_channel_normalize`.
+            image = vae_decode(
+                latents,
+                self.vae,
+                is_video,
+                vae_per_channel_normalize=vae_per_channel_normalize,
+                timestep=decode_timestep,
+            )
+
+            image = self.image_processor.postprocess(image, output_type=output_type)
+
+        else:
+            image = latents
+
+
+        if not return_dict:
+            return (image,)
+
+        return image
+
+    def denoising_step(
+        self,
+        latents: torch.Tensor,
+        noise_pred: torch.Tensor,
+        current_timestep: torch.Tensor,
+        conditioning_mask: torch.Tensor,
+        t: float,
+        extra_step_kwargs,
+        t_eps=1e-6,
+        stochastic_sampling=False,
+    ):
+        """
+        Run one scheduler step and apply it only to the tokens that are due for denoising.
+
+        Conditioning latents have an initial timestep and noising level of (1.0 - conditioning_mask)
+        and start to be denoised once the current timestep is equal to or lower than that value
+        (within `t_eps`). Hard-conditioning latents (conditioning_mask = 1.0) are never denoised.
+        When `conditioning_mask` is `None`, the scheduler output is returned for every token.
+        """
+        # Fall back to the scalar timestep when no per-token timestep tensor is given.
+        step_timestep = current_timestep if current_timestep is not None else t
+        step_output = self.scheduler.step(
+            noise_pred,
+            step_timestep,
+            latents,
+            **extra_step_kwargs,
+            return_dict=False,
+            stochastic_sampling=stochastic_sampling,
+        )
+        denoised = step_output[0]
+
+        if conditioning_mask is None:
+            return denoised
+
+        # Tokens whose conditioning timestep has not been reached yet keep their current value.
+        due_for_denoising = (t - t_eps < (1.0 - conditioning_mask)).unsqueeze(-1)
+        return torch.where(due_for_denoising, denoised, latents)
+
+ def prepare_conditioning(
+ self,
+ conditioning_items: Optional[List[ConditioningItem]],
+ init_latents: torch.Tensor,
+ num_frames: int,
+ height: int,
+ width: int,
+ vae_per_channel_normalize: bool = False,
+ generator=None,
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int]:
+ """
+ Prepare conditioning tokens based on the provided conditioning items.
+
+ This method encodes provided conditioning items (video frames or single frames) into latents
+ and integrates them with the initial latent tensor. It also calculates corresponding pixel
+ coordinates, a mask indicating the influence of conditioning latents, and the total number of
+ conditioning latents.
+
+ Args:
+ conditioning_items (Optional[List[ConditioningItem]]): A list of ConditioningItem objects.
+ init_latents (torch.Tensor): The initial latent tensor of shape (b, c, f_l, h_l, w_l), where
+ `f_l` is the number of latent frames, and `h_l` and `w_l` are latent spatial dimensions.
+ num_frames, height, width: The dimensions of the generated video.
+ vae_per_channel_normalize (bool, optional): Whether to normalize channels during VAE encoding.
+ Defaults to `False`.
+ generator: The random generator
+
+ Returns:
+ Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int]:
+ - `init_latents` (torch.Tensor): The updated latent tensor including conditioning latents,
+ patchified into (b, n, c) shape.
+ - `init_pixel_coords` (torch.Tensor): The pixel coordinates corresponding to the updated
+ latent tensor.
+ - `conditioning_mask` (torch.Tensor): A mask indicating the conditioning-strength of each
+ latent token.
+ - `num_cond_latents` (int): The total number of latent tokens added from conditioning items.
+
+ Raises:
+ AssertionError: If input shapes, dimensions, or conditions for applying conditioning are invalid.
+ """
+ assert isinstance(self.vae, CausalVideoAutoencoder)
+
+ if conditioning_items:
+ batch_size, _, num_latent_frames = init_latents.shape[:3]
+
+ init_conditioning_mask = torch.zeros(
+ init_latents[:, 0, :, :, :].shape,
+ dtype=torch.float32,
+ device=init_latents.device,
+ )
+
+ extra_conditioning_latents = []
+ extra_conditioning_pixel_coords = []
+ extra_conditioning_mask = []
+ extra_conditioning_num_latents = 0 # Number of extra conditioning latents added (should be removed before decoding)
+
+ # Process each conditioning item
+ for conditioning_item in conditioning_items:
+ conditioning_item = self._resize_conditioning_item(
+ conditioning_item, height, width
+ )
+ media_item = conditioning_item.media_item
+ media_frame_number = conditioning_item.media_frame_number
+ strength = conditioning_item.conditioning_strength
+ assert media_item.ndim == 5 # (b, c, f, h, w)
+ b, c, n_frames, h, w = media_item.shape
+ assert (
+ height == h and width == w
+ ) or media_frame_number == 0, f"Dimensions do not match: {height}x{width} != {h}x{w} - allowed only when media_frame_number == 0"
+ assert n_frames % 8 == 1
+ assert (
+ media_frame_number >= 0
+ and media_frame_number + n_frames <= num_frames
+ )
+
+ # Encode the provided conditioning media item
+ media_item_latents = vae_encode(
+ media_item.to(dtype=self.vae.dtype, device=self.vae.device),
+ self.vae,
+ vae_per_channel_normalize=vae_per_channel_normalize,
+ ).to(dtype=init_latents.dtype)
+
+ # Handle the different conditioning cases
+ if media_frame_number == 0:
+ # Get the target spatial position of the latent conditioning item
+ media_item_latents, l_x, l_y = self._get_latent_spatial_position(
+ media_item_latents,
+ conditioning_item,
+ height,
+ width,
+ strip_latent_border=True,
+ )
+ b, c_l, f_l, h_l, w_l = media_item_latents.shape
+
+ # First frame or sequence - just update the initial noise latents and the mask
+ init_latents[:, :, :f_l, l_y : l_y + h_l, l_x : l_x + w_l] = (
+ torch.lerp(
+ init_latents[:, :, :f_l, l_y : l_y + h_l, l_x : l_x + w_l],
+ media_item_latents,
+ strength,
+ )
+ )
+ init_conditioning_mask[
+ :, :f_l, l_y : l_y + h_l, l_x : l_x + w_l
+ ] = strength
+ else:
+ # Non-first frame or sequence
+ if n_frames > 1:
+ # Handle non-first sequence.
+ # Encoded latents are either fully consumed, or the prefix is handled separately below.
+ (
+ init_latents,
+ init_conditioning_mask,
+ media_item_latents,
+ ) = self._handle_non_first_conditioning_sequence(
+ init_latents,
+ init_conditioning_mask,
+ media_item_latents,
+ media_frame_number,
+ strength,
+ )
+
+ # Single frame or sequence-prefix latents
+ if media_item_latents is not None:
+ noise = randn_tensor(
+ media_item_latents.shape,
+ generator=generator,
+ device=media_item_latents.device,
+ dtype=media_item_latents.dtype,
+ )
+
+ media_item_latents = torch.lerp(
+ noise, media_item_latents, strength
+ )
+
+ # Patchify the extra conditioning latents and calculate their pixel coordinates
+ media_item_latents, latent_coords = self.patchifier.patchify(
+ latents=media_item_latents
+ )
+ pixel_coords = latent_to_pixel_coords(
+ latent_coords,
+ self.vae,
+ causal_fix=self.transformer.config.causal_temporal_positioning,
+ )
+
+ # Update the frame numbers to match the target frame number
+ pixel_coords[:, 0] += media_frame_number
+ extra_conditioning_num_latents += media_item_latents.shape[1]
+
+ conditioning_mask = torch.full(
+ media_item_latents.shape[:2],
+ strength,
+ dtype=torch.float32,
+ device=init_latents.device,
+ )
+
+ extra_conditioning_latents.append(media_item_latents)
+ extra_conditioning_pixel_coords.append(pixel_coords)
+ extra_conditioning_mask.append(conditioning_mask)
+
+ # Patchify the updated latents and calculate their pixel coordinates
+ init_latents, init_latent_coords = self.patchifier.patchify(
+ latents=init_latents
+ )
+ init_pixel_coords = latent_to_pixel_coords(
+ init_latent_coords,
+ self.vae,
+ causal_fix=self.transformer.config.causal_temporal_positioning,
+ )
+
+ if not conditioning_items:
+ return init_latents, init_pixel_coords, None, 0
+
+ init_conditioning_mask, _ = self.patchifier.patchify(
+ latents=init_conditioning_mask.unsqueeze(1)
+ )
+ init_conditioning_mask = init_conditioning_mask.squeeze(-1)
+
+ if extra_conditioning_latents:
+ # Stack the extra conditioning latents, pixel coordinates and mask
+ init_latents = torch.cat([*extra_conditioning_latents, init_latents], dim=1)
+ init_pixel_coords = torch.cat(
+ [*extra_conditioning_pixel_coords, init_pixel_coords], dim=2
+ )
+ init_conditioning_mask = torch.cat(
+ [*extra_conditioning_mask, init_conditioning_mask], dim=1
+ )
+
+ if self.transformer.use_tpu_flash_attention:
+ # When flash attention is used, keep the original number of tokens by removing
+ # tokens from the end.
+ init_latents = init_latents[:, :-extra_conditioning_num_latents]
+ init_pixel_coords = init_pixel_coords[
+ :, :, :-extra_conditioning_num_latents
+ ]
+ init_conditioning_mask = init_conditioning_mask[
+ :, :-extra_conditioning_num_latents
+ ]
+
+ return (
+ init_latents,
+ init_pixel_coords,
+ init_conditioning_mask,
+ extra_conditioning_num_latents,
+ )
+
+    @staticmethod
+    def _resize_conditioning_item(
+        conditioning_item: ConditioningItem,
+        height: int,
+        width: int,
+    ):
+        """Return a shallow copy of ``conditioning_item`` with resized media.
+
+        Args:
+            conditioning_item: Item whose ``media_item`` is resized. It is not
+                modified; a ``copy.copy`` of it is returned instead.
+            height: Height passed to ``LTXVideoPipeline.resize_tensor``.
+            width: Width passed to ``LTXVideoPipeline.resize_tensor``.
+
+        Raises:
+            ValueError: If the item carries a truthy ``media_x`` or ``media_y``;
+                explicitly positioned items must already have the target size.
+        """
+        has_explicit_position = (
+            conditioning_item.media_x or conditioning_item.media_y
+        )
+        if has_explicit_position:
+            raise ValueError(
+                "Provide media_item in the target size for spatial conditioning."
+            )
+
+        resized_item = copy.copy(conditioning_item)
+        resized_media = LTXVideoPipeline.resize_tensor(
+            conditioning_item.media_item, height, width
+        )
+        resized_item.media_item = resized_media
+        return resized_item
+
+    def _get_latent_spatial_position(
+        self,
+        latents: torch.Tensor,
+        conditioning_item: ConditioningItem,
+        height: int,
+        width: int,
+        strip_latent_border,
+    ):
+        """
+        Locate a conditioning item inside the target frame, in latent units.
+
+        The pixel position comes from ``media_x`` / ``media_y``; a ``None``
+        coordinate centers the item along that axis. When
+        ``strip_latent_border`` is truthy, one latent row/column is removed on
+        every side of the item that does not touch the target border, and the
+        start position is moved inward by one latent on the left/top
+        (border latents look different from other latents and might confuse
+        the model).
+
+        Returns:
+            A tuple ``(latents, x, y)`` with the (possibly cropped) latents and
+            the latent-space column/row where they should be placed.
+        """
+        scale = self.vae_scale_factor
+        item_h, item_w = conditioning_item.media_item.shape[-2:]
+        assert not (
+            item_h > height or item_w > width
+        ), f"Conditioning item size {item_h}x{item_w} is larger than target size {height}x{width}"
+        assert item_h % scale == 0 and item_w % scale == 0
+
+        # Resolve the pixel-space rectangle covered by the media item.
+        left = conditioning_item.media_x
+        if left is None:
+            left = (width - item_w) // 2
+        top = conditioning_item.media_y
+        if top is None:
+            top = (height - item_h) // 2
+        right = left + item_w
+        bottom = top + item_h
+        assert (
+            right <= width and bottom <= height
+        ), f"Conditioning item {left}:{right}x{top}:{bottom} is out of bounds for target size {width}x{height}"
+
+        if strip_latent_border:
+            # Decide every trim from the original rectangle before shifting it.
+            trim_left = left > 0
+            trim_top = top > 0
+            trim_right = right < width
+            trim_bottom = bottom < height
+
+            if trim_left:
+                left += scale
+            if trim_top:
+                top += scale
+
+            if trim_left or trim_top or trim_right or trim_bottom:
+                rows = slice(1 if trim_top else None, -1 if trim_bottom else None)
+                cols = slice(1 if trim_left else None, -1 if trim_right else None)
+                latents = latents[:, :, :, rows, cols]
+
+        return latents, left // scale, top // scale
+
+    @staticmethod
+    def _handle_non_first_conditioning_sequence(
+        init_latents: torch.Tensor,
+        init_conditioning_mask: torch.Tensor,
+        latents: torch.Tensor,
+        media_frame_number: int,
+        strength: float,
+        num_prefix_latent_frames: int = 2,
+        prefix_latents_mode: str = "concat",
+        prefix_soft_conditioning_strength: float = 0.15,
+    ):
+        """
+        Special handling for a conditioning sequence that does not start on the first frame.
+        The special handling is required to allow a short encoded video to be used as middle
+        (or last) sequence in a longer video.
+
+        ``init_latents`` and ``init_conditioning_mask`` are updated in place and also returned.
+
+        Args:
+            init_latents (torch.Tensor): The initial noise latents to be updated.
+            init_conditioning_mask (torch.Tensor): The initial conditioning mask to be updated.
+            latents (torch.Tensor): The encoded conditioning item.
+            media_frame_number (int): The target frame number of the first frame in the conditioning
+                sequence. Must be a multiple of 8.
+            strength (float): The conditioning strength for the conditioning latents.
+            num_prefix_latent_frames (int, optional): The length of the sequence prefix, to be handled
+                separately. Defaults to 2.
+            prefix_latents_mode (str, optional): Special treatment for prefix (boundary) latents.
+                - "drop": Drop the prefix latents.
+                - "soft": Use the prefix latents, but with soft-conditioning
+                - "concat": Add the prefix latents as extra tokens (like single frames)
+                Defaults to "concat".
+            prefix_soft_conditioning_strength (float, optional): The strength of the soft-conditioning for
+                the prefix latents, relevant if `prefix_latents_mode` is "soft". Defaults to 0.15.
+
+        Returns:
+            tuple: ``(init_latents, init_conditioning_mask, latents)`` where ``latents`` is the prefix
+            still to be handled by the caller ("concat" mode) or ``None`` ("soft" / "drop" modes).
+
+        Raises:
+            ValueError: If ``prefix_latents_mode`` is not one of "soft", "drop" or "concat".
+        """
+        f_l = latents.shape[2]
+        f_l_p = num_prefix_latent_frames
+        assert f_l >= f_l_p
+        assert media_frame_number % 8 == 0
+
+        # Reject an unknown mode before touching the tensors, so a bad argument
+        # cannot leave `init_latents` / `init_conditioning_mask` half-updated.
+        if prefix_latents_mode not in ("soft", "drop", "concat"):
+            raise ValueError(f"Invalid prefix_latents_mode: {prefix_latents_mode}")
+
+        if f_l > f_l_p:
+            # Insert the conditioning latents **excluding the prefix** into the sequence
+            f_l_start = media_frame_number // 8 + f_l_p
+            f_l_end = f_l_start + f_l - f_l_p
+            init_latents[:, :, f_l_start:f_l_end] = torch.lerp(
+                init_latents[:, :, f_l_start:f_l_end],
+                latents[:, :, f_l_p:],
+                strength,
+            )
+            # Mark these latent frames as conditioning latents
+            init_conditioning_mask[:, f_l_start:f_l_end] = strength
+
+        # Handle the prefix-latents
+        if prefix_latents_mode == "soft":
+            if f_l_p > 1:
+                # Drop the first (single-frame) latent and soft-condition the remaining prefix
+                f_l_start = media_frame_number // 8 + 1
+                f_l_end = f_l_start + f_l_p - 1
+                # The prefix is never conditioned more strongly than the rest of the sequence
+                strength = min(prefix_soft_conditioning_strength, strength)
+                init_latents[:, :, f_l_start:f_l_end] = torch.lerp(
+                    init_latents[:, :, f_l_start:f_l_end],
+                    latents[:, :, 1:f_l_p],
+                    strength,
+                )
+                # Mark these latent frames as conditioning latents
+                init_conditioning_mask[:, f_l_start:f_l_end] = strength
+            latents = None  # No more latents to handle
+        elif prefix_latents_mode == "drop":
+            # Drop the prefix latents
+            latents = None
+        else:
+            # "concat": pass-on the prefix latents to be handled as extra conditioning frames
+            latents = latents[:, :, :f_l_p]
+
+        return (
+            init_latents,
+            init_conditioning_mask,
+            latents,
+        )
+
+    def trim_conditioning_sequence(
+        self, start_frame: int, sequence_num_frames: int, target_num_frames: int
+    ):
+        """
+        Clamp a conditioning sequence to a length the generated video can hold.
+
+        The length is first capped by the frames remaining after ``start_frame``
+        and then rounded down to ``k * self.video_scale_factor + 1``.
+
+        Args:
+            start_frame (int): Target frame number of the sequence's first frame.
+            sequence_num_frames (int): Number of frames available in the sequence.
+            target_num_frames (int): Number of frames in the generated video.
+
+        Returns:
+            int: The trimmed sequence length.
+        """
+        stride = self.video_scale_factor
+        frames_left = target_num_frames - start_frame
+        usable = min(sequence_num_frames, frames_left)
+        # Keep a whole number of temporal groups, plus the leading frame.
+        whole_groups = (usable - 1) // stride
+        return whole_groups * stride + 1
+
+def adain_filter_latent(
+    latents: torch.Tensor, reference_latents: torch.Tensor, factor=1.0
+):
+    """
+    Applies Adaptive Instance Normalization (AdaIN) to a latent tensor based on
+    statistics from a reference latent tensor.
+
+    For every (batch, channel) pair the input is standardized with its own
+    mean / standard deviation and then re-scaled to the mean / standard
+    deviation of the matching reference channel. The statistics are taken over
+    all remaining dimensions, so the reference may have different trailing
+    sizes than the input.
+
+    Args:
+        latents (torch.Tensor): Input latents to normalize, indexed as
+            (batch, channel, ...).
+        reference_latents (torch.Tensor): The reference latents providing style
+            statistics. Must have at least as many batch entries and channels
+            as `latents`; extra ones are ignored.
+        factor (float): Blending factor between original and transformed latent
+            (0.0 returns the input, 1.0 the fully transformed latent).
+            Default: 1.0
+
+    Returns:
+        torch.Tensor: The transformed latent tensor (a new tensor; the input is
+        not modified).
+
+    Raises:
+        IndexError: If `reference_latents` has fewer batch entries or channels
+            than `latents`.
+
+    Note:
+        A channel with zero standard deviation yields non-finite values, since
+        the input is divided by its own standard deviation without a guard.
+    """
+    batch, channels = latents.shape[0], latents.shape[1]
+    if reference_latents.shape[0] < batch or reference_latents.shape[1] < channels:
+        raise IndexError(
+            "reference_latents must cover every (batch, channel) of latents"
+        )
+
+    def _channel_stats(t: torch.Tensor):
+        # Per-(batch, channel) std/mean, shaped to broadcast against `latents`.
+        flat = t.reshape(t.shape[0], t.shape[1], -1)
+        sd, mean = torch.std_mean(flat, dim=2)
+        bshape = (t.shape[0], t.shape[1]) + (1,) * (latents.dim() - 2)
+        return sd.reshape(bshape), mean.reshape(bshape)
+
+    r_sd, r_mean = _channel_stats(reference_latents[:batch, :channels])
+    i_sd, i_mean = _channel_stats(latents)
+
+    # One vectorized expression replaces the per-channel Python loop.
+    result = ((latents - i_mean) / i_sd) * r_sd + r_mean
+    # Keep the input dtype so the blend below sees matching operands.
+    result = result.to(latents.dtype)
+
+    return torch.lerp(latents, result, factor)
+
+
+
+class LTXMultiScalePipeline:
+    """Two-pass generation: low-resolution pass, latent upsampling, refinement pass.
+
+    The wrapped ``video_pipeline`` is first run at a down-scaled resolution with
+    ``output_type="latent"``. Its latents are upsampled with ``latent_upsampler``,
+    re-normalized against the first-pass latents with AdaIN, and used as the
+    starting latents of a second run of ``video_pipeline``.
+    """
+
+    def __init__(
+        self, video_pipeline: LTXVideoPipeline, latent_upsampler: LatentUpsampler
+    ):
+        self.video_pipeline = video_pipeline
+        self.vae = video_pipeline.vae
+        self.latent_upsampler = latent_upsampler
+
+    @staticmethod
+    def batch_normalize(latents, reference, factor=0.25):
+        """Blend ``latents`` towards per-channel statistics of ``reference``.
+
+        Same operation as the module-level ``adain_filter_latent`` (only the
+        default ``factor`` differs), so it delegates to it instead of keeping a
+        second copy of the loop.
+        """
+        return adain_filter_latent(
+            latents=latents, reference_latents=reference, factor=factor
+        )
+
+    def _upsample_latents(
+        self, latest_upsampler: LatentUpsampler, latents: torch.Tensor
+    ):
+        """Upsample normalized latents and return them normalized again.
+
+        The upsampler is applied to un-normalized latents, so the input is
+        un-normalized first and the output is normalized afterwards (both with
+        per-channel VAE statistics).
+        """
+        latents = un_normalize_latents(
+            latents, self.vae, vae_per_channel_normalize=True
+        )
+        upsampled_latents = latest_upsampler(latents)
+        upsampled_latents = normalize_latents(
+            upsampled_latents, self.vae, vae_per_channel_normalize=True
+        )
+        return upsampled_latents
+
+    def __call__(
+        self,
+        downscale_factor: float,
+        first_pass: dict,
+        second_pass: dict,
+        *args: Any,
+        **kwargs: Any,
+    ) -> Any:
+        """Run the two-pass pipeline.
+
+        Args:
+            downscale_factor: Factor applied to ``kwargs["width"]`` /
+                ``kwargs["height"]`` to get the first-pass resolution (rounded
+                down to a multiple of the pipeline's ``vae_scale_factor``).
+            first_pass: Keyword overrides merged into ``kwargs`` for pass 1.
+            second_pass: Keyword overrides merged into ``kwargs`` for pass 2.
+            *args: Forwarded unchanged to both ``video_pipeline`` calls.
+            **kwargs: Pipeline arguments. The keys read here are
+                ``output_type``, ``width``, ``height``, ``VAE_tile_size``,
+                ``ltxv_model``, ``prompt``, ``negative_prompt``, ``device``,
+                ``num_inference_steps1`` and ``num_inference_steps2``.
+
+        Returns:
+            The second-pass result, resized to the requested
+            ``height`` x ``width`` unless ``output_type`` is ``"latent"``;
+            ``None`` if generation was interrupted or a pass returned ``None``.
+        """
+        video_pipeline = self.video_pipeline
+
+        original_output_type = kwargs["output_type"]
+        original_width = kwargs["width"]
+        original_height = kwargs["height"]
+
+        # First-pass resolution: scaled down, then snapped to the VAE grid.
+        vae_scale = video_pipeline.vae_scale_factor
+        x_width = int(original_width * downscale_factor)
+        downscaled_width = x_width - (x_width % vae_scale)
+        x_height = int(original_height * downscale_factor)
+        downscaled_height = x_height - (x_height % vae_scale)
+
+        kwargs["output_type"] = "latent"
+        kwargs["width"] = downscaled_width
+        kwargs["height"] = downscaled_height
+
+        # A tile size of 0 (or less) leaves the corresponding tiling untouched.
+        z_tile, hw_tile = kwargs["VAE_tile_size"]
+        if z_tile > 0:
+            self.vae.enable_z_tiling(z_tile)
+        if hw_tile > 0:
+            self.vae.enable_hw_tiling()
+            self.vae.set_tiling_params(hw_tile)
+
+        ltxv_model = kwargs["ltxv_model"]
+        text_encoder_max_tokens = 256
+        prompt = kwargs.pop("prompt")
+        negative_prompt = kwargs.pop("negative_prompt")
+        # NOTE: prompt enhancement was hard-disabled here (`if False and ...`);
+        # the unreachable branch has been removed.
+
+        # Encode the prompt once and share the embeddings between both passes.
+        (
+            prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_embeds,
+            negative_prompt_attention_mask,
+        ) = video_pipeline.encode_prompt(
+            prompt,
+            True,
+            negative_prompt=negative_prompt,
+            device=kwargs["device"],
+            text_encoder_max_tokens=text_encoder_max_tokens,
+        )
+        if ltxv_model._interrupt:
+            return None
+
+        kwargs["prompt_embeds"] = prompt_embeds
+        kwargs["prompt_attention_mask"] = prompt_attention_mask
+        kwargs["negative_prompt_embeds"] = negative_prompt_embeds
+        kwargs["negative_prompt_attention_mask"] = negative_prompt_attention_mask
+
+        # Snapshot taken before the first-pass overrides; base of the second pass.
+        original_kwargs = kwargs.copy()
+
+        kwargs["joint_pass"] = True
+        kwargs["pass_no"] = 1
+        kwargs.update(**first_pass)
+        kwargs["num_inference_steps"] = kwargs["num_inference_steps1"]
+        result = video_pipeline(*args, **kwargs)
+        if result is None:
+            return None
+
+        latents = result
+
+        upsampled_latents = self._upsample_latents(self.latent_upsampler, latents)
+        # Match the upsampled latents' per-channel statistics to the first pass.
+        upsampled_latents = adain_filter_latent(
+            latents=upsampled_latents, reference_latents=latents
+        )
+
+        kwargs = original_kwargs
+        kwargs["latents"] = upsampled_latents
+        kwargs["output_type"] = original_output_type
+        # NOTE(review): assumes the latent upsampler doubles the spatial size - confirm.
+        kwargs["width"] = downscaled_width * 2
+        kwargs["height"] = downscaled_height * 2
+        kwargs["joint_pass"] = False
+        kwargs["pass_no"] = 2
+        kwargs.update(**second_pass)
+        kwargs["num_inference_steps"] = kwargs["num_inference_steps2"]
+
+        result = video_pipeline(*args, **kwargs)
+        if result is None:
+            return None
+
+        if original_output_type != "latent":
+            # Resize the decoded frames to the originally requested resolution.
+            num_frames = result.shape[2]
+            videos = rearrange(result, "b c f h w -> (b f) c h w")
+            videos = F.interpolate(
+                videos,
+                size=(original_height, original_width),
+                mode="bilinear",
+                align_corners=False,
+            )
+            result = rearrange(videos, "(b f) c h w -> b c f h w", f=num_frames)
+
+        return result
diff --git a/ltx_video/schedulers/__init__.py b/ltx_video/schedulers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ltx_video/schedulers/rf.py b/ltx_video/schedulers/rf.py
new file mode 100644
index 0000000000000000000000000000000000000000..2cf99da8fbc88755761a348d7e27eb19e91bf6bb
--- /dev/null
+++ b/ltx_video/schedulers/rf.py
@@ -0,0 +1,392 @@
+import math
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Callable, Optional, Tuple, Union
+import json
+import os
+from pathlib import Path
+
+import torch
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.schedulers.scheduling_utils import SchedulerMixin
+from diffusers.utils import BaseOutput
+from torch import Tensor
+from safetensors import safe_open
+
+
+from ltx_video.utils.torch_utils import append_dims
+
+from ltx_video.utils.diffusers_config_mapping import (
+ diffusers_and_ours_config_mapping,
+ make_hashable_key,
+)
+
+
+def linear_quadratic_schedule(num_steps, threshold_noise=0.025, linear_steps=None):
+    """Build a descending sigma schedule with a linear, then quadratic, segment.
+
+    Args:
+        num_steps: Number of sigmas to return.
+        threshold_noise: Value at which the linear ramp hands over to the
+            quadratic segment (before the ``1 - x`` inversion).
+        linear_steps: Number of linear steps; ``None`` means ``num_steps // 2``.
+
+    Returns:
+        A 1-D tensor of ``num_steps`` values starting at 1.0.
+    """
+    if num_steps == 1:
+        return torch.tensor([1.0])
+
+    n_linear = num_steps // 2 if linear_steps is None else linear_steps
+
+    ramp = []
+    for i in range(n_linear):
+        ramp.append(i * threshold_noise / n_linear)
+
+    step_diff = n_linear - threshold_noise * num_steps
+    n_quadratic = num_steps - n_linear
+    quad_coef = step_diff / (n_linear * n_quadratic**2)
+    lin_coef = threshold_noise / n_linear - 2 * step_diff / (n_quadratic**2)
+    offset = quad_coef * (n_linear**2)
+    for i in range(n_linear, num_steps):
+        ramp.append(quad_coef * (i**2) + lin_coef * i + offset)
+
+    # The ramp rises from 0 towards 1; the returned sigmas are its complement.
+    return torch.tensor([1.0 - value for value in ramp])
+
+
+def simple_diffusion_resolution_dependent_timestep_shift(
+    samples_shape: torch.Size,
+    timesteps: Tensor,
+    n: int = 32 * 32,
+) -> Tensor:
+    """Shift timesteps according to the sample size relative to a base size.
+
+    Args:
+        samples_shape: ``(b, t, c)``, ``(b, c, h, w)`` or ``(b, c, f, h, w)``.
+            The size ``m`` is ``t`` for 3-D shapes and the product of all
+            dimensions after the channel one otherwise.
+        timesteps: Timesteps to shift.
+        n: Base size that ``m`` is compared against.
+
+    Returns:
+        The shifted timesteps, ``sigmoid(0.5 * (log(snr) + 2 * log(m / n)))``
+        with ``snr = (t / (1 - t)) ** 2``.
+
+    Raises:
+        ValueError: If ``samples_shape`` does not have 3, 4 or 5 dimensions.
+    """
+    rank = len(samples_shape)
+    if rank not in (3, 4, 5):
+        raise ValueError(
+            "Samples must have shape (b, t, c), (b, c, h, w) or (b, c, f, h, w)"
+        )
+    if rank == 3:
+        _, m, _ = samples_shape
+    else:
+        m = math.prod(samples_shape[2:])
+
+    snr = (timesteps / (1 - timesteps)) ** 2
+    log_snr_shifted = torch.log(snr) + 2 * math.log(m / n)
+    return torch.sigmoid(0.5 * log_snr_shifted)
+
+
+def time_shift(mu: float, sigma: float, t: Tensor):
+    """Return ``exp(mu) / (exp(mu) + (1 / t - 1) ** sigma)`` for timesteps ``t``."""
+    exp_mu = math.exp(mu)
+    inverse_odds = (1 / t - 1) ** sigma
+    return exp_mu / (exp_mu + inverse_odds)
+
+
+def get_normal_shift(
+    n_tokens: int,
+    min_tokens: int = 1024,
+    max_tokens: int = 4096,
+    min_shift: float = 0.95,
+    max_shift: float = 2.05,
+) -> float:
+    """Linearly map a token count to a shift value.
+
+    The mapping is the straight line through ``(min_tokens, min_shift)`` and
+    ``(max_tokens, max_shift)``; token counts outside that range extrapolate
+    along the same line (no clamping).
+
+    Args:
+        n_tokens: Number of tokens to compute the shift for.
+        min_tokens: Token count mapped to ``min_shift``.
+        max_tokens: Token count mapped to ``max_shift``.
+        min_shift: Shift at ``min_tokens``.
+        max_shift: Shift at ``max_tokens``.
+
+    Returns:
+        The shift value itself (a number, not a callable).
+    """
+    m = (max_shift - min_shift) / (max_tokens - min_tokens)
+    b = min_shift - m * min_tokens
+    return m * n_tokens + b
+
+
+def strech_shifts_to_terminal(shifts: Tensor, terminal=0.1):
+    """
+    Rescale sampled shifts so that the last sample equals ``terminal``.
+
+    The distance of every sample from 1 is scaled by one common factor, chosen
+    so that the final sample lands exactly on ``terminal``; a sample equal to 1
+    therefore stays at 1.
+
+    Parameters:
+    - shifts (Tensor): The samples of the function to be stretched (PyTorch Tensor).
+    - terminal (float): The desired terminal value (value at the last sample).
+
+    Returns:
+    - Tensor: The stretched shifts such that the final value equals `terminal`.
+
+    Raises:
+    - ValueError: If `shifts` is empty or `terminal` is not strictly between 0 and 1.
+    """
+    if shifts.numel() == 0:
+        raise ValueError("The 'shifts' tensor must not be empty.")
+    if not (terminal > 0 and terminal < 1):
+        raise ValueError("The terminal value must be between 0 and 1 (exclusive).")
+
+    distance_from_one = 1 - shifts
+    scale_factor = distance_from_one[-1] / (1 - terminal)
+    return 1 - (distance_from_one / scale_factor)
+
+
+def sd3_resolution_dependent_timestep_shift(
+    samples_shape: torch.Size,
+    timesteps: Tensor,
+    target_shift_terminal: Optional[float] = None,
+) -> Tensor:
+    """
+    Shifts the timestep schedule as a function of the generated resolution.
+
+    In the SD3 paper, the authors empirically determine how to shift the timesteps based on
+    the resolution of the target images. For more details: https://arxiv.org/pdf/2403.03206
+
+    In Flux they later propose a more dynamic resolution dependent timestep shift, see:
+    https://github.com/black-forest-labs/flux/blob/87f6fff727a377ea1c378af692afb41ae84cbe04/src/flux/sampling.py#L66
+
+    The shift amount comes from `get_normal_shift` applied to the sample size (the second
+    dimension for 3-D shapes, otherwise the product of all dimensions after the channel one)
+    and is applied with `time_shift`.
+
+    Args:
+        samples_shape (torch.Size): The samples batch shape (batch_size, tokens, channels),
+            (batch_size, channels, height, width) or (batch_size, channels, frame, height, width).
+        timesteps (Tensor): The timesteps to shift.
+        target_shift_terminal (float): Optional target terminal value; when given, the shifted
+            timesteps are stretched so that the last one equals it.
+
+    Returns:
+        Tensor: The shifted timesteps.
+
+    Raises:
+        ValueError: If `samples_shape` does not have 3, 4 or 5 dimensions.
+    """
+    rank = len(samples_shape)
+    if rank not in (3, 4, 5):
+        raise ValueError(
+            "Samples must have shape (b, t, c), (b, c, h, w) or (b, c, f, h, w)"
+        )
+    if rank == 3:
+        _, m, _ = samples_shape
+    else:
+        m = math.prod(samples_shape[2:])
+
+    shifted = time_shift(get_normal_shift(m), 1, timesteps)
+    if target_shift_terminal is None:
+        return shifted
+    # Stretch the shifts to the target terminal
+    return strech_shifts_to_terminal(shifted, target_shift_terminal)
+
+
+class TimestepShifter(ABC):
+    """Interface for objects that can shift a timestep schedule."""
+
+    @abstractmethod
+    def shift_timesteps(self, samples_shape: torch.Size, timesteps: Tensor) -> Tensor:
+        """Return ``timesteps`` shifted for samples of shape ``samples_shape``."""
+
+
+@dataclass
+class RectifiedFlowSchedulerOutput(BaseOutput):
+    """
+    Result container returned by the scheduler's `step` function.
+
+    Args:
+        prev_sample (`torch.FloatTensor`):
+            Computed sample (x_{t-1}) of the previous timestep. Feed it back as the next model input in
+            the denoising loop.
+        pred_original_sample (`torch.FloatTensor`, *optional*):
+            The predicted denoised sample (x_{0}) based on the model output from the current timestep.
+            Can be used to preview progress or for guidance. Defaults to `None`.
+    """
+
+    prev_sample: torch.FloatTensor
+    pred_original_sample: Optional[torch.FloatTensor] = None
+
+
+class RectifiedFlowScheduler(SchedulerMixin, ConfigMixin, TimestepShifter):
+    """Rectified-flow scheduler in which timesteps and sigmas are the same tensor.
+
+    The initial schedule is produced by the configured ``sampler``
+    ("Uniform", "LinearQuadratic" or "Constant") and can then be shifted
+    depending on the sample shape via ``shifting`` ("SD3" or
+    "SimpleDiffusion"; any other value leaves the timesteps unchanged).
+    """
+
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps=1000,
+        shifting: Optional[str] = None,
+        base_resolution: int = 32**2,
+        target_shift_terminal: Optional[float] = None,
+        sampler: Optional[str] = "Uniform",
+        shift: Optional[float] = None,
+    ):
+        super().__init__()
+        self.init_noise_sigma = 1.0
+        self.num_inference_steps = None
+        self.sampler = sampler
+        self.shifting = shifting
+        self.base_resolution = base_resolution
+        self.target_shift_terminal = target_shift_terminal
+        # Timesteps and sigmas share one tensor in this scheduler.
+        self.timesteps = self.sigmas = self.get_initial_timesteps(
+            num_train_timesteps, shift=shift
+        )
+        self.shift = shift
+
+    def get_initial_timesteps(
+        self, num_timesteps: int, shift: Optional[float] = None
+    ) -> Tensor:
+        """Return the unshifted schedule of ``num_timesteps`` values for ``self.sampler``.
+
+        Returns ``None`` when ``self.sampler`` is not one of the known samplers.
+        """
+        if self.sampler == "Uniform":
+            return torch.linspace(1, 1 / num_timesteps, num_timesteps)
+        elif self.sampler == "LinearQuadratic":
+            return linear_quadratic_schedule(num_timesteps)
+        elif self.sampler == "Constant":
+            assert (
+                shift is not None
+            ), "Shift must be provided for constant time shift sampler."
+            return time_shift(
+                shift, 1, torch.linspace(1, 1 / num_timesteps, num_timesteps)
+            )
+
+    def shift_timesteps(self, samples_shape: torch.Size, timesteps: Tensor) -> Tensor:
+        """Apply the configured resolution-dependent shift (if any) to ``timesteps``."""
+        if self.shifting == "SD3":
+            return sd3_resolution_dependent_timestep_shift(
+                samples_shape, timesteps, self.target_shift_terminal
+            )
+        elif self.shifting == "SimpleDiffusion":
+            return simple_diffusion_resolution_dependent_timestep_shift(
+                samples_shape, timesteps, self.base_resolution
+            )
+        return timesteps
+
+    def set_timesteps(
+        self,
+        num_inference_steps: Optional[int] = None,
+        samples_shape: Optional[torch.Size] = None,
+        timesteps: Optional[Tensor] = None,
+        device: Union[str, torch.device] = None,
+    ):
+        """
+        Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
+        If `timesteps` are provided, they will be used instead of the scheduled timesteps.
+
+        Args:
+            num_inference_steps (`int` *optional*): The number of diffusion steps used when generating samples.
+                Capped at `config.num_train_timesteps`.
+            samples_shape (`torch.Size` *optional*): The samples batch shape, used for shifting.
+            timesteps ('torch.Tensor' *optional*): Specific timesteps to use instead of scheduled timesteps.
+            device (`Union[str, torch.device]`, *optional*): The device to which the timesteps tensor will be moved.
+
+        Raises:
+            ValueError: If both or neither of `timesteps` and `num_inference_steps` are provided.
+        """
+        if timesteps is not None and num_inference_steps is not None:
+            raise ValueError(
+                "You cannot provide both `timesteps` and `num_inference_steps`."
+            )
+        if timesteps is None:
+            if num_inference_steps is None:
+                # Previously surfaced as an opaque TypeError from min(int, None).
+                raise ValueError(
+                    "You must provide either `timesteps` or `num_inference_steps`."
+                )
+            num_inference_steps = min(
+                self.config.num_train_timesteps, num_inference_steps
+            )
+            timesteps = self.get_initial_timesteps(
+                num_inference_steps, shift=self.shift
+            ).to(device)
+            timesteps = self.shift_timesteps(samples_shape, timesteps)
+        else:
+            timesteps = torch.Tensor(timesteps).to(device)
+            num_inference_steps = len(timesteps)
+        self.timesteps = timesteps
+        self.num_inference_steps = num_inference_steps
+        self.sigmas = self.timesteps
+
+    @staticmethod
+    def from_pretrained(pretrained_model_path: Union[str, os.PathLike]):
+        """Build a scheduler from a JSON config file.
+
+        Args:
+            pretrained_model_path: Path of a UTF-8 JSON file holding the
+                scheduler config.
+
+        Returns:
+            The scheduler created by ``RectifiedFlowScheduler.from_config``.
+
+        Note:
+            Code that used to follow the ``return`` (loading the config from a
+            safetensors file or a diffusers directory) was unreachable and has
+            been removed.
+        """
+        with open(pretrained_model_path, "r", encoding="utf-8") as reader:
+            config = json.load(reader)
+        return RectifiedFlowScheduler.from_config(config)
+
+    def scale_model_input(
+        self, sample: torch.FloatTensor, timestep: Optional[int] = None
+    ) -> torch.FloatTensor:
+        # pylint: disable=unused-argument
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.FloatTensor`): input sample
+            timestep (`int`, optional): current timestep
+
+        Returns:
+            `torch.FloatTensor`: scaled input sample
+        """
+        return sample
+
+    def step(
+        self,
+        model_output: torch.FloatTensor,
+        timestep: torch.FloatTensor,
+        sample: torch.FloatTensor,
+        return_dict: bool = True,
+        stochastic_sampling: Optional[bool] = False,
+        **kwargs,
+    ) -> Union[RectifiedFlowSchedulerOutput, Tuple]:
+        r"""
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+        z_{t_1} = z_t - \Delta_t * v
+        The method finds the next timestep that is lower than the input timestep(s) and denoises the latents
+        to that level. The input timestep(s) are not required to be one of the predefined timesteps.
+
+        Args:
+            model_output (`torch.FloatTensor`):
+                The direct output from learned diffusion model - the velocity,
+            timestep (`float`):
+                The current discrete timestep in the diffusion chain (global or per-token).
+            sample (`torch.FloatTensor`):
+                A current latent tokens to be de-noised.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a `RectifiedFlowSchedulerOutput` or a `tuple`.
+            stochastic_sampling (`bool`, *optional*, defaults to `False`):
+                Whether to use stochastic sampling for the sampling process.
+
+        Returns:
+            `RectifiedFlowSchedulerOutput` or `tuple`:
+            If return_dict is `True`, a `RectifiedFlowSchedulerOutput` is returned,
+            otherwise a tuple is returned where the first element is the sample tensor.
+
+        Raises:
+            ValueError: If `set_timesteps` has not been called yet.
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+        t_eps = 1e-6  # Small epsilon to avoid numerical issues in timestep values
+
+        # A trailing 0 guarantees that a lower timestep exists for the last step.
+        timesteps_padded = torch.cat(
+            [self.timesteps, torch.zeros(1, device=self.timesteps.device)]
+        )
+
+        # Find the next lower timestep(s) and compute the dt from the current timestep(s)
+        if timestep.ndim == 0:
+            # Global timestep case
+            lower_mask = timesteps_padded < timestep - t_eps
+            lower_timestep = timesteps_padded[lower_mask][0]  # Closest lower timestep
+            dt = timestep - lower_timestep
+
+        else:
+            # Per-token case
+            assert timestep.ndim == 2
+            lower_mask = timesteps_padded[:, None, None] < timestep[None] - t_eps
+            # Zero out the candidates that are not lower, then keep the largest one.
+            lower_timestep = lower_mask * timesteps_padded[:, None, None]
+            lower_timestep, _ = lower_timestep.max(dim=0)
+            dt = (timestep - lower_timestep)[..., None]
+
+        # Compute previous sample
+        if stochastic_sampling:
+            # Estimate x0, then re-noise it to the next (lower) noise level.
+            x0 = sample - timestep[..., None] * model_output
+            next_timestep = timestep[..., None] - dt
+            prev_sample = self.add_noise(x0, torch.randn_like(sample), next_timestep)
+        else:
+            prev_sample = sample - dt * model_output
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return RectifiedFlowSchedulerOutput(prev_sample=prev_sample)
+
+    def add_noise(
+        self,
+        original_samples: torch.FloatTensor,
+        noise: torch.FloatTensor,
+        timesteps: torch.FloatTensor,
+    ) -> torch.FloatTensor:
+        """Return ``(1 - sigma) * original_samples + sigma * noise`` with ``sigma = timesteps``."""
+        sigmas = timesteps
+        sigmas = append_dims(sigmas, original_samples.ndim)
+        alphas = 1 - sigmas
+        noisy_samples = alphas * original_samples + sigmas * noise
+        return noisy_samples
diff --git a/ltx_video/utils/__init__.py b/ltx_video/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ltx_video/utils/diffusers_config_mapping.py b/ltx_video/utils/diffusers_config_mapping.py
new file mode 100644
index 0000000000000000000000000000000000000000..53c0082d182617f6f84eab9c849f7ef0224becb8
--- /dev/null
+++ b/ltx_video/utils/diffusers_config_mapping.py
@@ -0,0 +1,174 @@
+def make_hashable_key(dict_key):
+    """Convert a (possibly nested) config dict into a hashable, order-independent key.
+
+    Dicts become tuples of ``(key, value)`` pairs sorted by key and lists
+    become tuples. The conversion is applied recursively, including to the
+    elements of lists, so nested lists and dicts inside lists are handled
+    too. Other values are kept as they are.
+
+    Args:
+        dict_key: The dict to convert.
+
+    Returns:
+        A tuple usable as a dictionary key as long as every leaf value is
+        hashable. Two dicts with equal content produce equal keys regardless
+        of insertion order.
+    """
+
+    def convert_value(value):
+        if isinstance(value, list):
+            # Recurse into the elements: a tuple holding a list is not hashable.
+            return tuple(convert_value(item) for item in value)
+        elif isinstance(value, dict):
+            return tuple(sorted((k, convert_value(v)) for k, v in value.items()))
+        else:
+            return value
+
+    return tuple(sorted((k, convert_value(v)) for k, v in dict_key.items()))
+
+
+# Reference configs in diffusers format. Each one is hashed with
+# make_hashable_key and used as a lookup key in diffusers_and_ours_config_mapping,
+# so a loaded config only matches when it is equal to one of these key for key.
+
+# Scheduler config whose "_class_name" is FlowMatchEulerDiscreteScheduler.
+DIFFUSERS_SCHEDULER_CONFIG = {
+ "_class_name": "FlowMatchEulerDiscreteScheduler",
+ "_diffusers_version": "0.32.0.dev0",
+ "base_image_seq_len": 1024,
+ "base_shift": 0.95,
+ "invert_sigmas": False,
+ "max_image_seq_len": 4096,
+ "max_shift": 2.05,
+ "num_train_timesteps": 1000,
+ "shift": 1.0,
+ "shift_terminal": 0.1,
+ "use_beta_sigmas": False,
+ "use_dynamic_shifting": True,
+ "use_exponential_sigmas": False,
+ "use_karras_sigmas": False,
+}
+# Transformer config whose "_class_name" is LTXVideoTransformer3DModel.
+DIFFUSERS_TRANSFORMER_CONFIG = {
+ "_class_name": "LTXVideoTransformer3DModel",
+ "_diffusers_version": "0.32.0.dev0",
+ "activation_fn": "gelu-approximate",
+ "attention_bias": True,
+ "attention_head_dim": 64,
+ "attention_out_bias": True,
+ "caption_channels": 4096,
+ "cross_attention_dim": 2048,
+ "in_channels": 128,
+ "norm_elementwise_affine": False,
+ "norm_eps": 1e-06,
+ "num_attention_heads": 32,
+ "num_layers": 28,
+ "out_channels": 128,
+ "patch_size": 1,
+ "patch_size_t": 1,
+ "qk_norm": "rms_norm_across_heads",
+}
+# VAE config whose "_class_name" is AutoencoderKLLTXVideo.
+DIFFUSERS_VAE_CONFIG = {
+ "_class_name": "AutoencoderKLLTXVideo",
+ "_diffusers_version": "0.32.0.dev0",
+ "block_out_channels": [128, 256, 512, 512],
+ "decoder_causal": False,
+ "encoder_causal": True,
+ "in_channels": 3,
+ "latent_channels": 128,
+ "layers_per_block": [4, 3, 3, 3, 4],
+ "out_channels": 3,
+ "patch_size": 4,
+ "patch_size_t": 1,
+ "resnet_norm_eps": 1e-06,
+ "scaling_factor": 1.0,
+ "spatio_temporal_scaling": [True, True, True, False],
+}
+
+# Native-format configs that the diffusers-format configs above are translated
+# to (they are the values of diffusers_and_ours_config_mapping).
+
+# Scheduler config for RectifiedFlowScheduler, using "SD3" shifting.
+OURS_SCHEDULER_CONFIG = {
+ "_class_name": "RectifiedFlowScheduler",
+ "_diffusers_version": "0.25.1",
+ "num_train_timesteps": 1000,
+ "shifting": "SD3",
+ "base_resolution": None,
+ "target_shift_terminal": 0.1,
+}
+
+# Transformer config whose "_class_name" is Transformer3DModel.
+OURS_TRANSFORMER_CONFIG = {
+ "_class_name": "Transformer3DModel",
+ "_diffusers_version": "0.25.1",
+ "_name_or_path": "PixArt-alpha/PixArt-XL-2-256x256",
+ "activation_fn": "gelu-approximate",
+ "attention_bias": True,
+ "attention_head_dim": 64,
+ "attention_type": "default",
+ "caption_channels": 4096,
+ "cross_attention_dim": 2048,
+ "double_self_attention": False,
+ "dropout": 0.0,
+ "in_channels": 128,
+ "norm_elementwise_affine": False,
+ "norm_eps": 1e-06,
+ "norm_num_groups": 32,
+ "num_attention_heads": 32,
+ "num_embeds_ada_norm": 1000,
+ "num_layers": 28,
+ "num_vector_embeds": None,
+ "only_cross_attention": False,
+ "out_channels": 128,
+ "project_to_2d_pos": True,
+ "upcast_attention": False,
+ "use_linear_projection": False,
+ "qk_norm": "rms_norm",
+ "standardization_norm": "rms_norm",
+ "positional_embedding_type": "rope",
+ "positional_embedding_theta": 10000.0,
+ "positional_embedding_max_pos": [20, 2048, 2048],
+ "timestep_scale_multiplier": 1000,
+}
+# VAE config whose "_class_name" is CausalVideoAutoencoder.
+# "blocks" is a list of [block_name, value] pairs.
+OURS_VAE_CONFIG = {
+ "_class_name": "CausalVideoAutoencoder",
+ "dims": 3,
+ "in_channels": 3,
+ "out_channels": 3,
+ "latent_channels": 128,
+ "blocks": [
+ ["res_x", 4],
+ ["compress_all", 1],
+ ["res_x_y", 1],
+ ["res_x", 3],
+ ["compress_all", 1],
+ ["res_x_y", 1],
+ ["res_x", 3],
+ ["compress_all", 1],
+ ["res_x", 3],
+ ["res_x", 4],
+ ],
+ "scaling_factor": 1.0,
+ "norm_layer": "pixel_norm",
+ "patch_size": 4,
+ "latent_log_var": "uniform",
+ "use_quant_conv": False,
+ "causal_decoder": False,
+}
+
+
+# Lookup table from a diffusers-format config (hashed with make_hashable_key)
+# to the equivalent native-format config. Callers hash a loaded config the
+# same way and look it up here.
+diffusers_and_ours_config_mapping = {
+ make_hashable_key(DIFFUSERS_SCHEDULER_CONFIG): OURS_SCHEDULER_CONFIG,
+ make_hashable_key(DIFFUSERS_TRANSFORMER_CONFIG): OURS_TRANSFORMER_CONFIG,
+ make_hashable_key(DIFFUSERS_VAE_CONFIG): OURS_VAE_CONFIG,
+}
+
+
+# Name fragments of transformer keys mapped to their replacement.
+# NOTE(review): presumably applied to state-dict keys when converting a
+# diffusers checkpoint to the native layout - the consumer is not in this file.
+TRANSFORMER_KEYS_RENAME_DICT = {
+ "proj_in": "patchify_proj",
+ "time_embed": "adaln_single",
+ "norm_q": "q_norm",
+ "norm_k": "k_norm",
+}
+
+
+# Name fragments of VAE keys mapped to their replacement.
+# NOTE(review): longer, more specific fragments are listed before their shorter
+# prefixes (e.g. "decoder.up_blocks.3.conv_in" before "decoder.up_blocks.3"),
+# so the consumer presumably relies on this insertion order - confirm before
+# reordering.
+VAE_KEYS_RENAME_DICT = {
+ "decoder.up_blocks.3.conv_in": "decoder.up_blocks.7",
+ "decoder.up_blocks.3.upsamplers.0": "decoder.up_blocks.8",
+ "decoder.up_blocks.3": "decoder.up_blocks.9",
+ "decoder.up_blocks.2.upsamplers.0": "decoder.up_blocks.5",
+ "decoder.up_blocks.2.conv_in": "decoder.up_blocks.4",
+ "decoder.up_blocks.2": "decoder.up_blocks.6",
+ "decoder.up_blocks.1.upsamplers.0": "decoder.up_blocks.2",
+ "decoder.up_blocks.1": "decoder.up_blocks.3",
+ "decoder.up_blocks.0": "decoder.up_blocks.1",
+ "decoder.mid_block": "decoder.up_blocks.0",
+ "encoder.down_blocks.3": "encoder.down_blocks.8",
+ "encoder.down_blocks.2.downsamplers.0": "encoder.down_blocks.7",
+ "encoder.down_blocks.2": "encoder.down_blocks.6",
+ "encoder.down_blocks.1.downsamplers.0": "encoder.down_blocks.4",
+ "encoder.down_blocks.1.conv_out": "encoder.down_blocks.5",
+ "encoder.down_blocks.1": "encoder.down_blocks.3",
+ "encoder.down_blocks.0.conv_out": "encoder.down_blocks.2",
+ "encoder.down_blocks.0.downsamplers.0": "encoder.down_blocks.1",
+ "encoder.down_blocks.0": "encoder.down_blocks.0",
+ "encoder.mid_block": "encoder.down_blocks.9",
+ "conv_shortcut.conv": "conv_shortcut",
+ "resnets": "res_blocks",
+ "norm3": "norm3.norm",
+ "latents_mean": "per_channel_statistics.mean-of-means",
+ "latents_std": "per_channel_statistics.std-of-means",
+}
diff --git a/ltx_video/utils/prompt_enhance_utils.py b/ltx_video/utils/prompt_enhance_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d9488587f6738058214e08a8d3cac1ad4d60632
--- /dev/null
+++ b/ltx_video/utils/prompt_enhance_utils.py
@@ -0,0 +1,214 @@
+import logging
+from typing import Union, List, Optional
+
+import torch
+from PIL import Image
+
+logger = logging.getLogger(__name__) # pylint: disable=invalid-name
+
+# System message used for text-only prompt enhancement: generate_cinematic_prompt()
+# passes it to _generate_t2v_prompt(), which sends it with role "system" ahead
+# of the user's prompt. The text is runtime data - edit with care.
+T2V_CINEMATIC_PROMPT = """You are an expert cinematic director with many award winning movies, When writing prompts based on the user input, focus on detailed, chronological descriptions of actions and scenes.
+Include specific movements, appearances, camera angles, and environmental details - all in a single flowing paragraph.
+Start directly with the action, and keep descriptions literal and precise.
+Think like a cinematographer describing a shot list.
+Do not change the user input intent, just enhance it.
+Keep within 150 words.
+For best results, build your prompts using this structure:
+Start with main action in a single sentence
+Add specific details about movements and gestures
+Describe character/object appearances precisely
+Include background and environment details
+Specify camera angles and movements
+Describe lighting and colors
+Note any changes or sudden events
+Do not exceed the 150 word limit!
+Output the enhanced prompt only.
+"""
+
+# System message used when images are supplied: generate_cinematic_prompt()
+# passes it to _generate_i2v_prompt(), which sends it with role "system" ahead
+# of a user message containing both the user prompt and the image caption.
+# The text is runtime data - edit with care.
+I2V_CINEMATIC_PROMPT = """You are an expert cinematic director with many award winning movies, When writing prompts based on the user input, focus on detailed, chronological descriptions of actions and scenes.
+Include specific movements, appearances, camera angles, and environmental details - all in a single flowing paragraph.
+Start directly with the action, and keep descriptions literal and precise.
+Think like a cinematographer describing a shot list.
+Keep within 150 words.
+For best results, build your prompts using this structure:
+Describe the image first and then add the user input. Image description should be in first priority! Align to the image caption if it contradicts the user text input.
+Start with main action in a single sentence
+Add specific details about movements and gestures
+Describe character/object appearances precisely
+Include background and environment details
+Specify camera angles and movements
+Describe lighting and colors
+Note any changes or sudden events
+Align to the image caption if it contradicts the user text input.
+Do not exceed the 150 word limit!
+Output the enhanced prompt only.
+"""
+
+
+def tensor_to_pil(tensor):
+    """Convert a ``[C, H, W]`` tensor with values in ``[-1, 1]`` to a PIL image.
+
+    Args:
+        tensor: Image tensor laid out channels-first; every value must lie in
+            ``[-1, 1]``.
+
+    Returns:
+        The ``PIL.Image.Image`` built from the 8-bit, channels-last pixel data.
+
+    Raises:
+        AssertionError: If any value falls outside ``[-1, 1]``.
+    """
+    lowest, highest = tensor.min(), tensor.max()
+    assert lowest >= -1 and highest <= 1
+
+    # Map [-1, 1] onto [0, 1], then move channels last: [C, H, W] -> [H, W, C].
+    channels_last = ((tensor + 1) / 2).permute(1, 2, 0)
+
+    # Scale to [0, 255]; the uint8 cast truncates the fractional part.
+    pixels = channels_last.cpu().numpy() * 255
+    return Image.fromarray(pixels.astype("uint8"))
+
+
+def generate_cinematic_prompt(
+    image_caption_model,
+    image_caption_processor,
+    prompt_enhancer_model,
+    prompt_enhancer_tokenizer,
+    prompt: Union[str, List[str]],
+    images: Optional[List] = None,
+    max_new_tokens: int = 256,
+) -> List[str]:
+    """Rewrite user prompt(s) into detailed cinematic prompts.
+
+    Args:
+        image_caption_model: Captioning model; only used when ``images`` is given.
+        image_caption_processor: Processor paired with the captioning model;
+            only used when ``images`` is given.
+        prompt_enhancer_model: Language model that writes the enhanced prompt.
+        prompt_enhancer_tokenizer: Tokenizer paired with the enhancer model.
+        prompt: A single prompt or a list of prompts.
+        images: Optional images to caption; ``None`` selects the text-only path.
+        max_new_tokens: Generation budget forwarded to the enhancer model.
+
+    Returns:
+        The list of enhanced prompts.
+    """
+    prompt_batch = prompt if not isinstance(prompt, str) else [prompt]
+
+    # Text-only request: enhance the prompts without any image caption.
+    if images is None:
+        return _generate_t2v_prompt(
+            prompt_enhancer_model,
+            prompt_enhancer_tokenizer,
+            prompt_batch,
+            max_new_tokens,
+            T2V_CINEMATIC_PROMPT,
+        )
+
+    # Image-conditioned request: caption the images and feed the captions in.
+    return _generate_i2v_prompt(
+        image_caption_model,
+        image_caption_processor,
+        prompt_enhancer_model,
+        prompt_enhancer_tokenizer,
+        prompt_batch,
+        images,
+        max_new_tokens,
+        I2V_CINEMATIC_PROMPT,
+    )
+
+
+def _get_first_frames_from_conditioning_item(conditioning_item) -> List[Image.Image]:
+    """Return the first frame of every batch entry as a PIL image.
+
+    ``conditioning_item.media_item`` is indexed with five subscripts, so it must
+    be 5-D; index 0 is taken on the third axis and the remaining ``[C, H, W]``
+    slice is handed to ``tensor_to_pil``.
+    NOTE(review): presumably laid out as (batch, channels, frames, H, W) - confirm.
+    """
+    media = conditioning_item.media_item
+    first_frames = []
+    for batch_index in range(media.shape[0]):
+        first_frames.append(tensor_to_pil(media[batch_index, :, 0, :, :]))
+    return first_frames
+
+
+def _generate_t2v_prompt(
+    prompt_enhancer_model,
+    prompt_enhancer_tokenizer,
+    prompts: List[str],
+    max_new_tokens: int,
+    system_prompt: str,
+) -> List[str]:
+    """Enhance text-only prompts with the prompt-enhancer language model.
+
+    Each prompt is wrapped in a two-message chat (system + user), rendered with
+    the tokenizer's chat template and generated on its own.
+
+    Args:
+        prompt_enhancer_model: Language model exposing ``device`` and ``generate``.
+        prompt_enhancer_tokenizer: Tokenizer paired with the model.
+        prompts: User prompts to enhance.
+        max_new_tokens: Generation budget for every prompt.
+        system_prompt: Content of the system message.
+
+    Returns:
+        One enhanced prompt per entry of ``prompts`` (empty list for no prompts).
+    """
+    enhanced_prompts = []
+    for user_prompt in prompts:
+        conversation = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": f"user_prompt: {user_prompt}"},
+        ]
+        chat_text = prompt_enhancer_tokenizer.apply_chat_template(
+            conversation, tokenize=False, add_generation_prompt=True
+        )
+        # Tokenize one conversation at a time. Batching every text into a single
+        # return_tensors="pt" call without padding fails as soon as two prompts
+        # tokenize to different lengths; this mirrors _generate_i2v_prompt.
+        model_inputs = prompt_enhancer_tokenizer(chat_text, return_tensors="pt").to(
+            prompt_enhancer_model.device
+        )
+        decoded = _generate_and_decode_prompts(
+            prompt_enhancer_model,
+            prompt_enhancer_tokenizer,
+            model_inputs,
+            max_new_tokens,
+        )
+        enhanced_prompts.append(decoded[0])
+
+    return enhanced_prompts
+
+
+def _generate_i2v_prompt(
+    image_caption_model,
+    image_caption_processor,
+    prompt_enhancer_model,
+    prompt_enhancer_tokenizer,
+    prompts: List[str],
+    first_frames: List[Image.Image],
+    max_new_tokens: int,
+    system_prompt: str,
+) -> List[str]:
+    """Enhance prompts using captions generated from the supplied images.
+
+    The images are captioned first; each (prompt, caption) pair is then turned
+    into a system + user chat and generated separately.
+
+    Args:
+        image_caption_model: Model that captions ``first_frames``.
+        image_caption_processor: Processor paired with the captioning model.
+        prompt_enhancer_model: Language model exposing ``device`` and ``generate``.
+        prompt_enhancer_tokenizer: Tokenizer paired with the enhancer model.
+        prompts: User prompts to enhance.
+        first_frames: Images to caption.
+        max_new_tokens: Generation budget for every prompt.
+        system_prompt: Content of the system message.
+
+    Returns:
+        One enhanced prompt per (prompt, caption) pair; pairing stops at the
+        shorter of the two lists.
+    """
+    captions = _generate_image_captions(
+        image_caption_model, image_caption_processor, first_frames
+    )
+    # A lone caption is shared by every prompt.
+    if len(captions) == 1 and len(prompts) > 1:
+        captions = captions * len(prompts)
+
+    enhanced_prompts = []
+    for user_prompt, caption in zip(prompts, captions):
+        conversation = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": f"user_prompt: {user_prompt}\nimage_caption: {caption}"},
+        ]
+        chat_text = prompt_enhancer_tokenizer.apply_chat_template(
+            conversation, tokenize=False, add_generation_prompt=True
+        )
+        model_inputs = prompt_enhancer_tokenizer(chat_text, return_tensors="pt").to(
+            prompt_enhancer_model.device
+        )
+        decoded = _generate_and_decode_prompts(
+            prompt_enhancer_model,
+            prompt_enhancer_tokenizer,
+            model_inputs,
+            max_new_tokens,
+        )
+        enhanced_prompts.append(decoded[0])
+
+    return enhanced_prompts
+
+
+def _generate_image_captions(
+    image_caption_model,
+    image_caption_processor,
+    images: List[Image.Image],
+    system_prompt: str = "",
+) -> List[str]:
+    """Caption every image with the image-captioning model.
+
+    Args:
+        image_caption_model: Model whose ``generate`` accepts ``input_ids`` and
+            ``pixel_values``.
+        image_caption_processor: Processor that builds the model inputs and
+            decodes the generated ids.
+        images: Images to caption.
+        system_prompt: Text prompt paired with each image (same for all).
+
+    Returns:
+        The decoded captions, special tokens removed.
+    """
+    prompt_per_image = [system_prompt] * len(images)
+    batch = image_caption_processor(prompt_per_image, images, return_tensors="pt")
+    # NOTE(review): the inputs are sent to the literal "cuda" device; moving them
+    # to image_caption_model.device was left commented out in the original -
+    # confirm the reason before changing.
+    batch = batch.to("cuda")
+
+    generation_options = {
+        "max_new_tokens": 1024,
+        "do_sample": False,
+        "num_beams": 3,
+    }
+    with torch.inference_mode():
+        generated_ids = image_caption_model.generate(
+            input_ids=batch["input_ids"],
+            pixel_values=batch["pixel_values"],
+            **generation_options,
+        )
+
+    return image_caption_processor.batch_decode(generated_ids, skip_special_tokens=True)
+
+
+def _generate_and_decode_prompts(
+    prompt_enhancer_model, prompt_enhancer_tokenizer, model_inputs, max_new_tokens: int
+) -> List[str]:
+    """Generate continuations and decode only the newly produced tokens.
+
+    Args:
+        prompt_enhancer_model: Model whose ``generate`` receives ``model_inputs``
+            as keyword arguments plus ``max_new_tokens``.
+        prompt_enhancer_tokenizer: Tokenizer used for ``batch_decode``.
+        model_inputs: Mapping of model inputs that also exposes ``input_ids``
+            as an attribute.
+        max_new_tokens: Generation budget.
+
+    Returns:
+        One decoded string per generated sequence, special tokens removed.
+    """
+    with torch.inference_mode():
+        sequences = prompt_enhancer_model.generate(
+            **model_inputs, max_new_tokens=max_new_tokens
+        )
+        completions = []
+        for prompt_ids, sequence in zip(model_inputs.input_ids, sequences):
+            # Each output starts with its own input tokens; drop that prefix.
+            prompt_length = len(prompt_ids)
+            completions.append(sequence[prompt_length:])
+
+        return prompt_enhancer_tokenizer.batch_decode(
+            completions, skip_special_tokens=True
+        )
diff --git a/ltx_video/utils/skip_layer_strategy.py b/ltx_video/utils/skip_layer_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..30f9016e1cf2abbe62360775e914fa63876e4cf7
--- /dev/null
+++ b/ltx_video/utils/skip_layer_strategy.py
@@ -0,0 +1,8 @@
+from enum import Enum, auto
+
+
+class SkipLayerStrategy(Enum):
+    """Closed set of skip-layer strategies.
+
+    Values come from ``auto()``, i.e. 1 to 4 in declaration order.
+    NOTE(review): what each strategy does is decided by the code that consumes
+    this enum, which is not visible here.
+    """
+
+    AttentionSkip = auto()
+    AttentionValues = auto()
+    Residual = auto()
+    TransformerBlock = auto()
diff --git a/ltx_video/utils/torch_utils.py b/ltx_video/utils/torch_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..991b07c36269ef4dafb88a85834f2596647ba816
--- /dev/null
+++ b/ltx_video/utils/torch_utils.py
@@ -0,0 +1,25 @@
+import torch
+from torch import nn
+
+
+def append_dims(x: torch.Tensor, target_dims: int) -> torch.Tensor:
+    """Pad ``x`` with trailing singleton axes until it has ``target_dims`` dims.
+
+    Args:
+        x: Tensor to extend.
+        target_dims: Desired number of dimensions; must be at least ``x.ndim``.
+
+    Returns:
+        ``x`` itself when it already has ``target_dims`` dimensions, otherwise
+        ``x`` indexed with extra trailing ``None`` axes.
+
+    Raises:
+        ValueError: If ``target_dims`` is smaller than ``x.ndim``.
+    """
+    missing = target_dims - x.ndim
+    if missing < 0:
+        raise ValueError(
+            f"input has {x.ndim} dims but target_dims is {target_dims}, which is less"
+        )
+    if missing == 0:
+        return x
+    trailing_axes = (None,) * missing
+    return x[(...,) + trailing_axes]
+
+
+class Identity(nn.Module):
+    """Pass-through module: returns its first input and ignores everything else.
+
+    Both the constructor and ``forward`` accept and discard arbitrary extra
+    positional and keyword arguments.
+    """
+
+    def __init__(self, *args, **kwargs) -> None:  # pylint: disable=unused-argument
+        # Extra arguments are accepted and deliberately dropped.
+        super().__init__()
+
+    def forward(  # pylint: disable=unused-argument
+        self, x: torch.Tensor, *args, **kwargs
+    ) -> torch.Tensor:
+        """Return ``x`` unchanged."""
+        return x
diff --git a/preprocessing/matanyone/app.py b/preprocessing/matanyone/app.py
index f92dab94a8b7c6fbde3dc19c9bb67b2bf8381f98..8e17c24b08bbbbaace3085cd830365bfa0ce54c6 100644
--- a/preprocessing/matanyone/app.py
+++ b/preprocessing/matanyone/app.py
@@ -457,13 +457,20 @@ def export_to_vace_video_input(foreground_video_output):
gr.Info("Masked Video Input transferred to Vace For Inpainting")
return "V#" + str(time.time()), foreground_video_output
-def export_to_vace_video_mask(foreground_video_output, alpha_video_output):
- gr.Info("Masked Video Input and Full Mask transferred to Vace For Inpainting")
- return "MV#" + str(time.time()), foreground_video_output, alpha_video_output
+def export_to_current_video_engine(foreground_video_output, alpha_video_output):
+    """Notify the user and pass the foreground and alpha videos through.
+
+    Shows a Gradio info message, then returns the two arguments unchanged and
+    in order, for whatever components the event binds as outputs.
+
+    NOTE(review): exactly two values are returned (an earlier variant also
+    returned a leading "MV#<timestamp>" trigger value). Any event whose
+    ``outputs`` list has three components for this function will not line up -
+    check every place this handler is wired.
+    """
+    gr.Info("Masked Video Input and Full Mask transferred to Current Video Engine For Inpainting")
+    exported = (foreground_video_output, alpha_video_output)
+    return exported
-def teleport_to_vace():
+def teleport_to_video_tab():
+    """Return a ``gr.Tabs`` update that selects the ``"video_gen"`` tab."""
+    video_tab_update = gr.Tabs(selected="video_gen")
+    return video_tab_update
+
+def teleport_to_vace_1_3B():
return gr.Tabs(selected="video_gen"), gr.Dropdown(value="vace_1.3B")
+def teleport_to_vace_14B():
+    """Return updates selecting the ``"video_gen"`` tab and the ``"vace_14B"`` model."""
+    video_tab_update = gr.Tabs(selected="video_gen")
+    model_update = gr.Dropdown(value="vace_14B")
+    return video_tab_update, model_update
+
def display(tabs, model_choice, vace_video_input, vace_video_mask, video_prompt_video_guide_trigger):
# my_tab.select(fn=load_unload_models, inputs=[], outputs=[])
@@ -596,13 +603,16 @@ def display(tabs, model_choice, vace_video_input, vace_video_mask, video_prompt_
alpha_output_button = gr.Button(value="Alpha Mask Output", visible=False, elem_classes="new_button")
with gr.Row():
with gr.Row(visible= False):
- export_to_vace_video_input_btn = gr.Button("Export to Vace Video Input Video For Inpainting", visible= False)
+ export_to_vace_video_14B_btn = gr.Button("Export to current Video Input Video For Inpainting", visible= False)
with gr.Row(visible= True):
- export_to_vace_video_mask_btn = gr.Button("Export to Vace Video Input and Video Mask", visible= False)
+ export_to_current_video_engine_btn = gr.Button("Export to current Video Input and Video Mask", visible= False)
- export_to_vace_video_input_btn.click(fn=export_to_vace_video_input, inputs= [foreground_video_output], outputs= [video_prompt_video_guide_trigger, vace_video_input])
- export_to_vace_video_mask_btn.click(fn=export_to_vace_video_mask, inputs= [foreground_video_output, alpha_video_output], outputs= [video_prompt_video_guide_trigger, vace_video_input, vace_video_mask]).then(
- fn=teleport_to_vace, inputs=[], outputs=[tabs, model_choice])
+ export_to_vace_video_14B_btn.click( fn=teleport_to_vace_14B, inputs=[], outputs=[tabs, model_choice]).then(
+ fn=export_to_current_video_engine, inputs= [foreground_video_output, alpha_video_output], outputs= [video_prompt_video_guide_trigger, vace_video_input, vace_video_mask])
+
+ export_to_current_video_engine_btn.click( fn=export_to_current_video_engine, inputs= [foreground_video_output, alpha_video_output], outputs= [vace_video_input, vace_video_mask]).then( #video_prompt_video_guide_trigger,
+ fn=teleport_to_video_tab, inputs= [], outputs= [tabs])
+
# first step: get the video information
extract_frames_button.click(
fn=get_frames_from_video,
@@ -649,7 +659,7 @@ def display(tabs, model_choice, vace_video_input, vace_video_mask, video_prompt_
outputs=[foreground_video_output, alpha_video_output]).then(
fn=video_matting,
inputs=[video_state, end_selection_slider, matting_type, interactive_state, mask_dropdown, erode_kernel_size, dilate_kernel_size],
- outputs=[foreground_video_output, alpha_video_output,foreground_video_output, alpha_video_output, export_to_vace_video_input_btn, export_to_vace_video_mask_btn]
+ outputs=[foreground_video_output, alpha_video_output,foreground_video_output, alpha_video_output, export_to_vace_video_14B_btn, export_to_current_video_engine_btn]
)
# click to get mask
@@ -669,7 +679,7 @@ def display(tabs, model_choice, vace_video_input, vace_video_mask, video_prompt_
click_state,
foreground_video_output, alpha_video_output,
template_frame,
- image_selection_slider, end_selection_slider, track_pause_number_slider,point_prompt, export_to_vace_video_input_btn, export_to_vace_video_mask_btn, matting_type, clear_button_click,
+ image_selection_slider, end_selection_slider, track_pause_number_slider,point_prompt, export_to_vace_video_14B_btn, export_to_current_video_engine_btn, matting_type, clear_button_click,
add_mask_button, matting_button, template_frame, foreground_video_output, alpha_video_output, remove_mask_button, foreground_output_button, alpha_output_button, mask_dropdown, video_info, step2_title
],
queue=False,
@@ -684,7 +694,7 @@ def display(tabs, model_choice, vace_video_input, vace_video_mask, video_prompt_
click_state,
foreground_video_output, alpha_video_output,
template_frame,
- image_selection_slider , end_selection_slider, track_pause_number_slider,point_prompt, export_to_vace_video_input_btn, export_to_vace_video_mask_btn, matting_type, clear_button_click,
+ image_selection_slider , end_selection_slider, track_pause_number_slider,point_prompt, export_to_vace_video_14B_btn, export_to_current_video_engine_btn, matting_type, clear_button_click,
add_mask_button, matting_button, template_frame, foreground_video_output, alpha_video_output, remove_mask_button, foreground_output_button, alpha_output_button, mask_dropdown, video_info, step2_title
],
queue=False,
diff --git a/requirements.txt b/requirements.txt
index 5327d4ba787ea00f83bf96cdf5b9aaffaa8980ae..9e840a62c9722f88e0f3d3bb347ae8f1b1e02fab 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,8 @@ torch>=2.4.0
torchvision>=0.19.0
opencv-python>=4.9.0.80
diffusers>=0.31.0
-transformers==4.49.0
+transformers==4.51.3
+#transformers==4.46.3 # was needed by llamallava used by i2v hunyuan before patch
tokenizers>=0.20.3
accelerate>=1.1.1
tqdm
@@ -16,7 +17,7 @@ gradio==5.23.0
numpy>=1.23.5,<2
einops
moviepy==1.0.3
-mmgp==3.4.4
+mmgp==3.4.5
peft==0.14.0
mutagen
pydantic==2.10.6
@@ -29,4 +30,5 @@ segment-anything
omegaconf
hydra-core
librosa
+#loguru
# rembg==2.0.65
diff --git a/wan/configs/wan_i2v_14B.py b/wan/configs/wan_i2v_14B.py
index 12e8e205bffb343a6e27d2828fb573db1d6349f8..7812c929c5bc4552a960ee37a80a1a4448c3a9cb 100644
--- a/wan/configs/wan_i2v_14B.py
+++ b/wan/configs/wan_i2v_14B.py
@@ -15,7 +15,7 @@ i2v_14B.t5_tokenizer = 'google/umt5-xxl'
# clip
i2v_14B.clip_model = 'clip_xlm_roberta_vit_h_14'
i2v_14B.clip_dtype = torch.float16
-i2v_14B.clip_checkpoint = 'models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth'
+i2v_14B.clip_checkpoint = 'xlm-roberta-large/models_clip_open-clip-xlm-roberta-large-vit-huge-14-bf16.safetensors'
i2v_14B.clip_tokenizer = 'xlm-roberta-large'
# vae
diff --git a/wan/diffusion_forcing.py b/wan/diffusion_forcing.py
index 87352ebf6f6815450bc8c4570a7e9cdcf5a970d7..bcb735d47e4f1eb66fa702711e935314644208f8 100644
--- a/wan/diffusion_forcing.py
+++ b/wan/diffusion_forcing.py
@@ -80,11 +80,11 @@ class DTT2V:
return self._guidance_scale > 1
def encode_image(
- self, image: PipelineImageInput, height: int, width: int, num_frames: int, tile_size = 0, causal_block_size = 0
+ self, image_start: PipelineImageInput, height: int, width: int, num_frames: int, tile_size = 0, causal_block_size = 0
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
# prefix_video
- prefix_video = np.array(image.resize((width, height))).transpose(2, 0, 1)
+ prefix_video = np.array(image_start.resize((width, height))).transpose(2, 0, 1)
prefix_video = torch.tensor(prefix_video).unsqueeze(1) # .to(image_embeds.dtype).unsqueeze(1)
if prefix_video.dtype == torch.uint8:
prefix_video = (prefix_video.float() / (255.0 / 2.0)) - 1.0
@@ -185,19 +185,19 @@ class DTT2V:
@torch.no_grad()
def generate(
self,
- prompt: Union[str, List[str]],
- negative_prompt: Union[str, List[str]] = "",
- image: PipelineImageInput = None,
+ input_prompt: Union[str, List[str]],
+ n_prompt: Union[str, List[str]] = "",
+ image_start: PipelineImageInput = None,
input_video = None,
height: int = 480,
width: int = 832,
fit_into_canvas = True,
- num_frames: int = 97,
- num_inference_steps: int = 50,
+ frame_num: int = 97,
+ sampling_steps: int = 50,
shift: float = 1.0,
- guidance_scale: float = 5.0,
+ guide_scale: float = 5.0,
seed: float = 0.0,
- addnoise_condition: int = 0,
+ overlap_noise: int = 0,
ar_step: int = 5,
causal_block_size: int = 5,
causal_attention: bool = True,
@@ -208,13 +208,14 @@ class DTT2V:
slg_start = 0.0,
slg_end = 1.0,
callback = None,
+ **bbargs
):
self._interrupt = False
generator = torch.Generator(device=self.device)
generator.manual_seed(seed)
- self._guidance_scale = guidance_scale
- num_frames = max(17, num_frames) # must match causal_block_size for value of 5
- num_frames = int( round( (num_frames - 17) / 20)* 20 + 17 )
+ self._guidance_scale = guide_scale
+ frame_num = max(17, frame_num) # must match causal_block_size for value of 5
+ frame_num = int( round( (frame_num - 17) / 20)* 20 + 17 )
if ar_step == 0:
causal_block_size = 1
@@ -226,29 +227,29 @@ class DTT2V:
if input_video != None:
_ , _ , height, width = input_video.shape
- elif image != None:
- image = image[0]
- frame_width, frame_height = image.size
+ elif image_start != None:
+ image_start = image_start[0]
+ frame_width, frame_height = image_start.size
height, width = calculate_new_dimensions(height, width, frame_height, frame_width, fit_into_canvas)
- image = np.array(image.resize((width, height))).transpose(2, 0, 1)
+ image_start = np.array(image_start.resize((width, height))).transpose(2, 0, 1)
- latent_length = (num_frames - 1) // 4 + 1
+ latent_length = (frame_num - 1) // 4 + 1
latent_height = height // 8
latent_width = width // 8
if self._interrupt:
return None
- prompt_embeds = self.text_encoder([prompt], self.device)[0]
+ prompt_embeds = self.text_encoder([input_prompt], self.device)[0]
prompt_embeds = prompt_embeds.to(self.dtype).to(self.device)
if self.do_classifier_free_guidance:
- negative_prompt_embeds = self.text_encoder([negative_prompt], self.device)[0]
+ negative_prompt_embeds = self.text_encoder([n_prompt], self.device)[0]
negative_prompt_embeds = negative_prompt_embeds.to(self.dtype).to(self.device)
if self._interrupt:
return None
- self.scheduler.set_timesteps(num_inference_steps, device=self.device, shift=shift)
+ self.scheduler.set_timesteps(sampling_steps, device=self.device, shift=shift)
init_timesteps = self.scheduler.timesteps
fps_embeds = [fps] #* prompt_embeds[0].shape[0]
fps_embeds = [0 if i == 16 else 1 for i in fps_embeds]
@@ -256,14 +257,14 @@ class DTT2V:
output_video = input_video
- if image is not None or output_video is not None: # i !=0
+ if image_start is not None or output_video is not None: # i !=0
if output_video is not None:
prefix_video = output_video.to(self.device)
else:
causal_block_size = 1
causal_attention = False
ar_step = 0
- prefix_video = image
+ prefix_video = image_start
prefix_video = torch.tensor(prefix_video).unsqueeze(1) # .to(image_embeds.dtype).unsqueeze(1)
if prefix_video.dtype == torch.uint8:
prefix_video = (prefix_video.float() / (255.0 / 2.0)) - 1.0
@@ -301,7 +302,7 @@ class DTT2V:
sample_scheduler = FlowUniPCMultistepScheduler(
num_train_timesteps=1000, shift=1, use_dynamic_shifting=False
)
- sample_scheduler.set_timesteps(num_inference_steps, device=self.device, shift=shift)
+ sample_scheduler.set_timesteps(sampling_steps, device=self.device, shift=shift)
sample_schedulers.append(sample_scheduler)
sample_schedulers_counter = [0] * base_num_frames_iter
@@ -316,8 +317,8 @@ class DTT2V:
for i, timestep_i in enumerate(step_matrix):
valid_interval_start, valid_interval_end = valid_interval[i]
timestep = timestep_i[None, valid_interval_start:valid_interval_end].clone()
- if addnoise_condition > 0 and valid_interval_start < predix_video_latent_length:
- timestep[:, valid_interval_start:predix_video_latent_length] = addnoise_condition
+ if overlap_noise > 0 and valid_interval_start < predix_video_latent_length:
+ timestep[:, valid_interval_start:predix_video_latent_length] = overlap_noise
time_steps_comb.append(timestep)
self.model.compute_teacache_threshold(self.model.teacache_start_step, time_steps_comb, self.model.teacache_multiplier)
del time_steps_comb
@@ -341,9 +342,9 @@ class DTT2V:
valid_interval_start, valid_interval_end = valid_interval[i]
timestep = timestep_i[None, valid_interval_start:valid_interval_end].clone()
latent_model_input = latents[:, valid_interval_start:valid_interval_end, :, :].clone()
- if addnoise_condition > 0 and valid_interval_start < predix_video_latent_length:
- noise_factor = 0.001 * addnoise_condition
- timestep_for_noised_condition = addnoise_condition
+ if overlap_noise > 0 and valid_interval_start < predix_video_latent_length:
+ noise_factor = 0.001 * overlap_noise
+ timestep_for_noised_condition = overlap_noise
latent_model_input[:, valid_interval_start:predix_video_latent_length] = (
latent_model_input[:, valid_interval_start:predix_video_latent_length]
* (1.0 - noise_factor)
@@ -395,7 +396,7 @@ class DTT2V:
)[0]
if self._interrupt:
return None
- noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
+ noise_pred = noise_pred_uncond + guide_scale * (noise_pred_cond - noise_pred_uncond)
del noise_pred_cond, noise_pred_uncond
for idx in range(valid_interval_start, valid_interval_end):
if update_mask_i[idx].item():
diff --git a/wan/image2video.py b/wan/image2video.py
index db36ec8999a6e004eb436e29780b2cb9034fa70b..d8c5aaa0e7cd1d888661955993b999af7db81684 100644
--- a/wan/image2video.py
+++ b/wan/image2video.py
@@ -80,9 +80,9 @@ class WanI2V:
self.clip = CLIPModel(
dtype=config.clip_dtype,
device=self.device,
- checkpoint_path=os.path.join(checkpoint_dir,
+ checkpoint_path=os.path.join(checkpoint_dir ,
config.clip_checkpoint),
- tokenizer_path=os.path.join(checkpoint_dir, config.clip_tokenizer))
+ tokenizer_path=os.path.join(checkpoint_dir , config.clip_tokenizer))
logging.info(f"Creating WanModel from {model_filename[-1]}")
from mmgp import offload
@@ -116,8 +116,8 @@ class WanI2V:
def generate(self,
input_prompt,
- img,
- img2 = None,
+ image_start,
+ image_end = None,
height =720,
width = 1280,
fit_into_canvas = True,
@@ -137,11 +137,12 @@ class WanI2V:
slg_end = 1.0,
cfg_star_switch = True,
cfg_zero_step = 5,
- add_frames_for_end_image = True,
audio_scale=None,
audio_cfg_scale=None,
audio_proj=None,
audio_context_lens=None,
+ model_filename = None,
+ **bbargs
):
r"""
Generates video frames from input image and text prompt using diffusion process.
@@ -149,7 +150,7 @@ class WanI2V:
Args:
input_prompt (`str`):
Text prompt for content generation.
- img (PIL.Image.Image):
+ image_start (PIL.Image.Image):
Input image tensor. Shape: [3, H, W]
max_area (`int`, *optional*, defaults to 720*1280):
Maximum pixel area for latent space calculation. Controls video resolution scaling
@@ -179,17 +180,20 @@ class WanI2V:
- H: Frame height (from max_area)
- W: Frame width from max_area)
"""
- img = TF.to_tensor(img)
+
+ add_frames_for_end_image = "image2video" in model_filename or "fantasy" in model_filename
+
+ image_start = TF.to_tensor(image_start)
lat_frames = int((frame_num - 1) // self.vae_stride[0] + 1)
- any_end_frame = img2 !=None
+ any_end_frame = image_end !=None
if any_end_frame:
any_end_frame = True
- img2 = TF.to_tensor(img2)
+ image_end = TF.to_tensor(image_end)
if add_frames_for_end_image:
frame_num +=1
lat_frames = int((frame_num - 2) // self.vae_stride[0] + 2)
- h, w = img.shape[1:]
+ h, w = image_start.shape[1:]
h, w = calculate_new_dimensions(height, width, h, w, fit_into_canvas)
@@ -203,13 +207,13 @@ class WanI2V:
w = lat_w * self.vae_stride[2]
clip_image_size = self.clip.model.image_size
- img_interpolated = resize_lanczos(img, h, w).sub_(0.5).div_(0.5).unsqueeze(0).transpose(0,1).to(self.device) #, self.dtype
- img = resize_lanczos(img, clip_image_size, clip_image_size)
- img = img.sub_(0.5).div_(0.5).to(self.device) #, self.dtype
- if img2!= None:
- img_interpolated2 = resize_lanczos(img2, h, w).sub_(0.5).div_(0.5).unsqueeze(0).transpose(0,1).to(self.device) #, self.dtype
- img2 = resize_lanczos(img2, clip_image_size, clip_image_size)
- img2 = img2.sub_(0.5).div_(0.5).to(self.device) #, self.dtype
+ img_interpolated = resize_lanczos(image_start, h, w).sub_(0.5).div_(0.5).unsqueeze(0).transpose(0,1).to(self.device) #, self.dtype
+ image_start = resize_lanczos(image_start, clip_image_size, clip_image_size)
+ image_start = image_start.sub_(0.5).div_(0.5).to(self.device) #, self.dtype
+ if image_end!= None:
+ img_interpolated2 = resize_lanczos(image_end, h, w).sub_(0.5).div_(0.5).unsqueeze(0).transpose(0,1).to(self.device) #, self.dtype
+ image_end = resize_lanczos(image_end, clip_image_size, clip_image_size)
+ image_end = image_end.sub_(0.5).div_(0.5).to(self.device) #, self.dtype
max_seq_len = lat_frames * lat_h * lat_w // ( self.patch_size[1] * self.patch_size[2])
@@ -247,7 +251,7 @@ class WanI2V:
if self._interrupt:
return None
- clip_context = self.clip.visual([img[:, None, :, :]])
+ clip_context = self.clip.visual([image_start[:, None, :, :]])
from mmgp import offload
offload.last_offload_obj.unload_all()
@@ -263,7 +267,7 @@ class WanI2V:
img_interpolated,
torch.zeros(3, frame_num-1, h, w, device=self.device, dtype= self.VAE_dtype)
], dim=1).to(self.device)
- img, img2, img_interpolated, img_interpolated2 = None, None, None, None
+ image_start, image_end, img_interpolated, img_interpolated2 = None, None, None, None
lat_y = self.vae.encode([enc], VAE_tile_size, any_end_frame= any_end_frame and add_frames_for_end_image)[0]
y = torch.concat([msk, lat_y])
diff --git a/wan/modules/attention.py b/wan/modules/attention.py
index f70a85b13e4fc28c0f7fb4f433bc53bcd32bbe1f..635b6d21c498e0f1c34a03990086c77b7fb2d0bd 100644
--- a/wan/modules/attention.py
+++ b/wan/modules/attention.py
@@ -4,6 +4,8 @@ from importlib.metadata import version
from mmgp import offload
import torch.nn.functional as F
+major, minor = torch.cuda.get_device_capability(None)
+bfloat16_supported = major >= 8
try:
from xformers.ops import memory_efficient_attention
@@ -56,10 +58,6 @@ def sageattn_wrapper(
attention_length
):
q,k, v = qkv_list
- padding_length = q.shape[1] -attention_length
- q = q[:, :attention_length, :, : ]
- k = k[:, :attention_length, :, : ]
- v = v[:, :attention_length, :, : ]
if True:
qkv_list = [q,k,v]
del q, k ,v
@@ -70,9 +68,6 @@ def sageattn_wrapper(
qkv_list.clear()
- if padding_length > 0:
- o = torch.cat([o, torch.empty( (padding_length, *o.shape[-2:]), dtype= o.dtype, device=o.device ) ], 0)
-
return o
# try:
@@ -104,23 +99,20 @@ def sageattn_wrapper(
@torch.compiler.disable()
def sdpa_wrapper(
qkv_list,
- attention_length
+ attention_length,
+ attention_mask = None
):
- q,k, v = qkv_list
- padding_length = q.shape[1] -attention_length
- q = q[:attention_length, :].transpose(1,2)
- k = k[:attention_length, :].transpose(1,2)
- v = v[:attention_length, :].transpose(1,2)
-
- o = F.scaled_dot_product_attention(
- q, k, v, attn_mask=None, is_causal=False
- ).transpose(1,2)
+ q, k, v = qkv_list
+
+ q = q.transpose(1,2)
+ k = k.transpose(1,2)
+ v = v.transpose(1,2)
+ if attention_mask != None:
+ attention_mask = attention_mask.transpose(1,2)
+ o = F.scaled_dot_product_attention( q, k, v, attn_mask=attention_mask, is_causal=False).transpose(1,2)
del q, k ,v
qkv_list.clear()
- if padding_length > 0:
- o = torch.cat([o, torch.empty( (padding_length, *o.shape[-2:]), dtype= o.dtype, device=o.device ) ], 0)
-
return o
@@ -149,7 +141,19 @@ __all__ = [
'attention',
]
+def get_cu_seqlens(batch_size, lens, max_len):
+    """Build cumulative sequence boundaries for a batch of padded sequences.
+
+    Every batch entry owns a slot of ``max_len`` positions and contributes two
+    boundaries: the end of its ``lens[i]`` leading positions and the end of the
+    slot. The result starts at 0 and reads
+    ``[0, lens[0], max_len, max_len + lens[1], 2 * max_len, ...]``.
+
+    Args:
+        batch_size: Number of batch entries.
+        lens: Per-entry lengths, indexable by batch position.
+        max_len: Slot size shared by every entry.
+
+    Returns:
+        An int32 tensor with ``2 * batch_size + 1`` entries, allocated on the
+        ``"cuda"`` device.
+    """
+    boundaries = torch.zeros([2 * batch_size + 1], dtype=torch.int32, device="cuda")
+
+    for index in range(batch_size):
+        slot_start = index * max_len
+        boundaries[2 * index + 1] = slot_start + lens[index]  # end of the real part
+        boundaries[2 * index + 2] = (index + 1) * max_len  # end of the padded slot
+    return boundaries
+
+@torch.compiler.disable()
def pay_attention(
qkv_list,
dropout_p=0.,
@@ -159,21 +163,34 @@ def pay_attention(
deterministic=False,
version=None,
force_attention= None,
+ attention_mask = None,
cross_attn= False,
- k_lens = None
+ q_lens = None,
+ k_lens = None,
):
-
+ # format : torch.Size([batches, tokens, heads, head_features])
+ # assume if q_lens is non null, each q is padded up to lq (one q out of two will need to be discarded or ignored)
+ # assume if k_lens is non null, each k is padded up to lk (one k out of two will need to be discarded or ignored)
+ if attention_mask != None:
+ force_attention = "sdpa"
attn = offload.shared_state["_attention"] if force_attention== None else force_attention
+
q,k,v = qkv_list
qkv_list.clear()
-
- # params
- b, lq, lk, out_dtype = q.size(0), q.size(1), k.size(1), q.dtype
+ out_dtype = q.dtype
+ if q.dtype == torch.bfloat16 and not bfloat16_supported:
+ q = q.to(torch.float16)
+ k = k.to(torch.float16)
+ v = v.to(torch.float16)
+ final_padding = 0
+ b, lq, lk = q.size(0), q.size(1), k.size(1)
q = q.to(v.dtype)
k = k.to(v.dtype)
- if b > 0 and k_lens != None and attn in ("sage2", "sdpa"):
- # Poor's man var len attention
+ if b > 1 and k_lens != None and attn in ("sage2", "sdpa"):
+ assert attention_mask == None
+ # Poor's man var k len attention
+ assert q_lens == None
chunk_sizes = []
k_sizes = []
current_size = k_lens[0]
@@ -203,6 +220,15 @@ def pay_attention(
q_chunks, k_chunks, v_chunks = None, None, None
o = torch.cat(o, dim = 0)
return o
+ elif (q_lens != None or k_lens != None) and attn in ("sage2", "sdpa"):
+ assert b == 1
+ szq = q_lens[0].item() if q_lens != None else lq
+ szk = k_lens[0].item() if k_lens != None else lk
+ final_padding = lq - szq
+ q = q[:, :szq]
+ k = k[:, :szk]
+ v = v[:, :szk]
+
if version is not None and version == 3 and not FLASH_ATTN_3_AVAILABLE:
warnings.warn(
'Flash attention 3 is not available, use flash attention 2 instead.'
@@ -211,16 +237,23 @@ def pay_attention(
if attn=="sage" or attn=="flash":
if b != 1 :
if k_lens == None:
- k_lens = torch.tensor( [lk] * b, dtype=torch.int32).to(device=q.device, non_blocking=True)
- k = torch.cat([u[:v] for u, v in zip(k, k_lens)])
- v = torch.cat([u[:v] for u, v in zip(v, k_lens)])
+ k_lens = torch.tensor( [lk] * b, dtype=torch.int32).to(device=q.device, non_blocking=True)
+ if q_lens == None:
+ q_lens = torch.tensor([lq] * b, dtype=torch.int32).to(device=q.device, non_blocking=True)
+ k = k.reshape(-1, *k.shape[-2:])
+ v = v.reshape(-1, *v.shape[-2:])
q = q.reshape(-1, *q.shape[-2:])
- q_lens = torch.tensor([lq] * b, dtype=torch.int32).to(device=q.device, non_blocking=True)
- cu_seqlens_q=torch.cat([k_lens.new_zeros([1]), q_lens]).cumsum(0, dtype=torch.int32)
- cu_seqlens_k=torch.cat([k_lens.new_zeros([1]), k_lens]).cumsum(0, dtype=torch.int32)
+ cu_seqlens_q=get_cu_seqlens(b, q_lens, lq)
+ cu_seqlens_k=get_cu_seqlens(b, k_lens, lk)
else:
- cu_seqlens_q = torch.tensor([0, lq], dtype=torch.int32, device="cuda")
- cu_seqlens_k = torch.tensor([0, lk], dtype=torch.int32, device="cuda")
+ szq = q_lens[0].item() if q_lens != None else lq
+ szk = k_lens[0].item() if k_lens != None else lk
+ if szq != lq or szk != lk:
+ cu_seqlens_q = torch.tensor([0, szq, lq], dtype=torch.int32, device="cuda")
+ cu_seqlens_k = torch.tensor([0, szk, lk], dtype=torch.int32, device="cuda")
+ else:
+ cu_seqlens_q = torch.tensor([0, lq], dtype=torch.int32, device="cuda")
+ cu_seqlens_k = torch.tensor([0, lk], dtype=torch.int32, device="cuda")
q = q.squeeze(0)
k = k.squeeze(0)
v = v.squeeze(0)
@@ -304,7 +337,7 @@ def pay_attention(
elif attn=="sdpa":
qkv_list = [q, k, v]
del q ,k ,v
- x = sdpa_wrapper( qkv_list, lq) #.unsqueeze(0)
+ x = sdpa_wrapper( qkv_list, lq, attention_mask = attention_mask) #.unsqueeze(0)
elif attn=="flash" and version == 3:
# Note: dropout_p, window_size are not supported in FA3 now.
x = flash_attn_interface.flash_attn_varlen_func(
@@ -339,10 +372,21 @@ def pay_attention(
elif attn=="xformers":
from xformers.ops.fmha.attn_bias import BlockDiagonalPaddedKeysMask
- if b != 1 and k_lens != None:
- attn_mask = BlockDiagonalPaddedKeysMask.from_seqlens([lq] * b , lk, list(k_lens) )
+ if k_lens == None and q_lens == None:
+ x = memory_efficient_attention(q, k, v )
+ elif k_lens != None and q_lens == None:
+ attn_mask = BlockDiagonalPaddedKeysMask.from_seqlens([lq] * b , lk , list(k_lens) )
+ x = memory_efficient_attention(q, k, v, attn_bias= attn_mask )
+ elif b == 1:
+ szq = q_lens[0].item() if q_lens != None else lq
+ szk = k_lens[0].item() if k_lens != None else lk
+ attn_mask = BlockDiagonalPaddedKeysMask.from_seqlens([szq, lq - szq ] , lk , [szk, 0] )
x = memory_efficient_attention(q, k, v, attn_bias= attn_mask )
else:
- x = memory_efficient_attention(q, k, v )
-
- return x.type(out_dtype)
\ No newline at end of file
+ assert False
+ x = x.type(out_dtype)
+ if final_padding > 0:
+ x = torch.cat([x, torch.empty( (x.shape[0], final_padding, *x.shape[-2:]), dtype= x.dtype, device=x.device ) ], 1)
+
+
+ return x
\ No newline at end of file
diff --git a/wan/modules/model.py b/wan/modules/model.py
index 8892454bb3ef18e49d6308cfc35545ea8346b554..9e01486d704a880d51fa4613e07451201e974380 100644
--- a/wan/modules/model.py
+++ b/wan/modules/model.py
@@ -589,6 +589,62 @@ class MLPProj(torch.nn.Module):
class WanModel(ModelMixin, ConfigMixin):
+    @staticmethod
+    def preprocess_loras(model_filename, sd):
+        """Convert a LoRA state dict into the key layout expected by this model.
+
+        When the first key starts with "lora_unet_", keys are renamed to the
+        "diffusion_model.blocks." / lora_A / lora_B layout and each alpha with a
+        matching weight becomes alpha / dim (dim read from the weight shape).
+        For "text2video" files, cross_attn k_img / v_img entries are dropped.
+        """
+        first = next(iter(sd), None)
+        if first is None:
+            return sd
+
+        if first.startswith("lora_unet_"):
+            new_sd = {}
+            print("Converting Lora Safetensors format to Lora Diffusers format")
+            alphas = {}
+            repl_list = ["cross_attn", "self_attn", "ffn"]
+            src_list = ["_" + k + "_" for k in repl_list]
+            tgt_list = ["." + k + "." for k in repl_list]
+
+            for k, v in sd.items():
+                k = k.replace("lora_unet_blocks_", "diffusion_model.blocks.")
+                for s, t in zip(src_list, tgt_list):
+                    k = k.replace(s, t)
+                k = k.replace("lora_up", "lora_B")
+                k = k.replace("lora_down", "lora_A")
+                # alphas are set aside; they are rescaled below once the weight shapes are known
+                if "alpha" in k:
+                    alphas[k] = v
+                else:
+                    new_sd[k] = v
+
+            new_alphas = {}
+            for k, v in new_sd.items():
+                if "lora_B" in k:
+                    dim = v.shape[1]
+                elif "lora_A" in k:
+                    dim = v.shape[0]
+                else:
+                    continue
+                alpha_key = k[:-len("lora_X.weight")] + "alpha"
+                if alpha_key in alphas:
+                    scale = alphas[alpha_key] / dim
+                    new_alphas[alpha_key] = scale
+                else:
+                    print(f"Lora alpha'{alpha_key}' is missing")
+            new_sd.update(new_alphas)
+            sd = new_sd
+
+        if "text2video" in model_filename:
+            # convert loras for i2v to t2v
+            img_layers = ("cross_attn.k_img", "cross_attn.v_img")
+            sd = {k: v for k, v in sd.items() if not any(layer in k for layer in img_layers)}
+
+        return sd
r"""
Wan diffusion backbone supporting both text-to-video and image-to-video.
"""
diff --git a/wan/modules/vae.py b/wan/modules/vae.py
index 43e5a577369c04dcf918e06d32a386efeef6e1cf..3c5f34529bb91c55cf5f644e1a4ea98656be3918 100644
--- a/wan/modules/vae.py
+++ b/wan/modules/vae.py
@@ -784,7 +784,32 @@ class WanVAE:
pretrained_path=vae_pth,
z_dim=z_dim,
).to(dtype).eval() #.requires_grad_(False).to(device)
-
+ self.model._model_dtype = dtype
+
+    @staticmethod
+    def get_VAE_tile_size(vae_config, device_mem_capacity, mixed_precision):
+        """Return the VAE tile size (0, 256 or 128) for the given configuration.
+
+        vae_config 0 picks a profile from device_mem_capacity (halved first when
+        mixed_precision is set); any other value is used as the profile directly.
+        Profile 1 -> 0, profile 2 -> 256, anything else -> 128.
+        """
+        profile = vae_config
+        if profile == 0:
+            # automatic selection driven by the memory budget
+            budget = device_mem_capacity / 2 if mixed_precision else device_mem_capacity
+            if budget >= 24000:
+                profile = 1
+            elif budget >= 8000:
+                profile = 2
+            else:
+                profile = 3
+        if profile == 1:
+            return 0
+        if profile == 2:
+            return 256
+        return 128
+
def encode(self, videos, tile_size = 256, any_end_frame = False):
"""
videos: A list of videos each with shape [C, T, H, W].
diff --git a/wan/text2video.py b/wan/text2video.py
index 7fd7935a329a41503098df951c16de372966f92c..b2c30aa809807f2a03bd39af93822eb4aa68ebf9 100644
--- a/wan/text2video.py
+++ b/wan/text2video.py
@@ -40,7 +40,7 @@ def optimized_scale(positive_flat, negative_flat):
st_star = dot_product / squared_norm
return st_star
-
+
class WanT2V:
@@ -77,20 +77,21 @@ class WanT2V:
self.vae = WanVAE(
vae_pth=os.path.join(checkpoint_dir, config.vae_checkpoint), dtype= VAE_dtype,
device=self.device)
-
+
logging.info(f"Creating WanModel from {model_filename[-1]}")
from mmgp import offload
- # model_filename
-
- self.model = offload.fast_load_transformers_model(model_filename, modelClass=WanModel,do_quantize= quantizeTransformer, writable_tensors= False ) #, forcedConfigPath= "e:/vace_config.json")
+ # model_filename = "c:/temp/vace/diffusion_pytorch_model-00001-of-00007.safetensors"
+ # model_filename = "vace14B_quanto_bf16_int8.safetensors"
+ self.model = offload.fast_load_transformers_model(model_filename, modelClass=WanModel,do_quantize= quantizeTransformer, writable_tensors= False) # , forcedConfigPath= "c:/temp/vace/vace_config.json")
# offload.load_model_data(self.model, "e:/vace.safetensors")
# offload.load_model_data(self.model, "c:/temp/Phantom-Wan-1.3B.pth")
# self.model.to(torch.bfloat16)
# self.model.cpu()
self.model.lock_layers_dtypes(torch.float32 if mixed_precision_transformer else dtype)
+ # dtype = torch.bfloat16
offload.change_dtype(self.model, dtype, True)
- # offload.save_model(self.model, "mvace.safetensors", config_file_path="e:/vace_config.json")
- # offload.save_model(self.model, "phantom_1.3B.safetensors")
+ # offload.save_model(self.model, "vace14B_bf16.safetensors", config_file_path="c:/temp/vace/vace_config.json")
+ # offload.save_model(self.model, "vace14B_quanto_fp16_int8.safetensors", do_quantize= True, config_file_path="c:/temp/vace/vace_config.json")
self.model.eval().requires_grad_(False)
@@ -274,10 +275,11 @@ class WanT2V:
input_frames= None,
input_masks = None,
input_ref_images = None,
- source_video=None,
+ input_video=None,
target_camera=None,
context_scale=1.0,
- size=(1280, 720),
+ width = 1280,
+ height = 720,
fit_into_canvas = True,
frame_num=81,
shift=5.0,
@@ -298,7 +300,8 @@ class WanT2V:
cfg_zero_step = 5,
overlapped_latents = 0,
overlap_noise = 0,
- vace = False
+ model_filename = None,
+ **bbargs
):
r"""
Generates video frames from text prompt using diffusion process.
@@ -334,6 +337,7 @@ class WanT2V:
- W: Frame width from size)
"""
# preprocess
+ vace = "Vace" in model_filename
if n_prompt == "":
n_prompt = self.sample_neg_prompt
@@ -351,11 +355,12 @@ class WanT2V:
phantom = False
if target_camera != None:
- size = (source_video.shape[2], source_video.shape[1])
- source_video = source_video.to(dtype=self.dtype , device=self.device)
- source_video = source_video.permute(3, 0, 1, 2).div_(127.5).sub_(1.)
- source_latents = self.vae.encode([source_video])[0] #.to(dtype=self.dtype, device=self.device)
- del source_video
+ width = input_video.shape[2]
+ height = input_video.shape[1]
+ input_video = input_video.to(dtype=self.dtype , device=self.device)
+ input_video = input_video.permute(3, 0, 1, 2).div_(127.5).sub_(1.)
+ source_latents = self.vae.encode([input_video])[0] #.to(dtype=self.dtype, device=self.device)
+ del input_video
# Process target camera (recammaster)
from wan.utils.cammmaster_tools import get_camera_embedding
cam_emb = get_camera_embedding(target_camera)
@@ -380,8 +385,8 @@ class WanT2V:
input_ref_images_neg = torch.zeros_like(input_ref_images)
F = frame_num
target_shape = (self.vae.model.z_dim, (F - 1) // self.vae_stride[0] + 1 + (input_ref_images.shape[1] if input_ref_images != None else 0),
- size[1] // self.vae_stride[1],
- size[0] // self.vae_stride[2])
+ height // self.vae_stride[1],
+ width // self.vae_stride[2])
seq_len = math.ceil((target_shape[2] * target_shape[3]) /
(self.patch_size[1] * self.patch_size[2]) *
@@ -560,5 +565,5 @@ class WanT2V:
target = modules_dict[f"blocks.{model_layer}"]
setattr(target, "vace", module )
delattr(model, "vace_blocks")
-
+
\ No newline at end of file
diff --git a/wan/utils/utils.py b/wan/utils/utils.py
index 8cedff5d23ebd48532330f59cfa8381c922a0607..e5c471cdd8fde9bf98acf716cf30c84e2cd79aad 100644
--- a/wan/utils/utils.py
+++ b/wan/utils/utils.py
@@ -13,7 +13,7 @@ import torchvision
from PIL import Image
import numpy as np
from rembg import remove, new_session
-
+import random
__all__ = ['cache_video', 'cache_image', 'str2bool']
@@ -21,10 +21,21 @@ __all__ = ['cache_video', 'cache_image', 'str2bool']
from PIL import Image
-
+def seed_everything(seed: int):
+    """Seed Python's random, NumPy and torch, plus the CUDA / MPS generators when available."""
+    for seeder in (random.seed, np.random.seed, torch.manual_seed):
+        seeder(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    if torch.backends.mps.is_available():
+        torch.mps.manual_seed(seed)
+
def resample(video_fps, video_frames_count, max_target_frames_count, target_fps, start_target_frame ):
import math
+ if video_fps < target_fps :
+ video_fps = target_fps
+
video_frame_duration = 1 /video_fps
target_frame_duration = 1 / target_fps
@@ -67,7 +78,7 @@ def remove_background(img, session=None):
return torch.from_numpy(np.array(img).astype(np.float32) / 255.0).movedim(-1, 0)
-def calculate_new_dimensions(canvas_height, canvas_width, height, width, fit_into_canvas):
+def calculate_new_dimensions(canvas_height, canvas_width, height, width, fit_into_canvas, block_size = 16):
if fit_into_canvas:
scale1 = min(canvas_height / height, canvas_width / width)
scale2 = min(canvas_width / height, canvas_height / width)
@@ -75,8 +86,8 @@ def calculate_new_dimensions(canvas_height, canvas_width, height, width, fit_int
else:
scale = (canvas_height * canvas_width / (height * width))**(1/2)
- new_height = round( height * scale / 16) * 16
- new_width = round( width * scale / 16) * 16
+ new_height = round( height * scale / block_size) * block_size
+ new_width = round( width * scale / block_size) * block_size
return new_height, new_width
def resize_and_remove_background(img_list, budget_width, budget_height, rm_background, fit_into_canvas = False ):
diff --git a/wgp.py b/wgp.py
index c02161b4b39edb5dee433d6b5f66882dc0ffb6fe..c42be416f22c68d87f070092865f6945a763cbf9 100644
--- a/wgp.py
+++ b/wgp.py
@@ -33,6 +33,8 @@ import tempfile
import atexit
import shutil
import glob
+from transformers.utils import logging
+logging.set_verbosity_error()  # must be called: a bare attribute reference leaves the verbosity unchanged
from tqdm import tqdm
import requests
@@ -40,7 +42,9 @@ global_queue_ref = []
AUTOSAVE_FILENAME = "queue.zip"
PROMPT_VARS_MAX = 10
-target_mmgp_version = "3.4.4"
+target_mmgp_version = "3.4.5"
+prompt_enhancer_image_caption_model, prompt_enhancer_image_caption_processor, prompt_enhancer_llm_model, prompt_enhancer_llm_tokenizer = None, None, None, None
+
from importlib.metadata import version
mmgp_version = version("mmgp")
if mmgp_version != target_mmgp_version:
@@ -179,20 +183,30 @@ def process_prompt_and_add_tasks(state, model_choice):
gr.Info("You must use the 14B model to generate videos with a resolution equivalent to 720P")
return
- if "diffusion_forcing" in model_filename or "Vace" in model_filename:
+ if "diffusion_forcing" in model_filename or "ltxv" in model_filename or "Vace" in model_filename:
video_length = inputs["video_length"]
sliding_window_size = inputs["sliding_window_size"]
if video_length > sliding_window_size:
gr.Info(f"The Number of Frames to generate ({video_length}) is greater than the Sliding Window Size ({sliding_window_size}) , multiple Windows will be generated")
- if "phantom" in model_filename:
+ if "phantom" in model_filename or "hunyuan_video_custom" in model_filename:
image_refs = inputs["image_refs"]
+ if image_refs == None :
+ gr.Info("You must provide an Image Reference")
+ return
+ if len(image_refs) > 1 and "hunyuan_video_custom" in model_filename:
+ gr.Info("Only one Image Reference (a person) is supported for the moment by Hunyuan Custom")
+ return
+ if any(isinstance(image[0], str) for image in image_refs) :
+ gr.Info("Reference Image should be an Image")
+ return
+
if isinstance(image_refs, list):
image_refs = [ convert_image(tup[0]) for tup in image_refs ]
- os.environ["U2NET_HOME"] = os.path.join(os.getcwd(), "ckpts", "rembg")
- from wan.utils.utils import resize_and_remove_background
- image_refs = resize_and_remove_background(image_refs, width, height, inputs["remove_background_image_ref"] ==1, fit_into_canvas= True)
+ # os.environ["U2NET_HOME"] = os.path.join(os.getcwd(), "ckpts", "rembg")
+ # from wan.utils.utils import resize_and_remove_background
+ # image_refs = resize_and_remove_background(image_refs, width, height, inputs["remove_background_image_ref"] ==1, fit_into_canvas= True)
if len(prompts) > 0:
@@ -206,8 +220,9 @@ def process_prompt_and_add_tasks(state, model_choice):
inputs.update(extra_inputs)
add_video_task(**inputs)
- elif "diffusion_forcing" in model_filename:
+ elif "diffusion_forcing" in model_filename or "ltxv" in model_filename:
image_start = inputs["image_start"]
+ image_end = inputs["image_end"]
video_source = inputs["video_source"]
keep_frames_video_source = inputs["keep_frames_video_source"]
image_prompt_type = inputs["image_prompt_type"]
@@ -237,8 +252,23 @@ def process_prompt_and_add_tasks(state, model_choice):
image_start = [ convert_image(tup[0]) for tup in image_start ]
video_source = None
+ if "E" in image_prompt_type:
+ if image_end == None :
+ gr.Info("You must provide an End Image")
+ return
+ if len(image_end) > 1:
+ gr.Info("Only one End Image is supported for the moment")
+ return
+ if isinstance(image_end[0][0], str) :
+ gr.Info("End Image should be an Image")
+ return
+
+ image_end = [ convert_image(tup[0]) for tup in image_end ]
+ video_source = None
+
if "T" in image_prompt_type:
image_start = None
+ image_end = None
video_source = None
if len(prompts) > 0:
@@ -248,6 +278,7 @@ def process_prompt_and_add_tasks(state, model_choice):
extra_inputs = {
"prompt" : single_prompt,
"image_start" : image_start,
+ "image_end" : image_end,
"video_source" : video_source,
}
inputs.update(extra_inputs)
@@ -318,9 +349,9 @@ def process_prompt_and_add_tasks(state, model_choice):
if isinstance(image_refs, list):
image_refs = [ convert_image(tup[0]) for tup in image_refs ]
- os.environ["U2NET_HOME"] = os.path.join(os.getcwd(), "ckpts", "rembg")
- from wan.utils.utils import resize_and_remove_background
- image_refs = resize_and_remove_background(image_refs, width, height, inputs["remove_background_image_ref"] ==1)
+ # os.environ["U2NET_HOME"] = os.path.join(os.getcwd(), "ckpts", "rembg")
+ # from wan.utils.utils import resize_and_remove_background
+ # image_refs = resize_and_remove_background(image_refs, width, height, inputs["remove_background_image_ref"] ==1)
if len(prompts) > 0:
@@ -465,6 +496,14 @@ def add_video_task(**inputs):
})
return update_queue_data(queue)
+def update_task_thumbnails(task, inputs):
+    """Store base64 JPEG encodings of the start / end preview images of `inputs` on `task`."""
+    def encode(images):  # None stays None, otherwise one encoded entry per image
+        return None if images is None else [pil_to_base64_uri(img, format="jpeg", quality=70) for img in images]
+    start_image_data, end_image_data = get_preview_images(inputs)
+    task["start_image_data_base64"] = encode(start_image_data)
+    task["end_image_data_base64"] = encode(end_image_data)
+
def move_up(queue, selected_indices):
if not selected_indices or len(selected_indices) == 0:
return update_queue_data(queue)
@@ -1128,7 +1167,7 @@ def _parse_args():
"--lora-dir-i2v",
type=str,
default="",
- help="Path to a directory that contains Loras for i2v"
+ help="Path to a directory that contains Wan i2v Loras "
)
@@ -1136,9 +1175,33 @@ def _parse_args():
"--lora-dir",
type=str,
default="",
- help="Path to a directory that contains Loras"
+ help="Path to a directory that contains Wan t2v Loras"
+ )
+
+ parser.add_argument(
+ "--lora-dir-hunyuan",
+ type=str,
+ default="loras_hunyuan",
+ help="Path to a directory that contains Hunyuan Video t2v Loras"
+ )
+
+ parser.add_argument(
+ "--lora-dir-hunyuan-i2v",
+ type=str,
+ default="loras_hunyuan_i2v",
+ help="Path to a directory that contains Hunyuan Video i2v Loras"
)
+
+ parser.add_argument(
+ "--lora-dir-ltxv",
+ type=str,
+ default="loras_ltxv",
+ help="Path to a directory that contains LTX Videos Loras"
+ )
+
+
+
parser.add_argument(
"--check-loras",
action="store_true",
@@ -1215,6 +1278,12 @@ def _parse_args():
help="For using fp16 transformer model"
)
+ parser.add_argument(
+ "--bf16",
+ action="store_true",
+ help="For using bf16 transformer model"
+ )
+
parser.add_argument(
"--server-port",
type=str,
@@ -1342,29 +1411,47 @@ def _parse_args():
return args
def get_lora_dir(model_filename):
- lora_dir =args.lora_dir
+    """Return the Lora directory of a model: for "wan", args.lora_dir (or args.lora_dir_i2v for i2v models) when set, else "loras" / "loras_i2v" or their existing "1.3B" / "14B" subfolder; for "ltxv" and "hunyuan", the matching --lora-dir-* argument."""
+ model_family = get_model_family(model_filename)
i2v = test_class_i2v(model_filename)
- if i2v and len(lora_dir)==0:
- lora_dir =args.lora_dir_i2v
- if len(lora_dir) > 0:
- return lora_dir
-
- root_lora_dir = "loras_i2v" if i2v else "loras"
-
- if "1.3B" in model_filename :
- lora_dir_1_3B = os.path.join(root_lora_dir, "1.3B")
- if os.path.isdir(lora_dir_1_3B ):
- return lora_dir_1_3B
+ if model_family == "wan":
+ lora_dir =args.lora_dir
+ if i2v and len(lora_dir)==0:
+ lora_dir =args.lora_dir_i2v
+ if len(lora_dir) > 0:
+ return lora_dir
+ root_lora_dir = "loras_i2v" if i2v else "loras"
+
+ if "1.3B" in model_filename :
+ lora_dir_1_3B = os.path.join(root_lora_dir, "1.3B")
+ if os.path.isdir(lora_dir_1_3B ):
+ return lora_dir_1_3B
+ else:
+ lora_dir_14B = os.path.join(root_lora_dir, "14B")
+ if os.path.isdir(lora_dir_14B ):
+ return lora_dir_14B
+ return root_lora_dir
+ elif model_family == "ltxv":
+ return args.lora_dir_ltxv
+ elif model_family =="hunyuan":
+ if i2v:
+ return args.lora_dir_hunyuan_i2v
+ else:
+ return args.lora_dir_hunyuan
else:
- lora_dir_14B = os.path.join(root_lora_dir, "14B")
- if os.path.isdir(lora_dir_14B ):
- return lora_dir_14B
- return root_lora_dir
-
+ raise Exception("loras unknown")
attention_modes_installed = get_attention_modes()
attention_modes_supported = get_supported_attention_modes()
args = _parse_args()
+
+# Compute capability of the GPU given by args.gpu (None is passed when no GPU was specified).
+major, minor = torch.cuda.get_device_capability(args.gpu if len(args.gpu) > 0 else None)
+bfloat16_supported = major >= 8
+if not bfloat16_supported:
+    print("Switching to FP16 models when possible as GPU architecture doesn't support optimed BF16 Kernels")
+
args.flow_reverse = True
processing_device = args.gpu
if len(processing_device) == 0:
@@ -1381,7 +1468,6 @@ quantizeTransformer = args.quantize_transformer
check_loras = args.check_loras ==1
advanced = args.advanced
-text_encoder_choices = ["ckpts/models_t5_umt5-xxl-enc-bf16.safetensors", "ckpts/models_t5_umt5-xxl-enc-quanto_int8.safetensors"]
server_config_filename = "wgp_config.json"
if not os.path.isdir("settings"):
os.mkdir("settings")
@@ -1393,11 +1479,19 @@ if os.path.isfile("t2v_settings.json"):
if not os.path.isfile(server_config_filename) and os.path.isfile("gradio_config.json"):
shutil.move("gradio_config.json", server_config_filename)
+# Relocate checkpoints stored at the root of "ckpts" into their per-model sub folders.
+src_move = [ "ckpts/models_clip_open-clip-xlm-roberta-large-vit-huge-14-bf16.safetensors", "ckpts/models_t5_umt5-xxl-enc-bf16.safetensors", "ckpts/models_t5_umt5-xxl-enc-quanto_int8.safetensors" ]
+tgt_move = [ "ckpts/xlm-roberta-large/", "ckpts/umt5-xxl/", "ckpts/umt5-xxl/"]
+for src,tgt in zip(src_move,tgt_move):
+    if os.path.isfile(src):
+        # the target folder may not exist yet; shutil.move only moves *into* an existing directory
+        os.makedirs(tgt, exist_ok=True)
+        shutil.move(src, tgt)
+
+
+
if not Path(server_config_filename).is_file():
server_config = {"attention_mode" : "auto",
"transformer_types": [],
"transformer_quantization": "int8",
- "text_encoder_filename" : text_encoder_choices[1],
+ "text_encoder_quantization" : "int8",
"save_path": "outputs", #os.path.join(os.getcwd(),
"compile" : "",
"metadata_type": "metadata",
@@ -1426,26 +1520,31 @@ for path in ["wan2.1_Vace_1.3B_preview_bf16.safetensors", "sky_reels2_diffusion
os.remove( os.path.join("ckpts" , path))
-transformer_choices_t2v=["ckpts/wan2.1_text2video_1.3B_bf16.safetensors", "ckpts/wan2.1_text2video_14B_bf16.safetensors", "ckpts/wan2.1_text2video_14B_quanto_int8.safetensors", "ckpts/wan2.1_Vace_1.3B_preview_mbf16.safetensors",
+wan_choices_t2v=["ckpts/wan2.1_text2video_1.3B_bf16.safetensors", "ckpts/wan2.1_text2video_14B_bf16.safetensors", "ckpts/wan2.1_text2video_14B_quanto_int8.safetensors", "ckpts/wan2.1_Vace_1.3B_preview_mbf16.safetensors",
"ckpts/wan2.1_recammaster_1.3B_bf16.safetensors", "ckpts/sky_reels2_diffusion_forcing_1.3B_mbf16.safetensors", "ckpts/sky_reels2_diffusion_forcing_14B_bf16.safetensors",
"ckpts/sky_reels2_diffusion_forcing_14B_quanto_int8.safetensors", "ckpts/sky_reels2_diffusion_forcing_720p_14B_mbf16.safetensors","ckpts/sky_reels2_diffusion_forcing_720p_14B_quanto_mbf16_int8.safetensors",
- "ckpts/wan2_1_phantom_1.3B_mbf16.safetensors"]
-transformer_choices_i2v=["ckpts/wan2.1_image2video_480p_14B_mbf16.safetensors", "ckpts/wan2.1_image2video_480p_14B_quanto_mbf16_int8.safetensors", "ckpts/wan2.1_image2video_720p_14B_mbf16.safetensors",
+ "ckpts/wan2_1_phantom_1.3B_mbf16.safetensors", "ckpts/wan2.1_Vace_14B_mbf16.safetensors", "ckpts/wan2.1_Vace_14B_quanto_mbf16_int8.safetensors"]
+wan_choices_i2v=["ckpts/wan2.1_image2video_480p_14B_mbf16.safetensors", "ckpts/wan2.1_image2video_480p_14B_quanto_mbf16_int8.safetensors", "ckpts/wan2.1_image2video_720p_14B_mbf16.safetensors",
"ckpts/wan2.1_image2video_720p_14B_quanto_mbf16_int8.safetensors", "ckpts/wan2.1_Fun_InP_1.3B_bf16.safetensors", "ckpts/wan2.1_Fun_InP_14B_bf16.safetensors",
"ckpts/wan2.1_Fun_InP_14B_quanto_int8.safetensors", "ckpts/wan2.1_FLF2V_720p_14B_bf16.safetensors", "ckpts/wan2.1_FLF2V_720p_14B_quanto_int8.safetensors",
"ckpts/wan2.1_fantasy_speaking_14B_bf16.safetensors"]
-transformer_choices = transformer_choices_t2v + transformer_choices_i2v
-def get_dependent_models(model_filename, quantization ):
+ltxv_choices= ["ckpts/ltxv_0.9.7_13B_dev_bf16.safetensors", "ckpts/ltxv_0.9.7_13B_dev_quanto_bf16_int8.safetensors"]
+
+hunyuan_choices= ["ckpts/hunyuan_video_720_bf16.safetensors", "ckpts/hunyuan_video_720_quanto_int8.safetensors", "ckpts/hunyuan_video_i2v_720_bf16.safetensors", "ckpts/hunyuan_video_i2v_720_quanto_int8v2.safetensors",
+ "ckpts/hunyuan_video_custom_720_bf16.safetensors", "ckpts/hunyuan_video_custom_720_quanto_bf16_int8.safetensors" ]
+
+transformer_choices = wan_choices_t2v + wan_choices_i2v + ltxv_choices + hunyuan_choices
+def get_dependent_models(model_filename, quantization, dtype_policy ):
+    """Return the extra model files a model needs: the "i2v_720p" file when the name contains "fantasy", otherwise an empty list."""
if "fantasy" in model_filename:
- return [get_model_filename("i2v_720p", quantization)]
+ return [get_model_filename("i2v_720p", quantization, dtype_policy)]
else:
return []
-model_types = [ "t2v_1.3B", "vace_1.3B", "fun_inp_1.3B", "t2v", "i2v", "i2v_720p", "fun_inp", "recam_1.3B", "flf2v_720p", "sky_df_1.3B", "sky_df_14B", "sky_df_720p_14B", "phantom_1.3B", "fantasy"]
+model_types = [ "t2v_1.3B", "vace_1.3B", "fun_inp_1.3B", "t2v", "i2v", "i2v_720p", "vace_14B", "fun_inp", "recam_1.3B", "flf2v_720p", "sky_df_1.3B", "sky_df_14B", "sky_df_720p_14B", "phantom_1.3B", "fantasy", "ltxv_13B", "hunyuan", "hunyuan_i2v", "hunyuan_custom"]
model_signatures = {"t2v": "text2video_14B", "t2v_1.3B" : "text2video_1.3B", "fun_inp_1.3B" : "Fun_InP_1.3B", "fun_inp" : "Fun_InP_14B",
- "i2v" : "image2video_480p", "i2v_720p" : "image2video_720p" , "vace_1.3B" : "Vace_1.3B", "recam_1.3B": "recammaster_1.3B",
+ "i2v" : "image2video_480p", "i2v_720p" : "image2video_720p" , "vace_1.3B" : "Vace_1.3B", "vace_14B" : "Vace_14B","recam_1.3B": "recammaster_1.3B",
"flf2v_720p" : "FLF2V_720p", "sky_df_1.3B" : "sky_reels2_diffusion_forcing_1.3B", "sky_df_14B" : "sky_reels2_diffusion_forcing_14B",
"sky_df_720p_14B" : "sky_reels2_diffusion_forcing_720p_14B",
- "phantom_1.3B" : "phantom_1.3B", "fantasy" : "fantasy" }
+ "phantom_1.3B" : "phantom_1.3B", "fantasy" : "fantasy", "ltxv_13B" : "ltxv_0.9.7_13B", "hunyuan" : "hunyuan_video_720", "hunyuan_i2v" : "hunyuan_video_i2v_720", "hunyuan_custom" : "hunyuan_video_custom" }
def get_model_type(model_filename):
@@ -1454,8 +1553,18 @@ def get_model_type(model_filename):
return model_type
raise Exception("Unknown model:" + model_filename)
+def get_model_family(model_filename):
+    """Return "wan", "ltxv" or "hunyuan" according to the markers found in the model file name."""
+    # order matters: the first family with a marker present in the name wins
+    families = (("wan", ("wan", "sky")), ("ltxv", ("ltxv",)), ("hunyuan", ("hunyuan",)))
+    for family, markers in families:
+        if any(marker in model_filename for marker in markers):
+            return family
+
+    raise Exception(f"Unknown model family for model'{model_filename}'")
+
def test_class_i2v(model_filename):
+    """Return True when the file name contains one of the markers treated here as image-to-video (i2v) models."""
- return "image2video" in model_filename or "Fun_InP" in model_filename or "FLF2V" in model_filename or "fantasy" in model_filename
+ return "image2video" in model_filename or "Fun_InP" in model_filename or "FLF2V" in model_filename or "fantasy" in model_filename or "hunyuan_video_i2v" in model_filename
def get_model_name(model_filename, description_container = [""]):
if "Fun" in model_filename:
@@ -1497,6 +1606,19 @@ def get_model_name(model_filename, description_container = [""]):
model_name = "Wan2.1 Fantasy Speaking 720p"
model_name += " 14B" if "14B" in model_filename else " 1.3B"
description = "The Fantasy Speaking model corresponds to the original Wan image 2 video model combined with the Fantasy Speaking extension to process an audio Input."
+ elif "ltxv" in model_filename:
+ model_name = "LTX Video"
+ model_name += " 0.9.7 13B" if "13B" in model_filename else " 0.9.6 2B"
+ description = "LTX Video is a fast model that can be used to generate long videos (up to 260 frames).It is recommended to keep the number of steps to 30 or you will need to update the file 'ltxv_video/configs/ltxv-13b-0.9.7-dev.yaml'.The LTX Video model expects very long prompt, so don't hesitate to use the Prompt Enhancer."
+ elif "hunyuan_video_720" in model_filename:
+ model_name = "Hunyuan Video text2video 720p"
+ description = "Probably the best text 2 video model available."
+ elif "hunyuan_video_i2v" in model_filename:
+ model_name = "Hunyuan Video image2video 720p"
+ description = "A good looking image 2 video model, but not so good in prompt adherence."
+ elif "hunyuan_video_custom" in model_filename:
+ model_name = "Hunyuan Video Custom 720p"
+ description = "The Hunyuan Video Custom model is proably the best model to transfer people (only people for the momment) as it is quite good to keep their identity. However it is slow as to get good results, you to generate 720p videos with 30 steps."
else:
model_name = "Wan2.1 text2video"
model_name += " 14B" if "14B" in model_filename else " 1.3B"
@@ -1505,29 +1627,50 @@ def get_model_name(model_filename, description_container = [""]):
return model_name
-def get_model_filename(model_type, quantization):
+def get_model_filename(model_type, quantization ="int8", dtype_policy = ""):
+    """Return the checkpoint file name for model_type: the transformer_choices entry matching its signature, preferring the requested quantization and then the dtype given by get_transformer_dtype(); float16 "wan" models get the fp16 variant of a quantized name."""
signature = model_signatures[model_type]
-
choices = [ name for name in transformer_choices if signature in name]
if len(quantization) == 0:
quantization = "bf16"
+ model_family = get_model_family(choices[0])
+ dtype = get_transformer_dtype(model_family, dtype_policy)
if len(choices) <= 1:
raw_filename = choices[0]
else:
sub_choices = [ name for name in choices if quantization in name]
if len(sub_choices) > 0:
+ dtype_str = "fp16" if dtype == torch.float16 else "bf16"
+ new_sub_choices = [ name for name in sub_choices if dtype_str in name]
+ sub_choices = new_sub_choices if len(new_sub_choices) > 0 else sub_choices
raw_filename = sub_choices[0]
else:
raw_filename = choices[0]
-
- if transformer_dtype == torch.float16 :
+    # float16 "wan" model whose name is not already fp16: swap the quantized name for its fp16 counterpart
+ if dtype == torch.float16 and not "fp16" in raw_filename and model_family == "wan" :
if "quanto_int8" in raw_filename:
raw_filename = raw_filename.replace("quanto_int8", "quanto_fp16_int8")
- elif "quanto_mbf16_int8":
+ elif "quanto_bf16_int8" in raw_filename:
+ raw_filename = raw_filename.replace("quanto_bf16_int8", "quanto_fp16_int8")
+ elif "quanto_mbf16_int8" in raw_filename:
raw_filename= raw_filename.replace("quanto_mbf16_int8", "quanto_mfp16_int8")
return raw_filename
+def get_transformer_dtype(model_family, transformer_dtype_policy):
+    """Return the torch dtype to use for the transformer weights.
+
+    An empty policy selects automatically: bfloat16 when the module level flag
+    bfloat16_supported is true, float16 otherwise. The policy "fp16" forces
+    float16 and any other non empty policy gives bfloat16. model_family is kept
+    in the signature for callers but does not influence the result.
+    """
+    if len(transformer_dtype_policy) == 0:
+        # automatic selection depends only on hardware support
+        return torch.bfloat16 if bfloat16_supported else torch.float16
+    if transformer_dtype_policy == "fp16":
+        return torch.float16
+    return torch.bfloat16
+
def get_settings_file_name(model_filename):
return os.path.join(args.settings, get_model_type(model_filename) + "_settings.json")
@@ -1549,6 +1692,7 @@ def get_default_settings(filename):
"repeat_generation": 1,
"multi_images_gen_type": 0,
"guidance_scale": 5.0,
+ "embedded_guidance_scale" : 6.0,
"audio_guidance_scale": 5.0,
"flow_shift": get_default_flow(filename, i2v),
"negative_prompt": "",
@@ -1563,6 +1707,11 @@ def get_default_settings(filename):
"slg_end_perc": 90
}
+ if get_model_type(filename) in ("hunyuan","hunyuan_i2v"):
+ ui_defaults.update({
+ "guidance_scale": 7.0,
+ })
+
if get_model_type(filename) in ("sky_df_1.3B", "sky_df_14B", "sky_df_720p_14B"):
ui_defaults.update({
"guidance_scale": 6.0,
@@ -1583,6 +1732,13 @@ def get_default_settings(filename):
"resolution": "1280x720"
})
+    elif get_model_type(filename) in ("hunyuan_custom",):
+        # Compare against the model type key ("hunyuan_custom"), not the file signature, and use a
+        # real one element tuple: `in` on a parenthesised string would be a substring test.
+        ui_defaults.update({
+            "guidance_scale": 7.5,
+            "flow_shift": 13,
+            "resolution": "1280x720"
+        })
+
with open(defaults_filename, "w", encoding="utf-8") as f:
@@ -1608,18 +1764,18 @@ def get_default_settings(filename):
ui_defaults["num_inference_steps"] = default_number_steps
return ui_defaults
-major, minor = torch.cuda.get_device_capability(args.gpu if len(args.gpu) > 0 else None)
-if major < 8:
- print("Switching to f16 models as GPU architecture doesn't support bf16")
- transformer_dtype = torch.float16
-else:
- transformer_dtype = torch.float16 if args.fp16 else torch.bfloat16
-
transformer_types = server_config.get("transformer_types", [])
transformer_type = transformer_types[0] if len(transformer_types) > 0 else model_types[0]
+
transformer_quantization =server_config.get("transformer_quantization", "int8")
-transformer_filename = get_model_filename(transformer_type, transformer_quantization)
-text_encoder_filename = server_config["text_encoder_filename"]
+
+transformer_dtype_policy = server_config.get("transformer_dtype_policy", "")
+if args.fp16:
+ transformer_dtype_policy = "fp16"
+if args.bf16:
+ transformer_dtype_policy = "bf16"
+transformer_filename = get_model_filename(transformer_type, transformer_quantization, transformer_dtype_policy)
+text_encoder_quantization =server_config.get("text_encoder_quantization", "int8")
attention_mode = server_config["attention_mode"]
if len(args.attention)> 0:
if args.attention in ["auto", "sdpa", "sage", "sage2", "flash", "xformers"]:
@@ -1642,19 +1798,19 @@ preload_model_policy = server_config.get("preload_model_policy", [])
if args.t2v_14B or args.t2v:
- transformer_filename = get_model_filename("t2v", transformer_quantization)
+ transformer_filename = get_model_filename("t2v", transformer_quantization, transformer_dtype_policy)
if args.i2v_14B or args.i2v:
- transformer_filename = get_model_filename("i2v", transformer_quantization)
+ transformer_filename = get_model_filename("i2v", transformer_quantization, transformer_dtype_policy)
if args.t2v_1_3B:
- transformer_filename = get_model_filename("t2v_1.3B", transformer_quantization)
+ transformer_filename = get_model_filename("t2v_1.3B", transformer_quantization, transformer_dtype_policy)
if args.i2v_1_3B:
- transformer_filename = get_model_filename("fun_inp_1.3B", transformer_quantization)
+ transformer_filename = get_model_filename("fun_inp_1.3B", transformer_quantization, transformer_dtype_policy)
if args.vace_1_3B:
- transformer_filename = get_model_filename("vace_1.3B", transformer_quantization)
+ transformer_filename = get_model_filename("vace_1.3B", transformer_quantization, transformer_dtype_policy)
only_allow_edit_in_advanced = False
lora_preselected_preset = args.lora_preset
@@ -1672,103 +1828,122 @@ model_filename = ""
#attention_mode="xformers"
# compile = "transformer"
-def preprocess_loras(sd):
- if wan_model == None:
- return sd
- model_filename = wan_model._model_file_name
-
- first = next(iter(sd), None)
- if first == None:
- return sd
+def get_loras_preprocessor(transformer, model_filename):
+ """Return a one-argument LoRA state-dict preprocessor for `transformer`, or None.
+
+ The transformer's optional `preprocess_loras` attribute is looked up; when it is
+ absent None is returned, otherwise a wrapper is returned that calls it with
+ `model_filename` as first argument and the state dict as second.
+ """
+ preprocessor = getattr(transformer, "preprocess_loras", None)
+ if preprocessor == None:
+ return None
- if first.startswith("lora_unet_"):
- new_sd = {}
- print("Converting Lora Safetensors format to Lora Diffusers format")
- alphas = {}
- repl_list = ["cross_attn", "self_attn", "ffn"]
- src_list = ["_" + k + "_" for k in repl_list]
- tgt_list = ["." + k + "." for k in repl_list]
+ # Bind model_filename so that the returned callable only takes the state dict.
+ def preprocessor_wrapper(sd):
+ return preprocessor(model_filename, sd)
- for k,v in sd.items():
- k = k.replace("lora_unet_blocks_","diffusion_model.blocks.")
+ return preprocessor_wrapper
- for s,t in zip(src_list, tgt_list):
- k = k.replace(s,t)
-
- k = k.replace("lora_up","lora_B")
- k = k.replace("lora_down","lora_A")
-
- if "alpha" in k:
- alphas[k] = v
- else:
- new_sd[k] = v
-
- new_alphas = {}
- for k,v in new_sd.items():
- if "lora_B" in k:
- dim = v.shape[1]
- elif "lora_A" in k:
- dim = v.shape[0]
- else:
- continue
- alpha_key = k[:-len("lora_X.weight")] +"alpha"
- if alpha_key in alphas:
- scale = alphas[alpha_key] / dim
- new_alphas[alpha_key] = scale
- else:
- print(f"Lora alpha'{alpha_key}' is missing")
- new_sd.update(new_alphas)
- sd = new_sd
-
- if "text2video" in model_filename:
- new_sd = {}
- # convert loras for i2v to t2v
- for k,v in sd.items():
- if any(layer in k for layer in ["cross_attn.k_img", "cross_attn.v_img"]):
- continue
- new_sd[k] = v
- sd = new_sd
+
+def get_model_manager(model_family):
+ """Return the manager module for `model_family`.
+
+ Returns None for "wan", the `ltxv.model_def` module for "ltxv", and raises
+ Exception for any other family name.
+ """
+ if model_family == "wan":
+ return None
+ elif model_family == "ltxv":
+ # Imported lazily, only when the ltxv family is requested.
+ from ltxv import model_def
+ return model_def
+ else:
+ raise Exception("model family not supported")
+
+def get_wan_text_encoder_filename(text_encoder_quantization):
+ """Return the path of the Wan umt5-xxl text encoder checkpoint.
+
+ The bf16 file is the default; the "int8" setting selects the quanto_int8 file.
+ """
+ text_encoder_filename = "ckpts/umt5-xxl/models_t5_umt5-xxl-enc-bf16.safetensors"
+ if text_encoder_quantization =="int8":
+ # The two variants differ only by the "bf16" token of the file name.
+ text_encoder_filename = text_encoder_filename.replace("bf16", "quanto_int8")
+ return text_encoder_filename
+
+def get_ltxv_text_encoder_filename(text_encoder_quantization):
+ """Return the path of the LTX Video T5_xxl_1.1 text encoder checkpoint.
+
+ The bf16 file is the default; the "int8" setting selects the quanto_bf16_int8 file.
+ """
+ text_encoder_filename = "ckpts/T5_xxl_1.1/T5_xxl_1.1_enc_bf16.safetensors"
+ if text_encoder_quantization =="int8":
+ # The two variants differ only by the "bf16" token of the file name.
+ text_encoder_filename = text_encoder_filename.replace("bf16", "quanto_bf16_int8")
+ return text_encoder_filename
+
+def get_hunyuan_text_encoder_filename(text_encoder_quantization):
+ """Return the path of the Hunyuan llava-llama-3-8b text encoder checkpoint.
+
+ The "int8" setting selects the quanto_int8 file, any other value the fp16 file,
+ mirroring get_wan_text_encoder_filename / get_ltxv_text_encoder_filename.
+ (The two branches were previously swapped, so "int8" loaded the fp16 weights.)
+ """
+ if text_encoder_quantization =="int8":
+ text_encoder_filename = "ckpts/llava-llama-3-8b/llava-llama-3-8b-v1_1_vlm_quanto_int8.safetensors"
+ else:
+ text_encoder_filename = "ckpts/llava-llama-3-8b/llava-llama-3-8b-v1_1_vlm_fp16.safetensors"
- return sd
+ return text_encoder_filename
-def download_models(transformer_filename, text_encoder_filename):
+def download_models(transformer_filename):
+ """Download into "ckpts/" every file needed to run `transformer_filename`.
+
+ Three groups of files are fetched from the Hugging Face hub, skipping whatever
+ already exists locally: the files shared by all models, the prompt enhancer
+ files (only when server_config "enhancer_enabled" is 1) and the files of the
+ model family returned by get_model_family(transformer_filename).
+ Each group is described by a repo id, a list of source folders and, for each
+ folder, a list of files (an empty list means "snapshot the whole folder").
+ """
def computeList(filename):
pos = filename.rfind("/")
filename = filename[pos+1:]
return [filename]
-
+
+ def process_files_def(repoId, sourceFolderList, fileList):
+ # sourceFolderList and fileList are walked in lockstep: both must have the same length.
+ targetRoot = "ckpts/"
+ for sourceFolder, files in zip(sourceFolderList,fileList ):
+ if len(files)==0:
+ if not Path(targetRoot + sourceFolder).exists():
+ snapshot_download(repo_id=repoId, allow_patterns=sourceFolder +"/*", local_dir= targetRoot)
+ else:
+ for onefile in files:
+ if len(sourceFolder) > 0:
+ if not os.path.isfile(targetRoot + sourceFolder + "/" + onefile ):
+ hf_hub_download(repo_id=repoId, filename=onefile, local_dir = targetRoot, subfolder=sourceFolder)
+ else:
+ if not os.path.isfile(targetRoot + onefile ):
+ hf_hub_download(repo_id=repoId, filename=onefile, local_dir = targetRoot)
+
from huggingface_hub import hf_hub_download, snapshot_download
- repoId = "DeepBeepMeep/Wan2.1"
- sourceFolderList = ["xlm-roberta-large", "pose", "depth", "mask", "wav2vec", "" ]
- fileList = [ [], [],[], ["sam_vit_h_4b8939_fp16.safetensors"], ["config.json", "feature_extractor_config.json", "model.safetensors", "preprocessor_config.json", "special_tokens_map.json", "tokenizer_config.json", "vocab.json"],
- ["Wan2.1_VAE.safetensors", "models_clip_open-clip-xlm-roberta-large-vit-huge-14-bf16.safetensors", "flownet.pkl", "fantasy_proj_model.safetensors" ] + computeList(text_encoder_filename) + computeList(transformer_filename) ]
- targetRoot = "ckpts/"
- for sourceFolder, files in zip(sourceFolderList,fileList ):
- if len(files)==0:
- if not Path(targetRoot + sourceFolder).exists():
- snapshot_download(repo_id=repoId, allow_patterns=sourceFolder +"/*", local_dir= targetRoot)
- else:
- for onefile in files:
- if len(sourceFolder) > 0:
- if not os.path.isfile(targetRoot + sourceFolder + "/" + onefile ):
- hf_hub_download(repo_id=repoId, filename=onefile, local_dir = targetRoot, subfolder=sourceFolder)
- else:
- if not os.path.isfile(targetRoot + onefile ):
- hf_hub_download(repo_id=repoId, filename=onefile, local_dir = targetRoot)
+
+ # Files needed whatever the model family.
+ shared_def = {
+ "repoId" : "DeepBeepMeep/Wan2.1",
+ "sourceFolderList" : [ "pose", "depth", "mask", "wav2vec", "" ],
+ "fileList" : [ [],[], ["sam_vit_h_4b8939_fp16.safetensors"], ["config.json", "feature_extractor_config.json", "model.safetensors", "preprocessor_config.json", "special_tokens_map.json", "tokenizer_config.json", "vocab.json"],
+ [ "flownet.pkl" ] ]
+ }
+ process_files_def(**shared_def)
+
+
+ if server_config.get("enhancer_enabled", 0) == 1:
+ enhancer_def = {
+ "repoId" : "DeepBeepMeep/LTX_Video",
+ "sourceFolderList" : [ "Florence2", "Llama3_2" ],
+ "fileList" : [ ["config.json", "configuration_florence2.py", "model.safetensors", "modeling_florence2.py", "preprocessor_config.json", "processing_florence2.py", "tokenizer.json", "tokenizer_config.json"],["config.json", "generation_config.json", "Llama3_2_quanto_bf16_int8.safetensors", "special_tokens_map.json", "tokenizer.json", "tokenizer_config.json"] ]
+ }
+ process_files_def(**enhancer_def)
+
+
+ model_family = get_model_family(transformer_filename)
+ if model_family == "wan":
+ text_encoder_filename = get_wan_text_encoder_filename(text_encoder_quantization)
+ model_def = {
+ "repoId" : "DeepBeepMeep/Wan2.1",
+ "sourceFolderList" : ["xlm-roberta-large", "umt5-xxl", "" ],
+ "fileList" : [ [ "models_clip_open-clip-xlm-roberta-large-vit-huge-14-bf16.safetensors", "sentencepiece.bpe.model", "special_tokens_map.json", "tokenizer.json", "tokenizer_config.json"], ["special_tokens_map.json", "spiece.model", "tokenizer.json", "tokenizer_config.json"] + computeList(text_encoder_filename) , ["Wan2.1_VAE.safetensors", "fantasy_proj_model.safetensors" ] + computeList(transformer_filename) ]
+ }
+ elif model_family == "ltxv":
+ text_encoder_filename = get_ltxv_text_encoder_filename(text_encoder_quantization)
+ model_def = {
+ "repoId" : "DeepBeepMeep/LTX_Video",
+ "sourceFolderList" : ["T5_xxl_1.1", "" ],
+ "fileList" : [ ["added_tokens.json", "special_tokens_map.json", "spiece.model", "tokenizer_config.json"] + computeList(text_encoder_filename), ["ltxv_0.9.7_VAE.safetensors", "ltxv_0.9.7_spatial_upscaler.safetensors", "ltxv_scheduler.json"] + computeList(transformer_filename) ]
+ }
+ elif model_family == "hunyuan":
+ text_encoder_filename = get_hunyuan_text_encoder_filename(text_encoder_quantization)
+ # The comma after "clip_vit_large_patch14" is required: without it the literal is
+ # concatenated with "" and zip() silently drops the root-folder file list
+ # (VAE, quantization map and transformer files would never be downloaded).
+ model_def = {
+ "repoId" : "DeepBeepMeep/HunyuanVideo",
+ "sourceFolderList" : [ "llava-llama-3-8b", "clip_vit_large_patch14", "" ],
+ "fileList" :[ ["config.json", "special_tokens_map.json", "tokenizer.json", "tokenizer_config.json", "preprocessor_config.json"] + computeList(text_encoder_filename) , ["config.json", "model.safetensors", "preprocessor_config.json", "special_tokens_map.json", "tokenizer.json", "tokenizer_config.json", "vocab.json"], [ "hunyuan_video_720_quanto_int8_map.json", "hunyuan_video_custom_VAE_fp32.safetensors", "hunyuan_video_custom_VAE_config.json", "hunyuan_video_VAE_fp32.safetensors", "hunyuan_video_VAE_config.json" , "hunyuan_video_720_quanto_int8_map.json" ] + computeList(transformer_filename) ]
+ }
+
+ else:
+ # Any other family must provide its own files definition through its manager.
+ model_manager = get_model_manager(model_family)
+ model_def = model_manager.get_files_def(transformer_filename, text_encoder_quantization)
+
+ process_files_def(**model_def)
offload.default_verboseLevel = verbose_level
-to_remove = ["models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", "Wan2.1_VAE.pth"]
-for file_name in to_remove:
- file_name = os.path.join("ckpts",file_name)
- if os.path.isfile(file_name):
- try:
- os.remove(file_name)
- except:
- pass
-download_models(transformer_filename, text_encoder_filename)
+
+# download_models(transformer_filename)
def sanitize_file_name(file_name, rep =""):
return file_name.replace("/",rep).replace("\\",rep).replace(":",rep).replace("|",rep).replace("?",rep).replace("<",rep).replace(">",rep).replace("\"",rep).replace("\n",rep).replace("\r",rep)
@@ -1841,7 +2016,7 @@ def setup_loras(model_filename, transformer, lora_dir, lora_preselected_preset,
loras_presets = [ Path(Path(file_path).parts[-1]).stem for file_path in dir_presets]
if transformer !=None:
- loras = offload.load_loras_into_model(transformer, loras, activate_all_loras=False, check_only= True, preprocess_sd=preprocess_loras, split_linear_modules_map = split_linear_modules_map) #lora_multiplier,
+ loras = offload.load_loras_into_model(transformer, loras, activate_all_loras=False, check_only= True, preprocess_sd=get_loras_preprocessor(transformer, model_filename), split_linear_modules_map = split_linear_modules_map) #lora_multiplier,
if len(loras) > 0:
loras_names = [ Path(lora).stem for lora in loras ]
@@ -1856,77 +2031,139 @@ def setup_loras(model_filename, transformer, lora_dir, lora_preselected_preset,
return loras, loras_names, loras_presets, default_loras_choices, default_loras_multis_str, default_lora_preset_prompt, default_lora_preset
-def load_t2v_model(model_filename, quantizeTransformer = False, dtype = torch.bfloat16, VAE_dtype = torch.float32, mixed_precision_transformer = False):
-
- cfg = WAN_CONFIGS['t2v-14B']
+def load_wan_model(model_filename, quantizeTransformer = False, dtype = torch.bfloat16, VAE_dtype = torch.float32, mixed_precision_transformer = False):
+ """Instantiate a Wan model and return it with the dict of its sub-models.
+
+ `model_filename` is indexed as a sequence: its first entry is passed to
+ test_class_i2v() to pick the image-to-video class, its last entry is the file
+ reported in the log and used to detect the sky_df (diffusion forcing) types.
+ Returns (wan_model, pipe) where pipe maps "transformer", "text_encoder", "vae"
+ and, when the model has a `clip` attribute, "text_encoder_2".
+ """
filename = model_filename[-1]
- # cfg = WAN_CONFIGS['t2v-1.3B']
print(f"Loading '{filename}' model...")
- if get_model_type(filename) in ("sky_df_1.3B", "sky_df_14B", "sky_df_720p_14B"):
- model_factory = wan.DTT2V
+
+ if test_class_i2v(model_filename[0]):
+ cfg = WAN_CONFIGS['i2v-14B']
+ model_factory = wan.WanI2V
else:
- model_factory = wan.WanT2V
+ cfg = WAN_CONFIGS['t2v-14B']
+ # cfg = WAN_CONFIGS['t2v-1.3B']
+ if get_model_type(filename) in ("sky_df_1.3B", "sky_df_14B", "sky_df_720p_14B"):
+ model_factory = wan.DTT2V
+ else:
+ model_factory = wan.WanT2V
wan_model = model_factory(
config=cfg,
checkpoint_dir="ckpts",
model_filename=model_filename,
- text_encoder_filename= text_encoder_filename,
+ text_encoder_filename= get_wan_text_encoder_filename(text_encoder_quantization),
quantizeTransformer = quantizeTransformer,
dtype = dtype,
VAE_dtype = VAE_dtype,
mixed_precision_transformer = mixed_precision_transformer
)
- pipe = {"transformer": wan_model.model, "text_encoder" : wan_model.text_encoder.model, "vae": wan_model.vae.model }
-
+ pipe = {"transformer": wan_model.model, "text_encoder" : wan_model.text_encoder.model, "vae": wan_model.vae.model }
+ # Only models exposing a `clip` attribute contribute a second text encoder.
+ if hasattr(wan_model, "clip"):
+ pipe["text_encoder_2"] = wan_model.clip.model
return wan_model, pipe
-def load_i2v_model(model_filename, quantizeTransformer = False, dtype = torch.bfloat16, VAE_dtype = torch.float32, mixed_precision_transformer = False):
+def load_ltxv_model(model_filename, quantizeTransformer = False, dtype = torch.bfloat16, VAE_dtype = torch.float32, mixed_precision_transformer = False):
+ """Instantiate an LTXV model and return it with the dict of its sub-models.
+
+ `quantizeTransformer` is accepted for signature parity with the other loaders
+ but is not forwarded to LTXV (the corresponding argument is commented out).
+ Returns (ltxv_model, pipe) where pipe maps "transformer", "vae",
+ "text_encoder" and "latent_upsampler".
+ """
+ filename = model_filename[-1]
+ print(f"Loading '{filename}' model...")
+ from ltx_video.ltxv import LTXV
+
+ ltxv_model = LTXV(
+ model_filepath = model_filename,
+ text_encoder_filepath = get_ltxv_text_encoder_filename(text_encoder_quantization),
+ dtype = dtype,
+ # quantizeTransformer = quantizeTransformer,
+ VAE_dtype = VAE_dtype,
+ mixed_precision_transformer = mixed_precision_transformer
+ )
+
+ pipeline = ltxv_model.pipeline
+ pipe = {"transformer" : pipeline.video_pipeline.transformer, "vae" : pipeline.vae, "text_encoder" : pipeline.video_pipeline.text_encoder, "latent_upsampler" : pipeline.latent_upsampler}
+
+ return ltxv_model, pipe
+def load_hunyuan_model(model_filename, quantizeTransformer = False, dtype = torch.bfloat16, VAE_dtype = torch.float32, mixed_precision_transformer = False):
+ """Instantiate a Hunyuan Video sampler and return it with the dict of its sub-models.
+
+ `quantizeTransformer` is accepted for signature parity with the other loaders
+ but is not forwarded (the corresponding argument is commented out).
+ The linear split map is stored on the transformer as `split_linear_modules_map`
+ and applied with offload.split_linear_modules before returning.
+ Returns (hunyuan_model, pipe) where pipe maps "transformer", "text_encoder",
+ "text_encoder_2" and "vae".
+ """
filename = model_filename[-1]
print(f"Loading '{filename}' model...")
+ from hyvideo.hunyuan import HunyuanVideoSampler
- cfg = WAN_CONFIGS['i2v-14B']
- wan_model = wan.WanI2V(
- config=cfg,
- checkpoint_dir="ckpts",
- model_filename=model_filename,
- text_encoder_filename=text_encoder_filename,
- quantizeTransformer = quantizeTransformer,
+ hunyuan_model = HunyuanVideoSampler.from_pretrained(
+ model_filepath = model_filename,
+ text_encoder_filepath = get_hunyuan_text_encoder_filename(text_encoder_quantization),
dtype = dtype,
- VAE_dtype = VAE_dtype,
+ # quantizeTransformer = quantizeTransformer,
+ VAE_dtype = VAE_dtype,
mixed_precision_transformer = mixed_precision_transformer
- )
- pipe = {"transformer": wan_model.model, "text_encoder" : wan_model.text_encoder.model, "text_encoder_2": wan_model.clip.model, "vae": wan_model.vae.model } #
+ )
- return wan_model, pipe
+ pipe = { "transformer" : hunyuan_model.model, "text_encoder" : hunyuan_model.text_encoder, "text_encoder_2" : hunyuan_model.text_encoder_2, "vae" : hunyuan_model.vae }
+
+ from hyvideo.modules.models import get_linear_split_map
+
+ # The map is kept on the transformer so that later code can read it back with getattr().
+ split_linear_modules_map = get_linear_split_map()
+ hunyuan_model.model.split_linear_modules_map = split_linear_modules_map
+ offload.split_linear_modules(hunyuan_model.model, split_linear_modules_map )
+ return hunyuan_model, pipe
+
+def get_transformer_model(model):
+ """Return the transformer held by `model`.
+
+ The `model` attribute is preferred, then `transformer`; Exception is raised
+ when the object has neither attribute.
+ """
+ if hasattr(model, "model"):
+ return model.model
+ elif hasattr(model, "transformer"):
+ return model.transformer
+ else:
+ raise Exception("no transformer found")
+
def load_models(model_filename):
global transformer_filename
-
+ model_family = get_model_family(model_filename)
perc_reserved_mem_max = args.perc_reserved_mem_max
- model_filelist = get_dependent_models(model_filename, quantization= transformer_quantization) + [model_filename]
+ model_filelist = get_dependent_models(model_filename, quantization= transformer_quantization, dtype_policy = transformer_dtype_policy) + [model_filename]
for filename in model_filelist:
- download_models(filename, text_encoder_filename)
+ download_models(filename)
+ transformer_dtype = get_transformer_dtype(model_family, transformer_dtype_policy)
VAE_dtype = torch.float16 if server_config.get("vae_precision","16") == "16" else torch.float
mixed_precision_transformer = server_config.get("mixed_precision","0") == "1"
transformer_filename = None
new_transformer_filename = model_filelist[-1]
- if test_class_i2v(new_transformer_filename):
- wan_model, pipe = load_i2v_model(model_filelist, quantizeTransformer = quantizeTransformer, dtype = transformer_dtype, VAE_dtype = VAE_dtype, mixed_precision_transformer = mixed_precision_transformer)
+ if model_family == "wan" :
+ wan_model, pipe = load_wan_model(model_filelist, quantizeTransformer = quantizeTransformer, dtype = transformer_dtype, VAE_dtype = VAE_dtype, mixed_precision_transformer = mixed_precision_transformer)
+ elif model_family == "ltxv":
+ wan_model, pipe = load_ltxv_model(model_filelist, quantizeTransformer = quantizeTransformer, dtype = transformer_dtype, VAE_dtype = VAE_dtype, mixed_precision_transformer = mixed_precision_transformer)
+ elif model_family == "hunyuan":
+ wan_model, pipe = load_hunyuan_model(model_filelist, quantizeTransformer = quantizeTransformer, dtype = transformer_dtype, VAE_dtype = VAE_dtype, mixed_precision_transformer = mixed_precision_transformer)
else:
- wan_model, pipe = load_t2v_model(model_filelist, quantizeTransformer = quantizeTransformer, dtype = transformer_dtype, VAE_dtype = VAE_dtype, mixed_precision_transformer = mixed_precision_transformer)
+ raise Exception(f"Model '{new_transformer_filename}' not supported.")
wan_model._model_file_name = new_transformer_filename
kwargs = { "extraModelsToQuantize": None}
if profile == 2 or profile == 4:
- kwargs["budgets"] = { "transformer" : 100 if preload == 0 else preload, "text_encoder" : 100 if preload == 0 else preload, "*" : 1000 }
+ kwargs["budgets"] = { "transformer" : 100 if preload == 0 else preload, "text_encoder" : 100 if preload == 0 else preload, "*" : max(3000, preload) }
# if profile == 4:
# kwargs["partialPinning"] = True
elif profile == 3:
kwargs["budgets"] = { "*" : "70%" }
+
+ global prompt_enhancer_image_caption_model, prompt_enhancer_image_caption_processor, prompt_enhancer_llm_model, prompt_enhancer_llm_tokenizer
+ if server_config.get("enhancer_enabled", 0) == 1:
+ from transformers import ( AutoModelForCausalLM, AutoProcessor, AutoTokenizer, LlamaForCausalLM )
+
+
+ prompt_enhancer_image_caption_model = AutoModelForCausalLM.from_pretrained( "ckpts/Florence2", trust_remote_code=True)
+ prompt_enhancer_image_caption_processor = AutoProcessor.from_pretrained( "ckpts/Florence2", trust_remote_code=True)
+ prompt_enhancer_llm_model = offload.fast_load_transformers_model("ckpts/Llama3_2/Llama3_2_quanto_bf16_int8.safetensors")
+ prompt_enhancer_llm_tokenizer = AutoTokenizer.from_pretrained("ckpts/Llama3_2")
+ pipe["prompt_enhancer_image_caption_model"] = prompt_enhancer_image_caption_model
+ pipe["prompt_enhancer_llm_model"] = prompt_enhancer_llm_model
+ prompt_enhancer_image_caption_model._model_dtype = torch.float
+ kwargs["budgets"]["prompt_enhancer_llm_model"] = 5000
+ else:
+ prompt_enhancer_image_caption_model = None
+ prompt_enhancer_image_caption_processor = None
+ prompt_enhancer_llm_model = None
+ prompt_enhancer_llm_tokenizer = None
+
+
offloadobj = offload.profile(pipe, profile_no= profile, compile = compile, quantizeTransformer = quantizeTransformer, loras = "transformer", coTenantsMap= {}, perc_reserved_mem_max = perc_reserved_mem_max , convertWeightsFloatTo = transformer_dtype, **kwargs)
if len(args.gpu) > 0:
torch.set_default_device(args.gpu)
@@ -1974,15 +2211,21 @@ def generate_header(model_filename, compile, attention_mode):
if compile:
header += ", Pytorch compilation ON"
+ if "fp16" in model_filename:
+ header += ", Data Type FP16"
+ else:
+ header += ", Data Type BF16"
+
if "int8" in model_filename:
- header += ", Quantization Int8"
+ header += ", Quantization Scaled Int8"
header += ""
return header
def apply_changes( state,
transformer_types_choices,
- text_encoder_choice,
+ transformer_dtype_policy_choice,
+ text_encoder_quantization_choice,
VAE_precision_choice,
mixed_precision_choice,
save_path_choice,
@@ -1996,6 +2239,7 @@ def apply_changes( state,
clear_file_list = 0,
preload_model_policy_choice = 1,
UI_theme_choice = "default",
+ enhancer_enabled_choice = 0,
fit_canvas_choice = 0
):
if args.lock_config:
@@ -2005,7 +2249,7 @@ def apply_changes( state,
global offloadobj, wan_model, server_config, loras, loras_names, default_loras_choices, default_loras_multis_str, default_lora_preset_prompt, default_lora_preset, loras_presets
server_config = {"attention_mode" : attention_choice,
"transformer_types": transformer_types_choices,
- "text_encoder_filename" : text_encoder_choices[text_encoder_choice],
+ "text_encoder_quantization" : text_encoder_quantization_choice,
"save_path" : save_path_choice,
"compile" : compile_choice,
"profile" : profile_choice,
@@ -2014,11 +2258,13 @@ def apply_changes( state,
"mixed_precision" : mixed_precision_choice,
"metadata_type": metadata_choice,
"transformer_quantization" : quantization_choice,
+ "transformer_dtype_policy" : transformer_dtype_policy_choice,
"boost" : boost_choice,
"clear_file_list" : clear_file_list,
"preload_model_policy" : preload_model_policy_choice,
"UI_theme" : UI_theme_choice,
"fit_canvas": fit_canvas_choice,
+ "enhancer_enabled" : enhancer_enabled_choice,
}
if Path(server_config_filename).is_file():
@@ -2041,25 +2287,28 @@ def apply_changes( state,
if v != v_old:
changes.append(k)
- global attention_mode, profile, compile, text_encoder_filename, vae_config, boost, lora_dir, reload_needed, preload_model_policy, transformer_quantization, transformer_types
+ global attention_mode, profile, compile, vae_config, boost, lora_dir, reload_needed, preload_model_policy, transformer_quantization, transformer_dtype_policy, transformer_types, text_encoder_quantization
attention_mode = server_config["attention_mode"]
profile = server_config["profile"]
compile = server_config["compile"]
- text_encoder_filename = server_config["text_encoder_filename"]
+ text_encoder_quantization = server_config["text_encoder_quantization"]
vae_config = server_config["vae_config"]
boost = server_config["boost"]
preload_model_policy = server_config["preload_model_policy"]
transformer_quantization = server_config["transformer_quantization"]
+ transformer_dtype_policy = server_config["transformer_dtype_policy"]
+ text_encoder_quantization = server_config["text_encoder_quantization"]
transformer_types = server_config["transformer_types"]
+ state["model_filename"] = get_model_filename(get_model_type(state["model_filename"]), transformer_quantization, transformer_dtype_policy)
- if all(change in ["attention_mode", "vae_config", "boost", "save_path", "metadata_type", "clear_file_list", "fit_canvas"] for change in changes ):
+ if all(change in ["attention_mode", "vae_config", "boost", "save_path", "metadata_type", "clear_file_list", "fit_canvas"] for change in changes ):
model_choice = gr.Dropdown()
else:
reload_needed = True
model_choice = generate_dropdown_model_list()
header = generate_header(transformer_filename, compile=compile, attention_mode= attention_mode)
- return "The new configuration has been succesfully applied
", header, model_choice
+ return "The new configuration has been succesfully applied
", header, model_choice, gr.update(visible= server_config["enhancer_enabled"] == 1)
@@ -2070,7 +2319,7 @@ def save_video(final_frames, output_path, fps=24):
assert final_frames.ndim == 4 and final_frames.shape[3] == 3, f"invalid shape: {final_frames} (need t h w c)"
if final_frames.dtype != np.uint8:
final_frames = (final_frames * 255).astype(np.uint8)
- ImageSequenceClip(list(final_frames), fps=fps).write_videofile(output_path, verbose= False, logger = None)
+ ImageSequenceClip(list(final_frames), fps=fps).write_videofile(output_path, verbose= False)
def get_gen_info(state):
@@ -2083,7 +2332,7 @@ def get_gen_info(state):
def build_callback(state, pipe, send_cmd, status, num_inference_steps):
gen = get_gen_info(state)
gen["num_inference_steps"] = num_inference_steps
- def callback(step_idx, latent, force_refresh, read_state = False, override_num_inference_steps = -1):
+ def callback(step_idx, latent, force_refresh, read_state = False, override_num_inference_steps = -1, pass_no = -1):
refresh_id = gen.get("refresh", -1)
if force_refresh or step_idx >= 0:
pass
@@ -2110,7 +2359,17 @@ def build_callback(state, pipe, send_cmd, status, num_inference_steps):
elif step_idx == num_inference_steps:
phase = " - VAE Decoding"
else:
- phase = " - Denoising"
+ if pass_no <=0:
+ phase = " - Denoising"
+ elif pass_no == 1:
+ phase = " - Denoising First Pass"
+ elif pass_no == 2:
+ phase = " - Denoising Second Pass"
+ elif pass_no == 3:
+ phase = " - Denoising Third Pass"
+ else:
+ phase = f" - Denoising {pass_no}th Pass"
+
gen["progress_phase"] = (phase, step_idx)
status_msg = status + phase
if step_idx >= 0:
@@ -2121,7 +2380,8 @@ def build_callback(state, pipe, send_cmd, status, num_inference_steps):
# progress(*progress_args)
send_cmd("progress", progress_args)
if latent != None:
- send_cmd("preview", latent.to("cpu", non_blocking=True))
+ latent = latent.to("cpu", non_blocking=True)
+ send_cmd("preview", latent)
# gen["progress_args"] = progress_args
@@ -2162,7 +2422,11 @@ def refresh_gallery(state): #, msg
prompt = task["prompt"]
params = task["params"]
model_filename = params["model_filename"]
- onemorewindow_visible = "Vace" in model_filename or "diffusion_forcing" in model_filename
+ onemorewindow_visible = "Vace" in model_filename or "diffusion_forcing" in model_filename or "ltxv" in model_filename
+ enhanced = False
+ if prompt.startswith("!enhanced!\n"):
+ enhanced = True
+ prompt = prompt[len("!enhanced!\n"):]
if "\n" in prompt :
prompts = prompt.split("\n")
window_no= gen.get("window_no",1)
@@ -2171,6 +2435,8 @@ def refresh_gallery(state): #, msg
window_no -= 1
prompts[window_no]="" + prompts[window_no] + ""
prompt = "
".join(prompts)
+ if enhanced:
+ prompt = "Enhanced:
" + prompt
start_img_uri = task.get('start_image_data_base64')
start_img_uri = start_img_uri[0] if start_img_uri !=None else None
@@ -2252,7 +2518,7 @@ def get_resampled_video(video_in, start_frame, max_frames, target_fps):
frames_list = reader.get_batch(frame_nos)
return frames_list
-def preprocess_video(process_type, height, width, video_in, max_frames, start_frame=0, fit_canvas = False, target_fps = 16):
+def preprocess_video(process_type, height, width, video_in, max_frames, start_frame=0, fit_canvas = False, target_fps = 16, block_size = 16):
frames_list = get_resampled_video(video_in, start_frame, max_frames, target_fps)
@@ -2267,8 +2533,8 @@ def preprocess_video(process_type, height, width, video_in, max_frames, start_fr
else:
scale = ((height * width ) / (frame_height * frame_width))**(1/2)
- new_height = (int(frame_height * scale) // 16) * 16
- new_width = (int(frame_width * scale) // 16) * 16
+ new_height = (int(frame_height * scale) // block_size) * block_size
+ new_width = (int(frame_width * scale) // block_size) * block_size
# if fit_canvas :
# new_height = height
# new_width = width
@@ -2361,7 +2627,7 @@ def parse_keep_frames_video_guide(keep_frames, video_length):
return frames, error
def generate_video(
- task_id,
+ task,
send_cmd,
prompt,
negative_prompt,
@@ -2405,6 +2671,7 @@ def generate_video(
slg_end_perc,
cfg_star_switch,
cfg_zero_step,
+ prompt_enhancer,
state,
model_filename
@@ -2453,32 +2720,17 @@ def generate_video(
slg_layers = None
offload.shared_state["_attention"] = attn
-
- # VAE Tiling
device_mem_capacity = torch.cuda.get_device_properties(0).total_memory / 1048576
- if vae_config == 0:
- if server_config.get("vae_precision", "16") == "32":
- device_mem_capacity = device_mem_capacity / 2
- if device_mem_capacity >= 24000:
- use_vae_config = 1
- elif device_mem_capacity >= 8000:
- use_vae_config = 2
- else:
- use_vae_config = 3
- else:
- use_vae_config = vae_config
-
- if use_vae_config == 1:
- VAE_tile_size = 0
- elif use_vae_config == 2:
- VAE_tile_size = 256
- else:
- VAE_tile_size = 128
+ VAE_tile_size = wan_model.vae.get_VAE_tile_size(vae_config, device_mem_capacity, server_config.get("vae_precision", "16") == "32")
- trans = wan_model.model
+ trans = get_transformer_model(wan_model)
temp_filename = None
+ prompts = prompt.split("\n")
+ prompts = [part for part in prompts if len(prompt)>0]
+
+
loras = state["loras"]
if len(loras) > 0:
def is_float(element: any) -> bool:
@@ -2514,7 +2766,8 @@ def generate_video(
list_mult_choices_nums += [1.0] * ( len(activated_loras) - len(list_mult_choices_nums ) )
loras_selected = [ lora for lora in loras if os.path.basename(lora) in activated_loras]
pinnedLora = profile !=5 #False # # #
- offload.load_loras_into_model(trans, loras_selected, list_mult_choices_nums, activate_all_loras=True, preprocess_sd=preprocess_loras, pinnedLora=pinnedLora, split_linear_modules_map = None)
+ split_linear_modules_map = getattr(trans,"split_linear_modules_map", None)
+ offload.load_loras_into_model(trans, loras_selected, list_mult_choices_nums, activate_all_loras=True, preprocess_sd=get_loras_preprocessor(trans, model_filename), pinnedLora=pinnedLora, split_linear_modules_map = split_linear_modules_map)
errors = trans._loras_errors
if len(errors) > 0:
error_files = [msg for _ , msg in errors]
@@ -2527,14 +2780,30 @@ def generate_video(
device_mem_capacity = torch.cuda.get_device_properties(None).total_memory / 1048576
diffusion_forcing = "diffusion_forcing" in model_filename
+ ltxv = "ltxv" in model_filename
vace = "Vace" in model_filename
- if diffusion_forcing:
+ phantom = "phantom" in model_filename
+ hunyuan_t2v = "hunyuan_video_720" in model_filename
+ hunyuan_i2v = "hunyuan_video_i2v" in model_filename
+ hunyuan_custom = "hunyuan_video_custom" in model_filename
+ if diffusion_forcing or hunyuan_t2v or hunyuan_i2v:
fps = 24
elif audio_guide != None:
fps = 23
+ elif ltxv:
+ fps = 30
else:
fps = 16
+ original_image_refs = image_refs
+ if image_refs != None and len(image_refs) > 0 and (hunyuan_custom or phantom or vace):
+ send_cmd("progress", [0, get_latest_status(state) + " - Removing Images References Background"])
+ os.environ["U2NET_HOME"] = os.path.join(os.getcwd(), "ckpts", "rembg")
+ from wan.utils.utils import resize_and_remove_background
+ image_refs = resize_and_remove_background(image_refs, width, height, remove_background_image_ref ==1, fit_into_canvas= not vace)
+ update_task_thumbnails(task, locals())
+ send_cmd("output")
+
joint_pass = boost ==1 #and profile != 1 and profile != 3
# TeaCache
trans.enable_teacache = tea_cache_setting > 0
@@ -2542,19 +2811,19 @@ def generate_video(
trans.teacache_multiplier = tea_cache_setting
trans.rel_l1_thresh = 0
trans.teacache_start_step = int(tea_cache_start_step_perc*num_inference_steps/100)
-
- if image2video:
- if '720p' in model_filename:
- trans.coefficients = [-114.36346466, 65.26524496, -18.82220707, 4.91518089, -0.23412683]
- else:
- trans.coefficients = [-3.02331670e+02, 2.23948934e+02, -5.25463970e+01, 5.87348440e+00, -2.01973289e-01]
- else:
- if '1.3B' in model_filename:
- trans.coefficients = [2.39676752e+03, -1.31110545e+03, 2.01331979e+02, -8.29855975e+00, 1.37887774e-01]
- elif '14B' in model_filename:
- trans.coefficients = [-5784.54975374, 5449.50911966, -1811.16591783, 256.27178429, -13.02252404]
+ if get_model_family(model_filename) == "wan":
+ if image2video:
+ if '720p' in model_filename:
+ trans.coefficients = [-114.36346466, 65.26524496, -18.82220707, 4.91518089, -0.23412683]
+ else:
+ trans.coefficients = [-3.02331670e+02, 2.23948934e+02, -5.25463970e+01, 5.87348440e+00, -2.01973289e-01]
else:
- raise gr.Error("Teacache not supported for this model")
+ if '1.3B' in model_filename:
+ trans.coefficients = [2.39676752e+03, -1.31110545e+03, 2.01331979e+02, -8.29855975e+00, 1.37887774e-01]
+ elif '14B' in model_filename:
+ trans.coefficients = [-5784.54975374, 5449.50911966, -1811.16591783, 256.27178429, -13.02252404]
+ else:
+ raise gr.Error("Teacache not supported for this model")
source_video = None
target_camera = None
if "recam" in model_filename:
@@ -2589,27 +2858,24 @@ def generate_video(
extra_generation = 0
initial_total_windows = 0
max_frames_to_generate = video_length
- phantom = "phantom" in model_filename
- if diffusion_forcing or vace:
+ if diffusion_forcing or vace or ltxv:
reuse_frames = min(sliding_window_size - 4, sliding_window_overlap)
else:
reuse_frames = 0
- if diffusion_forcing and source_video != None:
+ if (diffusion_forcing or ltxv) and source_video != None:
video_length += sliding_window_overlap
- sliding_window = ("Vace" in model_filename or diffusion_forcing) and video_length > sliding_window_size
+ sliding_window = (vace or diffusion_forcing or ltxv) and video_length > sliding_window_size
if sliding_window:
discard_last_frames = sliding_window_discard_last_frames
left_after_first_window = video_length - sliding_window_size + discard_last_frames
initial_total_windows= 1 + math.ceil(left_after_first_window / (sliding_window_size - discard_last_frames - reuse_frames))
video_length = sliding_window_size
- prompts = prompt.split("\n")
- prompts = [part for part in prompts if len(prompt)>0]
else:
initial_total_windows = 1
first_window_video_length = video_length
-
+ original_prompts = prompts.copy()
gen["sliding_window"] = sliding_window
while not abort:
extra_generation += gen.get("extra_orders",0)
@@ -2631,6 +2897,37 @@ def generate_video(
video_length = first_window_video_length
gen["extra_windows"] = 0
start_time = time.time()
+ if prompt_enhancer_image_caption_model != None and prompt_enhancer !=None and len(prompt_enhancer)>0:
+ text_encoder_max_tokens = 256
+ send_cmd("progress", [0, get_latest_status(state) + " - Enhancing Prompt"])
+ from ltx_video.utils.prompt_enhance_utils import generate_cinematic_prompt
+ prompt_images = []
+ if "I" in prompt_enhancer:
+ if image_start != None:
+ prompt_images.append(image_start)
+ if original_image_refs != None:
+ prompt_images += original_image_refs[:1]
+ if len(original_prompts) == 0 and not "T" in prompt_enhancer:
+ pass
+ else:
+ from wan.utils.utils import seed_everything
+ seed_everything(seed)
+ # for i, original_prompt in enumerate(original_prompts):
+ prompts = generate_cinematic_prompt(
+ prompt_enhancer_image_caption_model,
+ prompt_enhancer_image_caption_processor,
+ prompt_enhancer_llm_model,
+ prompt_enhancer_llm_tokenizer,
+ original_prompts if "T" in prompt_enhancer else ["an image"],
+ prompt_images if len(prompt_images) > 0 else None,
+ max_new_tokens=text_encoder_max_tokens,
+ )
+ print(f"Enhanced prompts: {prompts}" )
+ task["prompt"] = "\n".join(["!enhanced!"] + prompts)
+ send_cmd("output")
+ prompt = prompts[0]
+ abort = gen.get("abort", False)
+
while not abort:
if sliding_window:
prompt = prompts[window_no] if window_no < len(prompts) else prompts[-1]
@@ -2645,12 +2942,14 @@ def generate_video(
window_no += 1
gen["window_no"] = window_no
- if phantom:
+ if hunyuan_custom:
+ src_ref_images = image_refs
+ elif phantom:
src_ref_images = image_refs.copy() if image_refs != None else None
- elif diffusion_forcing:
+ elif diffusion_forcing or ltxv:
if video_source != None and len(video_source) > 0 and window_no == 1:
keep_frames_video_source= 1000 if len(keep_frames_video_source) ==0 else int(keep_frames_video_source)
- prefix_video = preprocess_video(None, width=width, height=height,video_in=video_source, max_frames= keep_frames_video_source , start_frame = 0, fit_canvas= fit_canvas, target_fps = fps)
+ prefix_video = preprocess_video(None, width=width, height=height,video_in=video_source, max_frames= keep_frames_video_source , start_frame = 0, fit_canvas= fit_canvas, target_fps = fps, block_size = 32 if ltxv else 16)
prefix_video = prefix_video .permute(3, 0, 1, 2)
prefix_video = prefix_video .float().div_(127.5).sub_(1.) # c, f, h, w
prefix_video_frames_count = prefix_video.shape[1]
@@ -2717,93 +3016,48 @@ def generate_video(
trans.num_steps = num_inference_steps
trans.teacache_skipped_steps = 0
trans.previous_residual = None
-
- if image2video:
- samples = wan_model.generate(
- prompt,
- image_start,
- image_end if image_end != None else None,
- frame_num=(video_length // 4)* 4 + 1,
- # max_area=MAX_AREA_CONFIGS[resolution_reformated],
- height = height,
- width = width,
- fit_into_canvas = fit_canvas,
- shift=flow_shift,
- sampling_steps=num_inference_steps,
- guide_scale=guidance_scale,
- n_prompt=negative_prompt,
- seed=seed,
- callback=callback,
- enable_RIFLEx = enable_RIFLEx,
- VAE_tile_size = VAE_tile_size,
- joint_pass = joint_pass,
- slg_layers = slg_layers,
- slg_start = slg_start_perc/100,
- slg_end = slg_end_perc/100,
- cfg_star_switch = cfg_star_switch,
- cfg_zero_step = cfg_zero_step,
- add_frames_for_end_image = "image2video" in model_filename,
- audio_cfg_scale= audio_guidance_scale,
- audio_proj= audio_proj_split,
- audio_scale= audio_scale,
- audio_context_lens= audio_context_lens
- )
- elif diffusion_forcing:
- samples = wan_model.generate(
- prompt = prompt,
- negative_prompt = negative_prompt,
- image = image_start,
- input_video= pre_video_guide,
- height = height,
- width = width,
- fit_into_canvas = fit_canvas,
- seed = seed,
- num_frames = (video_length // 4)* 4 + 1, #377
- num_inference_steps = num_inference_steps,
- shift = flow_shift,
- guidance_scale= guidance_scale,
- callback= callback,
- VAE_tile_size = VAE_tile_size,
- joint_pass = joint_pass,
- slg_layers = slg_layers,
- slg_start = slg_start_perc/100,
- slg_end = slg_end_perc/100,
- addnoise_condition = sliding_window_overlap_noise,
- ar_step = model_mode, #5
- causal_block_size = 5,
- causal_attention = True,
- fps = fps,
- )
- else:
- samples = wan_model.generate(
- prompt,
- input_frames = src_video,
- input_ref_images= src_ref_images,
- input_masks = src_mask,
- source_video= source_video,
- target_camera= target_camera,
- frame_num=(video_length // 4)* 4 + 1,
- size=(width, height),
- fit_into_canvas = fit_canvas,
- shift=flow_shift,
- sampling_steps=num_inference_steps,
- guide_scale=guidance_scale,
- n_prompt=negative_prompt,
- seed=seed,
- offload_model=False,
- callback=callback,
- enable_RIFLEx = enable_RIFLEx,
- VAE_tile_size = VAE_tile_size,
- joint_pass = joint_pass,
- slg_layers = slg_layers,
- slg_start = slg_start_perc/100,
- slg_end = slg_end_perc/100,
- cfg_star_switch = cfg_star_switch,
- cfg_zero_step = cfg_zero_step,
- overlapped_latents = 0 if reuse_frames == 0 or window_no == 1 else ((reuse_frames - 1) // 4 + 1),
- overlap_noise = sliding_window_overlap_noise,
- vace = vace
- )
+ trans.previous_modulated_input = None
+
+ samples = wan_model.generate(
+ input_prompt = prompt,
+ image_start = image_start,
+ image_end = image_end if image_end != None else None,
+ input_frames = src_video,
+ input_ref_images= src_ref_images,
+ input_masks = src_mask,
+ input_video= pre_video_guide if diffusion_forcing or ltxv else source_video,
+ target_camera= target_camera,
+ frame_num=(video_length // 4)* 4 + 1,
+ height = height,
+ width = width,
+ fit_into_canvas = fit_canvas,
+ shift=flow_shift,
+ sampling_steps=num_inference_steps,
+ guide_scale=guidance_scale,
+ embedded_guidance_scale=embedded_guidance_scale,
+ n_prompt=negative_prompt,
+ seed=seed,
+ callback=callback,
+ enable_RIFLEx = enable_RIFLEx,
+ VAE_tile_size = VAE_tile_size,
+ joint_pass = joint_pass,
+ slg_layers = slg_layers,
+ slg_start = slg_start_perc/100,
+ slg_end = slg_end_perc/100,
+ cfg_star_switch = cfg_star_switch,
+ cfg_zero_step = cfg_zero_step,
+ audio_cfg_scale= audio_guidance_scale,
+ audio_proj= audio_proj_split,
+ audio_scale= audio_scale,
+ audio_context_lens= audio_context_lens,
+ ar_step = model_mode, #5
+ causal_block_size = 5,
+ causal_attention = True,
+ fps = fps,
+ overlapped_latents = 0 if reuse_frames == 0 or window_no == 1 else ((reuse_frames - 1) // 4 + 1),
+ overlap_noise = sliding_window_overlap_noise,
+ model_filename = model_filename,
+ )
except Exception as e:
if temp_filename!= None and os.path.isfile(temp_filename):
os.remove(temp_filename)
@@ -2836,6 +3090,7 @@ def generate_video(
return
finally:
trans.previous_residual = None
+ trans.previous_modulated_input = None
if trans.enable_teacache:
print(f"Teacache Skipped Steps:{trans.teacache_skipped_steps}/{trans.num_steps}" )
@@ -2954,7 +3209,8 @@ def generate_video(
inputs = get_function_arguments(generate_video, locals())
inputs.pop("send_cmd")
- inputs.pop("task_id")
+ inputs.pop("task")
+ inputs["prompt"] = "\n".join(prompts)
configs = prepare_inputs_dict("metadata", inputs)
configs["generation_time"] = round(end_time-start_time)
metadata_choice = server_config.get("metadata_type","metadata")
@@ -2992,33 +3248,197 @@ def prepare_generate_video(state):
def generate_preview(latents):
import einops
- latent_channels = 16
- latent_dimensions = 3
- latents = latents.unsqueeze(0)
- latent_rgb_factors = [
- [-0.1299, -0.1692, 0.2932],
- [ 0.0671, 0.0406, 0.0442],
- [ 0.3568, 0.2548, 0.1747],
- [ 0.0372, 0.2344, 0.1420],
- [ 0.0313, 0.0189, -0.0328],
- [ 0.0296, -0.0956, -0.0665],
- [-0.3477, -0.4059, -0.2925],
- [ 0.0166, 0.1902, 0.1975],
- [-0.0412, 0.0267, -0.1364],
- [-0.1293, 0.0740, 0.1636],
- [ 0.0680, 0.3019, 0.1128],
- [ 0.0032, 0.0581, 0.0639],
- [-0.1251, 0.0927, 0.1699],
- [ 0.0060, -0.0633, 0.0005],
- [ 0.3477, 0.2275, 0.2950],
- [ 0.1984, 0.0913, 0.1861]
- ]
+ model_family = get_model_family(transformer_filename)
+ if model_family == "wan":
+ latent_channels = 16
+ latent_dimensions = 3
+ latent_rgb_factors = [
+ [-0.1299, -0.1692, 0.2932],
+ [ 0.0671, 0.0406, 0.0442],
+ [ 0.3568, 0.2548, 0.1747],
+ [ 0.0372, 0.2344, 0.1420],
+ [ 0.0313, 0.0189, -0.0328],
+ [ 0.0296, -0.0956, -0.0665],
+ [-0.3477, -0.4059, -0.2925],
+ [ 0.0166, 0.1902, 0.1975],
+ [-0.0412, 0.0267, -0.1364],
+ [-0.1293, 0.0740, 0.1636],
+ [ 0.0680, 0.3019, 0.1128],
+ [ 0.0032, 0.0581, 0.0639],
+ [-0.1251, 0.0927, 0.1699],
+ [ 0.0060, -0.0633, 0.0005],
+ [ 0.3477, 0.2275, 0.2950],
+ [ 0.1984, 0.0913, 0.1861]
+ ]
- # credits for the rgb factors to ComfyUI ?
-
- latent_rgb_factors_bias = [-0.1835, -0.0868, -0.3360]
+ # credits for the rgb factors to ComfyUI ?
+
+ latent_rgb_factors_bias = [-0.1835, -0.0868, -0.3360]
+
+ # latent_rgb_factors_bias = [0.0259, -0.0192, -0.0761]
+ elif model_family == "ltxv":
+ latent_channels = 128
+ latent_dimensions = 3
+
+ latent_rgb_factors = [
+ [ 1.1202e-02, -6.3815e-04, -1.0021e-02],
+ [ 8.6031e-02, 6.5813e-02, 9.5409e-04],
+ [-1.2576e-02, -7.5734e-03, -4.0528e-03],
+ [ 9.4063e-03, -2.1688e-03, 2.6093e-03],
+ [ 3.7636e-03, 1.2765e-02, 9.1548e-03],
+ [ 2.1024e-02, -5.2973e-03, 3.4373e-03],
+ [-8.8896e-03, -1.9703e-02, -1.8761e-02],
+ [-1.3160e-02, -1.0523e-02, 1.9709e-03],
+ [-1.5152e-03, -6.9891e-03, -7.5810e-03],
+ [-1.7247e-03, 4.6560e-04, -3.3839e-03],
+ [ 1.3617e-02, 4.7077e-03, -2.0045e-03],
+ [ 1.0256e-02, 7.7318e-03, 1.3948e-02],
+ [-1.6108e-02, -6.2151e-03, 1.1561e-03],
+ [ 7.3407e-03, 1.5628e-02, 4.4865e-04],
+ [ 9.5357e-04, -2.9518e-03, -1.4760e-02],
+ [ 1.9143e-02, 1.0868e-02, 1.2264e-02],
+ [ 4.4575e-03, 3.6682e-05, -6.8508e-03],
+ [-4.5681e-04, 3.2570e-03, 7.7929e-03],
+ [ 3.3902e-02, 3.3405e-02, 3.7454e-02],
+ [-2.3001e-02, -2.4877e-03, -3.1033e-03],
+ [ 5.0265e-02, 3.8841e-02, 3.3539e-02],
+ [-4.1018e-03, -1.1095e-03, 1.5859e-03],
+ [-1.2689e-01, -1.3107e-01, -2.1005e-01],
+ [ 2.6276e-02, 1.4189e-02, -3.5963e-03],
+ [-4.8679e-03, 8.8486e-03, 7.8029e-03],
+ [-1.6610e-03, -4.8597e-03, -5.2060e-03],
+ [-2.1010e-03, 2.3610e-03, 9.3796e-03],
+ [-2.2482e-02, -2.1305e-02, -1.5087e-02],
+ [-1.5753e-02, -1.0646e-02, -6.5083e-03],
+ [-4.6975e-03, 5.0288e-03, -6.7390e-03],
+ [ 1.1951e-02, 2.0712e-02, 1.6191e-02],
+ [-6.3704e-03, -8.4827e-03, -9.5483e-03],
+ [ 7.2610e-03, -9.9326e-03, -2.2978e-02],
+ [-9.1904e-04, 6.2882e-03, 9.5720e-03],
+ [-3.7178e-02, -3.7123e-02, -5.6713e-02],
+ [-1.3373e-01, -1.0720e-01, -5.3801e-02],
+ [-5.3702e-03, 8.1256e-03, 8.8397e-03],
+ [-1.5247e-01, -2.1437e-01, -2.1843e-01],
+ [ 3.1441e-02, 7.0335e-03, -9.7541e-03],
+ [ 2.1528e-03, -8.9817e-03, -2.1023e-02],
+ [ 3.8461e-03, -5.8957e-03, -1.5014e-02],
+ [-4.3470e-03, -1.2940e-02, -1.5972e-02],
+ [-5.4781e-03, -1.0842e-02, -3.0204e-03],
+ [-6.5347e-03, 3.0806e-03, -1.0163e-02],
+ [-5.0414e-03, -7.1503e-03, -8.9686e-04],
+ [-8.5851e-03, -2.4351e-03, 1.0674e-03],
+ [-9.0016e-03, -9.6493e-03, 1.5692e-03],
+ [ 5.0914e-03, 1.2099e-02, 1.9968e-02],
+ [ 1.3758e-02, 1.1669e-02, 8.1958e-03],
+ [-1.0518e-02, -1.1575e-02, -4.1307e-03],
+ [-2.8410e-02, -3.1266e-02, -2.2149e-02],
+ [ 2.9336e-03, 3.6511e-02, 1.8717e-02],
+ [-1.6703e-02, -1.6696e-02, -4.4529e-03],
+ [ 4.8818e-02, 4.0063e-02, 8.7410e-03],
+ [-1.5066e-02, -5.7328e-04, 2.9785e-03],
+ [-1.7613e-02, -8.1034e-03, 1.3086e-02],
+ [-9.2633e-03, 1.0803e-02, -6.3489e-03],
+ [ 3.0851e-03, 4.7750e-04, 1.2347e-02],
+ [-2.2785e-02, -2.3043e-02, -2.6005e-02],
+ [-2.4787e-02, -1.5389e-02, -2.2104e-02],
+ [-2.3572e-02, 1.0544e-03, 1.2361e-02],
+ [-7.8915e-03, -1.2271e-03, -6.0968e-03],
+ [-1.1478e-02, -1.2543e-03, 6.2679e-03],
+ [-5.4229e-02, 2.6644e-02, 6.3394e-03],
+ [ 4.4216e-03, -7.3338e-03, -1.0464e-02],
+ [-4.5013e-03, 1.6082e-03, 1.4420e-02],
+ [ 1.3673e-02, 8.8877e-03, 4.1253e-03],
+ [-1.0145e-02, 9.0072e-03, 1.5695e-02],
+ [-5.6234e-03, 1.1847e-03, 8.1261e-03],
+ [-3.7171e-03, -5.3538e-03, 1.2590e-03],
+ [ 2.9476e-02, 2.1424e-02, 3.0424e-02],
+ [-3.4925e-02, -2.4340e-02, -2.5316e-02],
+ [-3.4127e-02, -2.2406e-02, -1.0589e-02],
+ [-1.7342e-02, -1.3249e-02, -1.0719e-02],
+ [-2.1478e-03, -8.6051e-03, -2.9878e-03],
+ [ 1.2089e-03, -4.2391e-03, -6.8569e-03],
+ [ 9.0411e-04, -6.6886e-03, -6.7547e-05],
+ [ 1.6048e-02, -1.0057e-02, -2.8929e-02],
+ [ 1.2290e-03, 1.0163e-02, 1.8861e-02],
+ [ 1.7264e-02, 2.7257e-04, 1.3785e-02],
+ [-1.3482e-02, -3.6427e-03, 6.7481e-04],
+ [ 4.6782e-03, -5.2423e-03, 2.4467e-03],
+ [-5.9113e-03, -6.2244e-03, -1.8162e-03],
+ [ 1.5496e-02, 1.4582e-02, 1.9514e-03],
+ [ 7.4958e-03, 1.5886e-03, -8.2305e-03],
+ [ 1.9086e-02, 1.6360e-03, -3.9674e-03],
+ [-5.7021e-03, -2.7307e-03, -4.1066e-03],
+ [ 1.7450e-03, 1.4602e-02, 2.5794e-02],
+ [-8.2788e-04, 2.2902e-03, 4.5161e-03],
+ [ 1.1632e-02, 8.9193e-03, -7.2813e-03],
+ [ 7.5721e-03, 2.6784e-03, 1.1393e-02],
+ [ 5.1939e-03, 3.6903e-03, 1.4049e-02],
+ [-1.8383e-02, -2.2529e-02, -2.4477e-02],
+ [ 5.8842e-04, -5.7874e-03, -1.4770e-02],
+ [-1.6125e-02, -8.6101e-03, -1.4533e-02],
+ [ 2.0540e-02, 2.0729e-02, 6.4338e-03],
+ [ 3.3587e-03, -1.1226e-02, -1.6444e-02],
+ [-1.4742e-03, -1.0489e-02, 1.7097e-03],
+ [ 2.8130e-02, 2.3546e-02, 3.2791e-02],
+ [-1.8532e-02, -1.2842e-02, -8.7756e-03],
+ [-8.0533e-03, -1.0771e-02, -1.7536e-02],
+ [-3.9009e-03, 1.6150e-02, 3.3359e-02],
+ [-7.4554e-03, -1.4154e-02, -6.1910e-03],
+ [ 3.4734e-03, -1.1370e-02, -1.0581e-02],
+ [ 1.1476e-02, 3.9281e-03, 2.8231e-03],
+ [ 7.1639e-03, -1.4741e-03, -3.8066e-03],
+ [ 2.2250e-03, -8.7552e-03, -9.5719e-03],
+ [ 2.4146e-02, 2.1696e-02, 2.8056e-02],
+ [-5.4365e-03, -2.4291e-02, -1.7802e-02],
+ [ 7.4263e-03, 1.0510e-02, 1.2705e-02],
+ [ 6.2669e-03, 6.2658e-03, 1.9211e-02],
+ [ 1.6378e-02, 9.4933e-03, 6.6971e-03],
+ [ 1.7173e-02, 2.3601e-02, 2.3296e-02],
+ [-1.4568e-02, -9.8279e-03, -1.1556e-02],
+ [ 1.4431e-02, 1.4430e-02, 6.6362e-03],
+ [-6.8230e-03, 1.8863e-02, 1.4555e-02],
+ [ 6.1156e-03, 3.4700e-03, -2.6662e-03],
+ [-2.6983e-03, -5.9402e-03, -9.2276e-03],
+ [ 1.0235e-02, 7.4173e-03, -7.6243e-03],
+ [-1.3255e-02, 1.9322e-02, -9.2153e-04],
+ [ 2.4222e-03, -4.8039e-03, -1.5759e-02],
+ [ 2.6244e-02, 2.5951e-02, 2.0249e-02],
+ [ 1.5711e-02, 1.8498e-02, 2.7407e-03],
+ [-2.1714e-03, 4.7214e-03, -2.2443e-02],
+ [-7.4747e-03, 7.4166e-03, 1.4430e-02],
+ [-8.3906e-03, -7.9776e-03, 9.7927e-03],
+ [ 3.8321e-02, 9.6622e-03, -1.9268e-02],
+ [-1.4605e-02, -6.7032e-03, 3.9675e-03]
+ ]
+ latent_rgb_factors_bias = [-0.0571, -0.1657, -0.2512]
+
+ elif model_family == "hunyuan":
+ latent_channels = 16
+ latent_dimensions = 3
+ scale_factor = 0.476986
+ latent_rgb_factors = [
+ [-0.0395, -0.0331, 0.0445],
+ [ 0.0696, 0.0795, 0.0518],
+ [ 0.0135, -0.0945, -0.0282],
+ [ 0.0108, -0.0250, -0.0765],
+ [-0.0209, 0.0032, 0.0224],
+ [-0.0804, -0.0254, -0.0639],
+ [-0.0991, 0.0271, -0.0669],
+ [-0.0646, -0.0422, -0.0400],
+ [-0.0696, -0.0595, -0.0894],
+ [-0.0799, -0.0208, -0.0375],
+ [ 0.1166, 0.1627, 0.0962],
+ [ 0.1165, 0.0432, 0.0407],
+ [-0.2315, -0.1920, -0.1355],
+ [-0.0270, 0.0401, -0.0821],
+ [-0.0616, -0.0997, -0.0727],
+ [ 0.0249, -0.0469, -0.1703]
+ ]
- latent_rgb_factors_bias = [0.0259, -0.0192, -0.0761]
+ latent_rgb_factors_bias = [ 0.0259, -0.0192, -0.0761]
+ else:
+ raise Exception("preview not supported")
+ latents = latents.unsqueeze(0)
nb_latents = latents.shape[2]
latents_to_preview = 4
latents_to_preview = min(nb_latents, latents_to_preview)
@@ -3034,15 +3454,15 @@ def generate_preview(latents):
bias = torch.tensor(latent_rgb_factors_bias, device=latents.device, dtype=latents.dtype)
images = torch.nn.functional.conv3d(latents, weight, bias=bias, stride=1, padding=0, dilation=1, groups=1)
- images = images.clamp(0.0, 1.0)
-
-
- images = (images * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
+ images = images.add_(1.0).mul_(127.5)
+ images = images.detach().cpu().numpy().clip(0, 255).astype(np.uint8)
images = einops.rearrange(images, 'b c t h w -> (b h) (t w) c')
h, w, _ = images.shape
scale = 200 / h
images= Image.fromarray(images)
images = images.resize(( int(w*scale),int(h*scale)), resample=Image.Resampling.BILINEAR)
+ if images != None:  # NOTE(review): `images` was just returned by Image.resize above, so this guard looks always true - confirm
+ images.save("prepreview.png")  # NOTE(review): saves the resized preview to a relative path before returning; looks like a debugging leftover - confirm before merging
return images
@@ -3078,7 +3498,6 @@ def process_tasks(state):
global gen_in_progress
gen_in_progress = True
gen["in_progress"] = True
-
gen["preview"] = None
gen["status"] = "Generating Video"
yield time.time(), time.time()
@@ -3094,7 +3513,7 @@ def process_tasks(state):
send_cmd = com_stream.output_queue.push
def generate_video_error_handler():
try:
- generate_video(task_id, send_cmd, **params)
+ generate_video(task, send_cmd, **params)
except Exception as e:
tb = traceback.format_exc().split('\n')[:-1]
print('\n'.join(tb))
@@ -3127,6 +3546,7 @@ def process_tasks(state):
gen["progress_args"] = data
# progress(*data)
elif cmd == "preview":
+ torch.cuda.current_stream().synchronize()
preview= None if data== None else generate_preview(data)
gen["preview"] = preview
yield time.time() , gr.Text()
@@ -3339,7 +3759,7 @@ def refresh_lora_list(state, lset_name, loras_choices):
lset_name =""
if wan_model != None:
- errors = getattr(wan_model.model, "_loras_errors", "")
+ errors = getattr(get_transformer_model(wan_model), "_loras_errors", "")
if errors !=None and len(errors) > 0:
error_files = [path for path, _ in errors]
gr.Info("Error while refreshing Lora List, invalid Lora files: " + ", ".join(error_files))
@@ -3531,7 +3951,7 @@ def prepare_inputs_dict(target, inputs ):
inputs.pop(k)
model_filename = state["model_filename"]
- inputs["type"] = "Wan2.1GP by DeepBeepMeep - " + get_model_name(model_filename)
+ inputs["type"] = "WanGP by DeepBeepMeep - " + get_model_name(model_filename)
if target == "settings":
return inputs
@@ -3539,21 +3959,23 @@ def prepare_inputs_dict(target, inputs ):
if not test_class_i2v(model_filename):
inputs.pop("image_prompt_type")
+ if not server_config.get("enhancer_enabled", 0) == 1:
+ inputs.pop("prompt_enhancer")
if not "recam" in model_filename or not "diffusion_forcing" in model_filename:
inputs.pop("model_mode")
- if not "Vace" in model_filename or not "phantom" in model_filename:
+ if not ("Vace" in model_filename or "phantom" in model_filename or "hunyuan_video_custom" in model_filename):  # drop guide/ref params only when the model is none of these (was "not A or not B or not C", true for almost any filename)
unsaved_params = ["keep_frames_video_guide", "video_prompt_type", "remove_background_image_ref"]
for k in unsaved_params:
inputs.pop(k)
- if not "diffusion_forcing" in model_filename:
+ if not ("diffusion_forcing" in model_filename or "ltxv" in model_filename):
unsaved_params = ["keep_frames_video_source"]
for k in unsaved_params:
inputs.pop(k)
- if not "Vace" in model_filename or "diffusion_forcing" in model_filename:
+ if not ("Vace" in model_filename or "diffusion_forcing" in model_filename or "ltxv" in model_filename):  # keep sliding window params for every model that uses sliding windows (the unparenthesized form dropped them for diffusion_forcing and ltxv)
unsaved_params = [ "sliding_window_size", "sliding_window_overlap", "sliding_window_overlap_noise", "sliding_window_discard_last_frames"]
for k in unsaved_params:
inputs.pop(k)
@@ -3620,6 +4042,7 @@ def save_inputs(
slg_end_perc,
cfg_star_switch,
cfg_zero_step,
+ prompt_enhancer,
state,
):
@@ -3643,7 +4066,7 @@ def save_inputs(
def download_loras():
from huggingface_hub import snapshot_download
yield gr.Row(visible=True), "Please wait while the Loras are being downloaded", *[gr.Column(visible=False)] * 2
- lora_dir = get_lora_dir(get_model_filename("i2v", transformer_quantization))
+ lora_dir = get_lora_dir(get_model_filename("i2v"))
log_path = os.path.join(lora_dir, "log.txt")
if not os.path.isfile(log_path):
tmp_path = os.path.join(lora_dir, "tmp_lora_dowload")
@@ -3720,7 +4143,7 @@ def handle_celll_selection(state, evt: gr.SelectData):
def change_model(state, model_choice):
if model_choice == None:
return
- model_filename = get_model_filename(model_choice, transformer_quantization)
+ model_filename = get_model_filename(model_choice, transformer_quantization, transformer_dtype_policy)
state["model_filename"] = model_filename
header = generate_header(model_filename, compile=compile, attention_mode=attention_mode)
return header
@@ -3908,29 +4331,47 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
state = gr.State(state_dict)
trigger_refresh_input_type = gr.Text(interactive= False, visible= False)
diffusion_forcing = "diffusion_forcing" in model_filename
+ ltxv = "ltxv" in model_filename
recammaster = "recam" in model_filename
vace = "Vace" in model_filename
phantom = "phantom" in model_filename
fantasy = "fantasy" in model_filename
- with gr.Column(visible= test_class_i2v(model_filename) or diffusion_forcing or recammaster) as image_prompt_column:
- if diffusion_forcing:
+ hunyuan_t2v = "hunyuan_video_720" in model_filename
+ hunyuan_i2v = "hunyuan_video_i2v" in model_filename
+ hunyuan_video_custom = "hunyuan_video_custom" in model_filename
+
+
+
+ with gr.Column(visible= test_class_i2v(model_filename) or diffusion_forcing or ltxv or recammaster) as image_prompt_column:
+ if diffusion_forcing or ltxv:
image_prompt_type_value= ui_defaults.get("image_prompt_type","S")
- image_prompt_type = gr.Radio( [("Start Video with Image", "S"),("Continue Video", "V"),("Text Prompt Only", "T")], value =image_prompt_type_value, label="Location", show_label= False, visible= True, scale= 3)
+ # image_prompt_type = gr.Radio( [("Start Video with Image", "S"),("Start and End Video with Images", "SE"), ("Continue Video", "V"),("Text Prompt Only", "T")], value =image_prompt_type_value, label="Location", show_label= False, visible= True, scale= 3)
+ image_prompt_type = gr.Radio( [("Start Video with Image", "S"),("Continue Video", "V"),("Text Prompt Only", "T")], value =image_prompt_type_value, label="Location", show_label= False, visible= True , scale= 3)
+
# image_start = gr.Image(label= "Image as a starting point for a new video", type ="pil",value= ui_defaults.get("image_start", None), visible= "S" in image_prompt_type_value )
image_start = gr.Gallery(
label="Images as starting points for new videos", type ="pil", #file_types= "image",
columns=[3], rows=[1], object_fit="contain", height="auto", selected_index=0, interactive= True, value= ui_defaults.get("image_start", None), visible= "S" in image_prompt_type_value)
- image_end = gr.Gallery(visible=False)
+ image_end = gr.Gallery(
+ label="Images as ending points for new videos", type ="pil", #file_types= "image",
+ columns=[3], rows=[1], object_fit="contain", height="auto", selected_index=0, interactive= True, visible="E" in image_prompt_type_value, value= ui_defaults.get("image_end", None))
video_source = gr.Video(label= "Video to Continue", visible= "V" in image_prompt_type_value, value= ui_defaults.get("video_source", None),)
- model_mode = gr.Dropdown(
- choices=[
- ("Synchronous", 0),
- ("Asynchronous (better quality but around 50% extra steps added)", 5),
- ],
- value=ui_defaults.get("model_mode", 0),
- label="Generation Type", scale = 3,
- visible= True
- )
+ if ltxv:
+ model_mode = gr.Dropdown(
+ choices=[
+ ],
+ visible= False
+ )
+ else:
+ model_mode = gr.Dropdown(
+ choices=[
+ ("Synchronous", 0),
+ ("Asynchronous (better quality but around 50% extra steps added)", 5),
+ ],
+ value=ui_defaults.get("model_mode", 0),
+ label="Generation Type", scale = 3,
+ visible= True
+ )
keep_frames_video_source = gr.Text(value=ui_defaults.get("keep_frames_video_source","") , visible= "V" in image_prompt_type_value, scale = 2, label= "Truncate Video beyond this number of Frames of Video (empty=Keep All)" )
elif recammaster:
image_prompt_type = gr.Radio(visible= False)
@@ -3957,7 +4398,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
keep_frames_video_source = gr.Text(visible=False)
else:
image_prompt_type_value= ui_defaults.get("image_prompt_type","S")
- image_prompt_type = gr.Radio( [("Use only a Start Image", "S"),("Use both a Start and an End Image", "SE")], value =image_prompt_type_value, label="Location", show_label= False, visible= True, scale= 3)
+ image_prompt_type = gr.Radio( [("Use only a Start Image", "S"),("Use both a Start and an End Image", "SE")], value =image_prompt_type_value, label="Location", show_label= False, visible= not hunyuan_i2v, scale= 3)
image_start = gr.Gallery(
label="Images as starting points for new videos", type ="pil", #file_types= "image",
@@ -3971,7 +4412,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
model_mode = gr.Dropdown(visible=False)
keep_frames_video_source = gr.Text(visible=False)
- with gr.Column(visible= vace or phantom) as video_prompt_column:
+ with gr.Column(visible= vace or phantom or hunyuan_video_custom) as video_prompt_column:
video_prompt_type_value= ui_defaults.get("video_prompt_type","")
video_prompt_type = gr.Text(value= video_prompt_type_value, visible= False)
with gr.Row():
@@ -3984,7 +4425,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
("Recolorize the Control Video", "CV"),
# ("Alternate Video Ending", "OV"),
("Video contains Open Pose, Depth, Black & White, Inpainting ", "V"),
- ("Control Video and Mask video for stronger Inpainting ", "MV"),
+ ("Control Video and Mask video for Inpainting ", "MV"),
],
value=filter_letters(video_prompt_type_value, "ODPCMV"),
label="Video to Video", scale = 3, visible= True
@@ -4049,6 +4490,18 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
wizard_prompt = gr.Textbox(visible = not advanced_prompt, label="Prompts (each new line of prompt will generate a new video, # lines = comments)", value=default_wizard_prompt, lines=3)
wizard_prompt_activated_var = gr.Text(wizard_prompt_activated, visible= False)
wizard_variables_var = gr.Text(wizard_variables, visible = False)
+ with gr.Row(visible= server_config.get("enhancer_enabled", 0) == 1 ) as prompt_enhancer_row:
+ prompt_enhancer = gr.Dropdown(
+ choices=[
+ ("Disabled", ""),
+ ("Based on Text Prompts", "T"),
+ ("Based on Image Prompts (such as Start Image and Reference Images)", "I"),
+ ("Based on both Text Prompts and Image Prompts", "TI"),
+ ],
+ value=ui_defaults.get("prompt_enhancer", ""),
+ label="Enhance Prompt using a LLM", scale = 3,
+ visible= True
+ )
with gr.Row():
if test_class_i2v(model_filename):
if server_config.get("fit_canvas", 0) == 1:
@@ -4062,13 +4515,13 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
# 720p
("1280x720 (16:9, 720p)", "1280x720"),
("720x1280 (9:16, 720p)", "720x1280"),
- ("1024x1024 (4:3, 720p)", "1024x024"),
+ ("1024x1024 (1:1, 720p)", "1024x1024"),  # value fixed: was "1024x024", which does not match the 1024x1024 label
("832x1104 (3:4, 720p)", "832x1104"),
- ("1104x832 (3:4, 720p)", "1104x832"),
+ ("1104x832 (4:3, 720p)", "1104x832"),
("960x960 (1:1, 720p)", "960x960"),
# 480p
("960x544 (16:9, 540p)", "960x544"),
- ("544x960 (16:9, 540p)", "544x960"),
+ ("544x960 (9:16, 540p)", "544x960"),
("832x480 (16:9, 480p)", "832x480"),
("480x832 (9:16, 480p)", "480x832"),
("832x624 (4:3, 480p)", "832x624"),
@@ -4084,10 +4537,14 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
video_length = gr.Slider(5, 193, value=ui_defaults.get("video_length", 81), step=4, label="Number of frames (16 = 1s), locked", interactive= False)
elif diffusion_forcing:
video_length = gr.Slider(17, 737, value=ui_defaults.get("video_length", 97), step=20, label="Number of frames (24 = 1s)", interactive= True)
+ elif ltxv:
+ video_length = gr.Slider(17, 737, value=ui_defaults.get("video_length", 97), step=8, label="Number of frames (30 = 1s)", interactive= True)
elif vace:
video_length = gr.Slider(17, 737, value=ui_defaults.get("video_length", 81), step=4, label="Number of frames (16 = 1s)", interactive= True)
elif fantasy:
video_length = gr.Slider(5, 233, value=ui_defaults.get("video_length", 81), step=4, label="Number of frames (23 = 1s)", interactive= True)
+ elif hunyuan_t2v or hunyuan_i2v:
+ video_length = gr.Slider(5, 337, value=ui_defaults.get("video_length", 97), step=4, label="Number of frames (24 = 1s)", interactive= True)
else:
video_length = gr.Slider(5, 193, value=ui_defaults.get("video_length", 81), step=4, label="Number of frames (16 = 1s)", interactive= True)
with gr.Row():
@@ -4109,10 +4566,10 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
("Match images and text prompts", 1),
], visible= test_class_i2v(model_filename), label= "Multiple Images as Texts Prompts"
)
- with gr.Row():
- guidance_scale = gr.Slider(1.0, 20.0, value=ui_defaults.get("guidance_scale",5), step=0.5, label="Guidance Scale", visible=True)
+ with gr.Row(visible = not ltxv):
+ guidance_scale = gr.Slider(1.0, 20.0, value=ui_defaults.get("guidance_scale",5), step=0.5, label="Guidance Scale", visible=not (hunyuan_t2v or hunyuan_i2v))
audio_guidance_scale = gr.Slider(1.0, 20.0, value=ui_defaults.get("audio_guidance_scale",5), step=0.5, label="Audio Guidance", visible=fantasy)
- embedded_guidance_scale = gr.Slider(1.0, 20.0, value=6.0, step=0.5, label="Embedded Guidance Scale", visible=False)
+ embedded_guidance_scale = gr.Slider(1.0, 20.0, value=6.0, step=0.5, label="Embedded Guidance Scale", visible=(hunyuan_t2v or hunyuan_i2v))
flow_shift = gr.Slider(0.0, 25.0, value=ui_defaults.get("flow_shift",3), step=0.1, label="Shift Scale")
with gr.Row():
negative_prompt = gr.Textbox(label="Negative Prompt", value=ui_defaults.get("negative_prompt", "") )
@@ -4128,11 +4585,9 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
label="Activated Loras"
)
loras_multipliers = gr.Textbox(label="Loras Multipliers (1.0 by default) separated by space characters or carriage returns, line that starts with # are ignored", value=launch_multis_str)
- with gr.Row():
- gr.Markdown("Tea Cache accelerates by skipping intelligently some steps, the more steps are skipped the lower the quality of the video (Tea Cache consumes also VRAM)")
- with gr.Tab("Speed"):
+ with gr.Tab("Speed", visible = not ltxv) as speed_tab:
with gr.Column():
- gr.Markdown("Tea Cache accelerates the Video generation by skipping denoising steps. This may impact the quality")
+ gr.Markdown("Tea Cache accelerates by skipping intelligently some steps, the more steps are skipped the lower the quality of the video (Tea Cache consumes also VRAM)")
tea_cache_setting = gr.Dropdown(
choices=[
@@ -4176,7 +4631,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
label="Spatial Upsampling"
)
- with gr.Tab("Quality"):
+ with gr.Tab("Quality", visible = not ltxv) as quality_tab:
with gr.Row():
gr.Markdown("Skip Layer Guidance (improves video quality)")
with gr.Row():
@@ -4219,7 +4674,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
with gr.Row():
cfg_zero_step = gr.Slider(-1, 39, value=ui_defaults.get("cfg_zero_step",-1), step=1, label="CFG Zero below this Layer (Extra Process)")
- with gr.Tab("Sliding Window", visible= "Vace" in model_filename or "diffusion_forcing" in model_filename) as sliding_window_tab:
+ with gr.Tab("Sliding Window", visible= vace or diffusion_forcing or ltxv) as sliding_window_tab:
with gr.Column():
gr.Markdown("A Sliding Window allows you to generate video with a duration not limited by the Model")
@@ -4229,6 +4684,11 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
sliding_window_overlap = gr.Slider(17, 97, value=ui_defaults.get("sliding_window_overlap",17), step=20, label="Windows Frames Overlap (needed to maintain continuity between windows, a higher value will require more windows)")
sliding_window_overlap_noise = gr.Slider(0, 100, value=ui_defaults.get("sliding_window_overlap_noise",20), step=1, label="Noise to be added to overlapped frames to reduce blur effect")
sliding_window_discard_last_frames = gr.Slider(0, 20, value=ui_defaults.get("sliding_window_discard_last_frames", 0), step=4, visible = False)
+ elif ltxv:
+ sliding_window_size = gr.Slider(41, 257, value=ui_defaults.get("sliding_window_size", 129), step=8, label="Sliding Window Size")
+ sliding_window_overlap = gr.Slider(9, 97, value=ui_defaults.get("sliding_window_overlap",17), step=8, label="Windows Frames Overlap (needed to maintain continuity between windows, a higher value will require more windows)")
+ sliding_window_overlap_noise = gr.Slider(0, 100, value=ui_defaults.get("sliding_window_overlap_noise",20), step=1, label="Noise to be added to overlapped frames to reduce blur effect")
+ sliding_window_discard_last_frames = gr.Slider(0, 20, value=ui_defaults.get("sliding_window_discard_last_frames", 0), step=4, visible = False)
else:
sliding_window_size = gr.Slider(5, 137, value=ui_defaults.get("sliding_window_size", 81), step=4, label="Sliding Window Size")
sliding_window_overlap = gr.Slider(1, 97, value=ui_defaults.get("sliding_window_overlap",5), step=4, label="Windows Frames Overlap (needed to maintain continuity between windows, a higher value will require more windows)")
@@ -4236,7 +4696,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
sliding_window_discard_last_frames = gr.Slider(0, 20, value=ui_defaults.get("sliding_window_discard_last_frames", 8), step=4, label="Discard Last Frames of a Window (that may have bad quality)", visible = True)
- with gr.Tab("Miscellaneous", visible= not "recam" in model_filename):
+ with gr.Tab("Miscellaneous", visible= not (recammaster or ltxv or diffusion_forcing)) as misc_tab:
gr.Markdown("With Riflex you can generate videos longer than 5s which is the default duration of videos used to train the model")
RIFLEx_setting = gr.Dropdown(
choices=[
@@ -4300,7 +4760,8 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
single_hidden_trigger_btn = gr.Button("trigger_countdown", visible=False, elem_id="trigger_info_single_btn")
extra_inputs = prompt_vars + [wizard_prompt, wizard_variables_var, wizard_prompt_activated_var, video_prompt_column, image_prompt_column,
- prompt_column_advanced, prompt_column_wizard_vars, prompt_column_wizard, lset_name, advanced_row, sliding_window_tab,
+ prompt_column_advanced, prompt_column_wizard_vars, prompt_column_wizard, lset_name, advanced_row, speed_tab, quality_tab,
+ sliding_window_tab, misc_tab, prompt_enhancer_row,
video_prompt_type_video_guide, video_prompt_type_image_refs] # show_advanced presets_column,
if update_form:
locals_dict = locals()
@@ -4537,7 +4998,7 @@ def generate_video_tab(update_form = False, state_dict = None, ui_defaults = Non
)
return ( state, loras_choices, lset_name, state,
- video_guide, video_mask, video_prompt_video_guide_trigger
+ video_guide, video_mask, video_prompt_video_guide_trigger, prompt_enhancer
)
@@ -4555,7 +5016,7 @@ def generate_download_tab(lset_name,loras_choices, state):
download_loras_btn.click(fn=download_loras, inputs=[], outputs=[download_status_row, download_status]).then(fn=refresh_lora_list, inputs=[state, lset_name,loras_choices], outputs=[lset_name, loras_choices])
-def generate_configuration_tab(state, blocks, header, model_choice):
+def generate_configuration_tab(state, blocks, header, model_choice, prompt_enhancer_row):
gr.Markdown("Please click Apply Changes at the bottom so that the changes are effective. Some choices below may be locked if the app has been launched by specifying a config preset.")
with gr.Column():
model_list = []
@@ -4565,7 +5026,7 @@ def generate_configuration_tab(state, blocks, header, model_choice):
# with gr.Row(visible=advanced_ui) as advanced_row:
with gr.Tab("General"):
for model_type in model_types:
- choice = get_model_filename(model_type, transformer_quantization)
+ choice = get_model_filename(model_type, transformer_quantization, transformer_dtype_policy)
model_list.append(choice)
dropdown_choices = [ ( get_model_name(choice), get_model_type(choice) ) for choice in model_list]
transformer_types_choices = gr.Dropdown(
@@ -4636,12 +5097,21 @@ def generate_configuration_tab(state, blocks, header, model_choice):
label="Keep Previously Generated Videos when starting a new Generation Batch"
)
+ enhancer_enabled_choice = gr.Dropdown(
+ choices=[
+ ("On", 1),
+ ("Off", 0),
+ ],
+ value=server_config.get("enhancer_enabled", 0),
+ label="Prompt Enhancer (if enabled, 8 GB of extra models will be downloaded)"
+ )
+
UI_theme_choice = gr.Dropdown(
choices=[
("Blue Sky", "default"),
("Classic Gradio", "gradio"),
],
- value=server_config.get("UI_theme_choice", "default"),
+ value=server_config.get("UI_theme", "default"),
label="User Interface Theme. You will need to restart the App the see new Theme."
)
@@ -4658,9 +5128,19 @@ def generate_configuration_tab(state, blocks, header, model_choice):
("16 bits (no quantization)", "bf16"),
],
value= transformer_quantization,
- label="Wan Transformer Model Quantization Type (if available)",
+ label="Transformer Model Quantization Type (if available)",
)
+ transformer_dtype_policy_choice = gr.Dropdown(
+ choices=[
+ ("Best Supported Data Type by Hardware", ""),
+ ("FP16", "fp16"),
+ ("BF16", "bf16"),
+ ],
+ value= server_config.get("transformer_dtype_policy", ""),
+ label="Transformer Data Type (if available)"
+ )
+
mixed_precision_choice = gr.Dropdown(
choices=[
("16 bits only, requires less VRAM", "0"),
@@ -4670,14 +5150,13 @@ def generate_configuration_tab(state, blocks, header, model_choice):
label="Transformer Engine Calculation"
)
- index = text_encoder_choices.index(text_encoder_filename)
- index = 0 if index ==0 else index
- text_encoder_choice = gr.Dropdown(
+
+ text_encoder_quantization_choice = gr.Dropdown(
choices=[
- ("UMT5 XXL 16 bits - unquantized text encoder, better quality uses more RAM", 0),
- ("UMT5 XXL quantized to 8 bits - quantized text encoder, slightly worse quality but uses less RAM", 1),
+ ("16 bits - unquantized text encoder, better quality uses more RAM", "bf16"),
+ ("8 bits - quantized text encoder, slightly worse quality but uses less RAM", "int8"),
],
- value= index,
+ value= text_encoder_quantization,
label="Text Encoder model"
)
@@ -4693,8 +5172,8 @@ def generate_configuration_tab(state, blocks, header, model_choice):
gr.Text("Beware: when restarting the server or changing a resolution or video duration, the first step of generation for a duration / resolution may last a few minutes due to recompilation", interactive= False, show_label= False )
compile_choice = gr.Dropdown(
choices=[
- ("ON: works only on Linux / WSL", "transformer"),
- ("OFF: no other choice if you have Windows without using WSL", "" ),
+ ("On (requires to have Triton installed)", "transformer"),
+ ("Off", "" ),
],
value= compile,
label="Compile Transformer (up to 50% faster and 30% more frames but requires Linux / WSL and Flash or Sage attention)",
@@ -4744,7 +5223,8 @@ def generate_configuration_tab(state, blocks, header, model_choice):
inputs=[
state,
transformer_types_choices,
- text_encoder_choice,
+ transformer_dtype_policy_choice,
+ text_encoder_quantization_choice,
VAE_precision_choice,
mixed_precision_choice,
save_path_choice,
@@ -4758,9 +5238,10 @@ def generate_configuration_tab(state, blocks, header, model_choice):
clear_file_list_choice,
preload_model_policy_choice,
UI_theme_choice,
+ enhancer_enabled_choice,
fit_canvas_choice
],
- outputs= [msg , header, model_choice]
+ outputs= [msg , header, model_choice, prompt_enhancer_row]
)
def generate_about_tab():
@@ -4798,7 +5279,7 @@ def generate_dropdown_model_list():
dropdown_types.append(current_model_type)
model_list = []
for model_type in dropdown_types:
- choice = get_model_filename(model_type, transformer_quantization)
+ choice = get_model_filename(model_type, transformer_quantization, transformer_dtype_policy)
model_list.append(choice)
dropdown_choices = [ ( get_model_name(choice), get_model_type(choice) ) for choice in model_list]
return gr.Dropdown(
@@ -5142,7 +5623,7 @@ def create_demo():
theme = gr.themes.Soft(font=["Verdana"], primary_hue="sky", neutral_hue="slate", text_size="md")
with gr.Blocks(css=css, theme=theme, title= "Wan2GP") as main:
- gr.Markdown("WanGP v4.5 by DeepBeepMeep ") # (Updates)
")
+ gr.Markdown("WanGP v5.0 by DeepBeepMeep ") # (Updates)
")
global model_list
tab_state = gr.State({ "tab_no":0 })
@@ -5161,7 +5642,7 @@ def create_demo():
header = gr.Markdown(generate_header(transformer_filename, compile, attention_mode), visible= True)
with gr.Row():
( state, loras_choices, lset_name, state,
- video_guide, video_mask, video_prompt_type_video_trigger
+ video_guide, video_mask, video_prompt_type_video_trigger, prompt_enhancer_row
) = generate_video_tab(model_choice=model_choice, header=header, main = main)
with gr.Tab("Informations", id="info"):
generate_info_tab()
@@ -5174,7 +5655,7 @@ def create_demo():
with gr.Tab("Downloads", id="downloads") as downloads_tab:
generate_download_tab(lset_name, loras_choices, state)
with gr.Tab("Configuration", id="configuration"):
- generate_configuration_tab(state, main, header, model_choice)
+ generate_configuration_tab(state, main, header, model_choice, prompt_enhancer_row)
with gr.Tab("About"):
generate_about_tab()