wedyanessam committed
Commit 206dfd7 · verified · 1 parent: 5b038c7

Upload 40 files

Files changed (41)
  1. .gitattributes +4 -0
  2. FantasyTalking/.pre-commit-config.yaml +23 -0
  3. FantasyTalking/LICENSE +201 -0
  4. FantasyTalking/README.md +95 -0
  5. FantasyTalking/README_zh.md +94 -0
  6. FantasyTalking/app.py +314 -0
  7. FantasyTalking/assets/audios/woman.wav +3 -0
  8. FantasyTalking/assets/images/fig0_1_0.png +3 -0
  9. FantasyTalking/assets/images/woman.png +3 -0
  10. FantasyTalking/assets/overview.png +3 -0
  11. FantasyTalking/diffsynth/__init__.py +5 -0
  12. FantasyTalking/diffsynth/configs/__init__.py +0 -0
  13. FantasyTalking/diffsynth/configs/model_config.py +1577 -0
  14. FantasyTalking/diffsynth/data/__init__.py +1 -0
  15. FantasyTalking/diffsynth/data/video.py +188 -0
  16. FantasyTalking/diffsynth/models/__init__.py +1 -0
  17. FantasyTalking/diffsynth/models/downloader.py +124 -0
  18. FantasyTalking/diffsynth/models/model_manager.py +582 -0
  19. FantasyTalking/diffsynth/models/utils.py +217 -0
  20. FantasyTalking/diffsynth/models/wan_video_dit.py +998 -0
  21. FantasyTalking/diffsynth/models/wan_video_image_encoder.py +960 -0
  22. FantasyTalking/diffsynth/models/wan_video_text_encoder.py +289 -0
  23. FantasyTalking/diffsynth/models/wan_video_vae.py +948 -0
  24. FantasyTalking/diffsynth/pipelines/__init__.py +1 -0
  25. FantasyTalking/diffsynth/pipelines/base.py +173 -0
  26. FantasyTalking/diffsynth/pipelines/wan_video.py +389 -0
  27. FantasyTalking/diffsynth/prompters/__init__.py +1 -0
  28. FantasyTalking/diffsynth/prompters/base_prompter.py +69 -0
  29. FantasyTalking/diffsynth/prompters/wan_prompter.py +114 -0
  30. FantasyTalking/diffsynth/schedulers/__init__.py +3 -0
  31. FantasyTalking/diffsynth/schedulers/continuous_ode.py +61 -0
  32. FantasyTalking/diffsynth/schedulers/ddim.py +138 -0
  33. FantasyTalking/diffsynth/schedulers/flow_match.py +97 -0
  34. FantasyTalking/diffsynth/vram_management/__init__.py +1 -0
  35. FantasyTalking/diffsynth/vram_management/layers.py +177 -0
  36. FantasyTalking/infer.py +236 -0
  37. FantasyTalking/infer.sh +11 -0
  38. FantasyTalking/infer_24G.sh +12 -0
  39. FantasyTalking/model.py +228 -0
  40. FantasyTalking/requirements.txt +14 -0
  41. FantasyTalking/utils.py +52 -0
.gitattributes CHANGED
@@ -35,3 +35,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  TTS/IMG_6935.wav filter=lfs diff=lfs merge=lfs -text
37
  TTS_X/IMG_6935.wav filter=lfs diff=lfs merge=lfs -text
38
+ FantasyTalking/assets/audios/woman.wav filter=lfs diff=lfs merge=lfs -text
39
+ FantasyTalking/assets/images/fig0_1_0.png filter=lfs diff=lfs merge=lfs -text
40
+ FantasyTalking/assets/images/woman.png filter=lfs diff=lfs merge=lfs -text
41
+ FantasyTalking/assets/overview.png filter=lfs diff=lfs merge=lfs -text
FantasyTalking/.pre-commit-config.yaml ADDED
@@ -0,0 +1,23 @@
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v4.4.0
4
+ hooks:
5
+ - id: check-added-large-files
6
+ - id: check-yaml
7
+ - id: end-of-file-fixer
8
+ - id: trailing-whitespace
9
+
10
+ - repo: https://github.com/psf/black
11
+ rev: 23.10.0
12
+ hooks:
13
+ - id: black
14
+
15
+ - repo: https://github.com/pycqa/flake8
16
+ rev: 6.1.0
17
+ hooks:
18
+ - id: flake8
19
+
20
+ - repo: https://github.com/pycqa/isort
21
+ rev: 5.12.0
22
+ hooks:
23
+ - id: isort
FantasyTalking/LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright AMAP, Alibaba Group
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
FantasyTalking/README.md ADDED
@@ -0,0 +1,95 @@
1
+ [中文阅读](./README_zh.md)
2
+ # FantasyTalking: Realistic Talking Portrait Generation via Coherent Motion Synthesis
3
+
4
+ [![Home Page](https://img.shields.io/badge/Project-FantasyTalking-blue.svg)](https://fantasy-amap.github.io/fantasy-talking/)
5
+ [![arXiv](https://img.shields.io/badge/Arxiv-2504.04842-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2504.04842)
6
+ [![hf_paper](https://img.shields.io/badge/🤗-FantasyTalking-red.svg)](https://huggingface.co/acvlab/FantasyTalking)
7
+
8
+ ## 🔥 Latest News!!
9
+ * April 29, 2025: Our work has been merged into [ComfyUI-Wan](https://github.com/kijai/ComfyUI-WanVideoWrapper)! Thanks to [kijai](https://github.com/kijai) for the update 👏!
10
+ * April 28, 2025: We released the inference code and model weights for the audio condition.
11
+
12
+
13
+ ## Quickstart
14
+ ### 🛠️Installation
15
+
16
+ Clone the repo:
17
+
18
+ ```
19
+ git clone https://github.com/Fantasy-AMAP/fantasy-talking.git
20
+ cd fantasy-talking
21
+ ```
22
+
23
+ Install dependencies:
24
+ ```
25
+ # Ensure torch >= 2.0.0
26
+ pip install -r requirements.txt
27
+ # Optionally install flash_attn to accelerate attention computation
28
+ pip install flash_attn
29
+ ```
30
+
31
+ ### 🧱Model Download
32
+ | Models | Download Link | Notes |
33
+ | --------------|-------------------------------------------------------------------------------|-------------------------------|
34
+ | Wan2.1-I2V-14B-720P | 🤗 [Huggingface](https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P) 🤖 [ModelScope](https://www.modelscope.cn/models/Wan-AI/Wan2.1-I2V-14B-720P) | Base model
35
+ | Wav2Vec | 🤗 [Huggingface](https://huggingface.co/facebook/wav2vec2-base-960h) 🤖 [ModelScope](https://modelscope.cn/models/AI-ModelScope/wav2vec2-base-960h) | Audio encoder
36
+ | FantasyTalking model | 🤗 [Huggingface](https://huggingface.co/acvlab/FantasyTalking/) 🤖 [ModelScope](https://www.modelscope.cn/models/amap_cvlab/FantasyTalking/) | Our audio condition weights
37
+
38
+ Download models using huggingface-cli:
39
+ ``` sh
40
+ pip install "huggingface_hub[cli]"
41
+ huggingface-cli download Wan-AI/Wan2.1-I2V-14B-720P --local-dir ./models/Wan2.1-I2V-14B-720P
42
+ huggingface-cli download facebook/wav2vec2-base-960h --local-dir ./models/wav2vec2-base-960h
43
+ huggingface-cli download acvlab/FantasyTalking fantasytalking_model.ckpt --local-dir ./models
44
+ ```
45
+
46
+ Download models using modelscope-cli:
47
+ ``` sh
48
+ pip install modelscope
49
+ modelscope download Wan-AI/Wan2.1-I2V-14B-720P --local_dir ./models/Wan2.1-I2V-14B-720P
50
+ modelscope download AI-ModelScope/wav2vec2-base-960h --local_dir ./models/wav2vec2-base-960h
51
+ modelscope download amap_cvlab/FantasyTalking fantasytalking_model.ckpt --local_dir ./models
52
+ ```
53
+
54
+ ### 🔑 Inference
55
+ ``` sh
56
+ python infer.py --image_path ./assets/images/woman.png --audio_path ./assets/audios/woman.wav
57
+ ```
58
+ You can control the character's behavior through the prompt. **The recommended range for the prompt and audio CFG scales is 3-7. Increase the audio CFG scale for more consistent lip-sync.**
59
+ ``` sh
60
+ python infer.py --image_path ./assets/images/woman.png --audio_path ./assets/audios/woman.wav --prompt "The person is speaking enthusiastically, with their hands continuously waving." --prompt_cfg_scale 5.0 --audio_cfg_scale 5.0
61
+ ```
62
+
63
+ The table below details speed and memory usage, measured on a single A100 (512x512, 81 frames).
64
+
65
+ |`torch_dtype`|`num_persistent_param_in_dit`|Speed|Required VRAM|
66
+ |-|-|-|-|
67
+ |torch.bfloat16|None (unlimited)|15.5 s/it|40 GB|
68
+ |torch.bfloat16|7*10**9 (7B)|32.8 s/it|20 GB|
69
+ |torch.bfloat16|0|42.6 s/it|5 GB|
70
+
71
+ ### Gradio Demo
72
+ We host an [online demo](https://huggingface.co/spaces/acvlab/FantasyTalking) on Hugging Face.
73
+ To run the Gradio demo locally:
74
+ ``` sh
75
+ pip install gradio spaces
76
+ python app.py
77
+ ```
78
+
79
+ ## 🧩 Community Works
80
+ We ❤️ contributions from the open-source community! If your work has improved FantasyTalking, please let us know,
81
+ or e-mail us directly at [[email protected]](mailto://[email protected]). We are happy to reference your project for everyone's convenience.
82
+
83
+ ## 🔗Citation
84
+ If you find this repository useful, please consider giving it a star ⭐ and a citation:
85
+ ```
86
+ @article{wang2025fantasytalking,
87
+ title={FantasyTalking: Realistic Talking Portrait Generation via Coherent Motion Synthesis},
88
+ author={Wang, Mengchao and Wang, Qiang and Jiang, Fan and Fan, Yaqi and Zhang, Yunpeng and Qi, Yonggang and Zhao, Kun and Xu, Mu},
89
+ journal={arXiv preprint arXiv:2504.04842},
90
+ year={2025}
91
+ }
92
+ ```
93
+
94
+ ## Acknowledgments
95
+ Thanks to [Wan2.1](https://github.com/Wan-Video/Wan2.1), [HunyuanVideo](https://github.com/Tencent/HunyuanVideo), and [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio) for open-sourcing their models and code, which provided valuable references and support for this project. Their contributions to the open-source community are truly appreciated.
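
The VRAM table in the README above is keyed by `num_persistent_param_in_dit`. Below is a minimal, hedged sketch of how that knob is typically applied when building the pipeline, assuming the DiffSynth-style `ModelManager`/`WanVideoPipeline` API vendored under `FantasyTalking/diffsynth/`; the exact loading code lives in `infer.py`, which is not reproduced here, and the model file paths are placeholders.

```python
# Hedged sketch (not the repo's infer.py): trading speed for VRAM via
# num_persistent_param_in_dit, using the DiffSynth-style API vendored in this repo.
import torch
from diffsynth import ModelManager, WanVideoPipeline  # assumed to be exported

model_manager = ModelManager(device="cpu")  # stage weights on CPU first
model_manager.load_models(
    ["./models/Wan2.1-I2V-14B-720P"],  # hypothetical path to the downloaded weights
    torch_dtype=torch.bfloat16,
)
pipe = WanVideoPipeline.from_model_manager(model_manager, device="cuda")

# None      -> all DiT parameters stay on GPU (~40 GB, fastest)
# 7 * 10**9 -> roughly 7B parameters kept resident (~20 GB)
# 0         -> full offloading (~5 GB, slowest)
pipe.enable_vram_management(num_persistent_param_in_dit=7 * 10**9)
```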
FantasyTalking/README_zh.md ADDED
@@ -0,0 +1,94 @@
1
+ [中文阅读](./README_zh.md)
2
+ # FantasyTalking: Realistic Talking Portrait Generation via Coherent Motion Synthesis
3
+
4
+ [![Home Page](https://img.shields.io/badge/Project-FantasyTalking-blue.svg)](https://fantasy-amap.github.io/fantasy-talking/)
5
+ [![arXiv](https://img.shields.io/badge/Arxiv-2504.04842-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2504.04842)
6
+ [![hf_paper](https://img.shields.io/badge/🤗-FantasyTalking-red.svg)](https://huggingface.co/acvlab/FantasyTalking)
7
+
8
+ ## 🔥 Latest News!!
9
+ * 2025年4月29日: 我们的工作被加入到[ComfyUI-Wan](https://github.com/kijai/ComfyUI-WanVideoWrapper) ! 感谢 [kijai](https://github.com/kijai) 更新 👏!
10
+ * 2025年4月28日: 开源了音频条件下的推理代码和模型权重。
11
+
12
+
13
+ ## Quickstart
14
+ ### 🛠️ Installation and Dependencies
15
+
16
+ First, clone the git repo:
17
+
18
+ ```
19
+ git clone https://github.com/Fantasy-AMAP/fantasy-talking.git
20
+ cd fantasy-talking
21
+ ```
22
+
23
+ Install dependencies:
24
+ ```
25
+ # Ensure torch >= 2.0.0
26
+ pip install -r requirements.txt
27
+ # Optionally install flash_attn to accelerate attention computation
28
+ pip install flash_attn
29
+ ```
30
+
31
+ ### 🧱 Model Download
32
+ | Models | Download Link | Notes |
33
+ | --------------|-------------------------------------------------------------------------------|-------------------------------|
34
+ | Wan2.1-I2V-14B-720P | 🤗 [Huggingface](https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P) 🤖 [ModelScope](https://www.modelscope.cn/models/Wan-AI/Wan2.1-I2V-14B-720P) | Base model
35
+ | Wav2Vec | 🤗 [Huggingface](https://huggingface.co/facebook/wav2vec2-base-960h) 🤖 [ModelScope](https://modelscope.cn/models/AI-ModelScope/wav2vec2-base-960h) | Audio encoder
36
+ | FantasyTalking model | 🤗 [Huggingface](https://huggingface.co/acvlab/FantasyTalking/) 🤖 [ModelScope](https://www.modelscope.cn/models/amap_cvlab/FantasyTalking/) | Our audio condition weights
37
+
38
+ Download models using huggingface-cli:
39
+ ``` sh
40
+ pip install "huggingface_hub[cli]"
41
+ huggingface-cli download Wan-AI/Wan2.1-I2V-14B-720P --local-dir ./models/Wan2.1-I2V-14B-720P
42
+ huggingface-cli download facebook/wav2vec2-base-960h --local-dir ./models/wav2vec2-base-960h
43
+ huggingface-cli download acvlab/FantasyTalking fantasytalking_model.ckpt --local-dir ./models
44
+ ```
45
+
46
+ Download models using modelscope-cli:
47
+ ``` sh
48
+ pip install modelscope
49
+ modelscope download Wan-AI/Wan2.1-I2V-14B-720P --local_dir ./models/Wan2.1-I2V-14B-720P
50
+ modelscope download AI-ModelScope/wav2vec2-base-960h --local_dir ./models/wav2vec2-base-960h
51
+ modelscope download amap_cvlab/FantasyTalking fantasytalking_model.ckpt --local_dir ./models
52
+ ```
53
+
54
+ ### 🔑 Inference
55
+ ``` sh
56
+ python infer.py --image_path ./assets/images/woman.png --audio_path ./assets/audios/woman.wav
57
+ ```
58
+ You can control the character's behavior through the prompt. **The recommended range for the prompt and audio CFG scales is 3-7. Increase the audio CFG scale for more consistent lip-sync.**
59
+ ``` sh
60
+ python infer.py --image_path ./assets/images/woman.png --audio_path ./assets/audios/woman.wav --prompt "The person is speaking enthusiastically, with their hands continuously waving." --prompt_cfg_scale 5.0 --audio_cfg_scale 5.0
61
+ ```
62
+
63
+ The table below details speed and memory usage, measured on a single A100 (512x512, 81 frames).
64
+ |`torch_dtype`|`num_persistent_param_in_dit`|Speed|Required VRAM|
65
+ |-|-|-|-|
66
+ |torch.bfloat16|None (unlimited)|15.5 s/it|40 GB|
67
+ |torch.bfloat16|7*10**9 (7B)|32.8 s/it|20 GB|
68
+ |torch.bfloat16|0|42.6 s/it|5 GB|
69
+
70
+ ### Gradio Demo
71
+ We host an [online demo](https://huggingface.co/spaces/acvlab/FantasyTalking) on Hugging Face.
72
+
73
+ To run the Gradio demo locally:
74
+ ``` sh
75
+ pip install gradio spaces
76
+ python app.py
77
+ ```
78
+
79
+ ## 🧩 Community Works
80
+ We ❤️ contributions from the open-source community! If your work has improved FantasyTalking, please let us know.
81
+
82
+ ## 🔗Citation
83
+ If you find this repository useful, please consider giving it a star ⭐ and a citation:
84
+ ```
85
+ @article{wang2025fantasytalking,
86
+ title={FantasyTalking: Realistic Talking Portrait Generation via Coherent Motion Synthesis},
87
+ author={Wang, Mengchao and Wang, Qiang and Jiang, Fan and Fan, Yaqi and Zhang, Yunpeng and Qi, Yonggang and Zhao, Kun and Xu, Mu},
88
+ journal={arXiv preprint arXiv:2504.04842},
89
+ year={2025}
90
+ }
91
+ ```
92
+
93
+ ## Acknowledgments
94
+ Thanks to [Wan2.1](https://github.com/Wan-Video/Wan2.1), [HunyuanVideo](https://github.com/Tencent/HunyuanVideo), and [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio) for open-sourcing their models and code, which provided valuable references and support for this project. Their contributions to the open-source community are truly appreciated.
FantasyTalking/app.py ADDED
@@ -0,0 +1,314 @@
1
+ # Copyright Alibaba Inc. All Rights Reserved.
2
+
3
+ import argparse
4
+ from datetime import datetime
5
+ from pathlib import Path
6
+
7
+ import gradio as gr
8
+ import librosa
9
+
10
+ from infer import load_models, main
11
+
12
+ pipe, fantasytalking, wav2vec_processor, wav2vec = None, None, None, None
13
+
14
+
15
+ def generate_video(
16
+ image_path,
17
+ audio_path,
18
+ prompt,
19
+ prompt_cfg_scale,
20
+ audio_cfg_scale,
21
+ audio_weight,
22
+ image_size,
23
+ max_num_frames,
24
+ inference_steps,
25
+ seed,
26
+ ):
27
+ # Create the temp directory if it doesn't exist
28
+ output_dir = Path("./output")
29
+ output_dir.mkdir(parents=True, exist_ok=True)
30
+
31
+ # Convert paths to absolute Path objects and normalize them
32
+ print(image_path)
33
+ image_path = Path(image_path).absolute().as_posix()
34
+ audio_path = Path(audio_path).absolute().as_posix()
35
+
36
+ # Parse the arguments
37
+
38
+ args = create_args(
39
+ image_path=image_path,
40
+ audio_path=audio_path,
41
+ prompt=prompt,
42
+ output_dir=str(output_dir),
43
+ audio_weight=audio_weight,
44
+ prompt_cfg_scale=prompt_cfg_scale,
45
+ audio_cfg_scale=audio_cfg_scale,
46
+ image_size=image_size,
47
+ max_num_frames=max_num_frames,
48
+ inference_steps=inference_steps,
49
+ seed=seed,
50
+ )
51
+
52
+ try:
53
+ global pipe, fantasytalking, wav2vec_processor, wav2vec
54
+ if pipe is None:
55
+ pipe, fantasytalking, wav2vec_processor, wav2vec = load_models(args)
56
+ output_path = main(args, pipe, fantasytalking, wav2vec_processor, wav2vec)
57
+ return output_path # Ensure the output path is returned
58
+ except Exception as e:
59
+ print(f"Error during processing: {str(e)}")
60
+ raise gr.Error(f"Error during processing: {str(e)}")
61
+
62
+
63
+ def create_args(
64
+ image_path: str,
65
+ audio_path: str,
66
+ prompt: str,
67
+ output_dir: str,
68
+ audio_weight: float,
69
+ prompt_cfg_scale: float,
70
+ audio_cfg_scale: float,
71
+ image_size: int,
72
+ max_num_frames: int,
73
+ inference_steps: int,
74
+ seed: int,
75
+ ) -> argparse.Namespace:
76
+ parser = argparse.ArgumentParser()
77
+ parser.add_argument(
78
+ "--wan_model_dir",
79
+ type=str,
80
+ default="./models/Wan2.1-I2V-14B-720P",
81
+ required=False,
82
+ help="The dir of the Wan I2V 14B model.",
83
+ )
84
+ parser.add_argument(
85
+ "--fantasytalking_model_path",
86
+ type=str,
87
+ default="./models/fantasytalking_model.ckpt",
88
+ required=False,
89
+ help="The .ckpt path of fantasytalking model.",
90
+ )
91
+ parser.add_argument(
92
+ "--wav2vec_model_dir",
93
+ type=str,
94
+ default="./models/wav2vec2-base-960h",
95
+ required=False,
96
+ help="The dir of wav2vec model.",
97
+ )
98
+ parser.add_argument(
99
+ "--image_path",
100
+ type=str,
101
+ default="./assets/images/woman.png",
102
+ required=False,
103
+ help="The path of the image.",
104
+ )
105
+ parser.add_argument(
106
+ "--audio_path",
107
+ type=str,
108
+ default="./assets/audios/woman.wav",
109
+ required=False,
110
+ help="The path of the audio.",
111
+ )
112
+ parser.add_argument(
113
+ "--prompt",
114
+ type=str,
115
+ default="A woman is talking.",
116
+ required=False,
117
+ help="prompt.",
118
+ )
119
+ parser.add_argument(
120
+ "--output_dir",
121
+ type=str,
122
+ default="./output",
123
+ help="Dir to save the video.",
124
+ )
125
+ parser.add_argument(
126
+ "--image_size",
127
+ type=int,
128
+ default=512,
129
+ help="The image will be resized proportionally to this size.",
130
+ )
131
+ parser.add_argument(
132
+ "--audio_scale",
133
+ type=float,
134
+ default=1.0,
135
+ help="Image width.",
136
+ )
137
+ parser.add_argument(
138
+ "--prompt_cfg_scale",
139
+ type=float,
140
+ default=5.0,
141
+ required=False,
142
+ help="prompt cfg scale",
143
+ )
144
+ parser.add_argument(
145
+ "--audio_cfg_scale",
146
+ type=float,
147
+ default=5.0,
148
+ required=False,
149
+ help="audio cfg scale",
150
+ )
151
+ parser.add_argument(
152
+ "--max_num_frames",
153
+ type=int,
154
+ default=81,
155
+ required=False,
156
+ help="The maximum frames for generating videos, the audio part exceeding max_num_frames/fps will be truncated.",
157
+ )
158
+ parser.add_argument(
159
+ "--inference_steps",
160
+ type=int,
161
+ default=20,
162
+ required=False,
163
+ )
164
+ parser.add_argument(
165
+ "--fps",
166
+ type=int,
167
+ default=23,
168
+ required=False,
169
+ )
170
+ parser.add_argument(
171
+ "--num_persistent_param_in_dit",
172
+ type=int,
173
+ default=None,
174
+ required=False,
175
+ help="Maximum parameter quantity retained in video memory, small number to reduce VRAM required",
176
+ )
177
+ parser.add_argument(
178
+ "--seed",
179
+ type=int,
180
+ default=1111,
181
+ required=False,
182
+ )
183
+ args = parser.parse_args(
184
+ [
185
+ "--image_path",
186
+ image_path,
187
+ "--audio_path",
188
+ audio_path,
189
+ "--prompt",
190
+ prompt,
191
+ "--output_dir",
192
+ output_dir,
193
+ "--image_size",
194
+ str(image_size),
195
+ "--audio_scale",
196
+ str(audio_weight),
197
+ "--prompt_cfg_scale",
198
+ str(prompt_cfg_scale),
199
+ "--audio_cfg_scale",
200
+ str(audio_cfg_scale),
201
+ "--max_num_frames",
202
+ str(max_num_frames),
203
+ "--inference_steps",
204
+ str(inference_steps),
205
+ "--seed",
206
+ str(seed),
207
+ ]
208
+ )
209
+ print(args)
210
+ return args
211
+
212
+
213
+ # Create Gradio interface
214
+ with gr.Blocks(title="FantasyTalking Video Generation") as demo:
215
+ gr.Markdown(
216
+ """
217
+ # FantasyTalking: Realistic Talking Portrait Generation via Coherent Motion Synthesis
218
+
219
+ <div align="center">
220
+ <strong> Mengchao Wang1* Qiang Wang1* Fan Jiang1†
221
+ Yaqi Fan2 Yunpeng Zhang1,2 YongGang Qi2‡
222
+ Kun Zhao1, Mu Xu1 </strong>
223
+ </div>
224
+
225
+ <div align="center">
226
+ <strong>1AMAP, Alibaba Group  2Beijing University of Posts and Telecommunications</strong>
227
+ </div>
228
+
229
+ <div style="display:flex;justify-content:center;column-gap:4px;">
230
+ <a href="https://github.com/Fantasy-AMAP/fantasy-talking">
231
+ <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
232
+ </a>
233
+ <a href="https://arxiv.org/abs/2504.04842">
234
+ <img src='https://img.shields.io/badge/ArXiv-Paper-red'>
235
+ </a>
236
+ </div>
237
+ """
238
+ )
239
+
240
+ with gr.Row():
241
+ with gr.Column():
242
+ image_input = gr.Image(label="Input Image", type="filepath")
243
+ audio_input = gr.Audio(label="Input Audio", type="filepath")
244
+ prompt_input = gr.Text(label="Input Prompt")
245
+ with gr.Row():
246
+ prompt_cfg_scale = gr.Slider(
247
+ minimum=1.0,
248
+ maximum=9.0,
249
+ value=5.0,
250
+ step=0.5,
251
+ label="Prompt CFG Scale",
252
+ )
253
+ audio_cfg_scale = gr.Slider(
254
+ minimum=1.0,
255
+ maximum=9.0,
256
+ value=5.0,
257
+ step=0.5,
258
+ label="Audio CFG Scale",
259
+ )
260
+ audio_weight = gr.Slider(
261
+ minimum=0.1,
262
+ maximum=3.0,
263
+ value=1.0,
264
+ step=0.1,
265
+ label="Audio Weight",
266
+ )
267
+ with gr.Row():
268
+ image_size = gr.Number(
269
+ value=512, label="Width/Height Maxsize", precision=0
270
+ )
271
+ max_num_frames = gr.Number(
272
+ value=81, label="The Maximum Frames", precision=0
273
+ )
274
+ inference_steps = gr.Slider(
275
+ minimum=1, maximum=50, value=20, step=1, label="Inference Steps"
276
+ )
277
+
278
+ with gr.Row():
279
+ seed = gr.Number(value=1247, label="Random Seed", precision=0)
280
+
281
+ process_btn = gr.Button("Generate Video")
282
+
283
+ with gr.Column():
284
+ video_output = gr.Video(label="Output Video")
285
+
286
+ gr.Examples(
287
+ examples=[
288
+ [
289
+ "assets/images/woman.png",
290
+ "assets/audios/woman.wav",
291
+ ],
292
+ ],
293
+ inputs=[image_input, audio_input],
294
+ )
295
+
296
+ process_btn.click(
297
+ fn=generate_video,
298
+ inputs=[
299
+ image_input,
300
+ audio_input,
301
+ prompt_input,
302
+ prompt_cfg_scale,
303
+ audio_cfg_scale,
304
+ audio_weight,
305
+ image_size,
306
+ max_num_frames,
307
+ inference_steps,
308
+ seed,
309
+ ],
310
+ outputs=video_output,
311
+ )
312
+
313
+ if __name__ == "__main__":
314
+ demo.launch(inbrowser=True, share=True)
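
app.py builds an `argparse.Namespace` by feeding a constructed argv list into the same parser that `infer.py` uses, so the Gradio callback and the CLI share one configuration path. The following is a minimal, self-contained sketch of that pattern with hypothetical flag names, not the repo's exact parser:

```python
# Minimal sketch of the create_args() pattern used in app.py: reuse one
# argparse parser for both CLI and programmatic (UI callback) invocation.
import argparse


def build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser()
    parser.add_argument("--image_path", type=str, required=True)
    parser.add_argument("--audio_path", type=str, required=True)
    parser.add_argument("--prompt", type=str, default="A woman is talking.")
    parser.add_argument("--audio_cfg_scale", type=float, default=5.0)
    return parser


def args_from_ui(image_path: str, audio_path: str, prompt: str, audio_cfg: float):
    # Same trick as create_args(): serialize UI values into an argv list.
    argv = [
        "--image_path", image_path,
        "--audio_path", audio_path,
        "--prompt", prompt,
        "--audio_cfg_scale", str(audio_cfg),
    ]
    return build_parser().parse_args(argv)


if __name__ == "__main__":
    args = args_from_ui("img.png", "voice.wav", "Speaking calmly.", 5.0)
    print(args)
```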
FantasyTalking/assets/audios/woman.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e08584293621824d039c264132d90b654bede740f67d9384979544e3e2abfacc
3
+ size 1765454
FantasyTalking/assets/images/fig0_1_0.png ADDED

Git LFS Details

  • SHA256: b7eb9cfe91e7be5d175afa8d6c464b0e64638813271062ee0429370bf757a555
  • Pointer size: 132 Bytes
  • Size of remote file: 1.67 MB
FantasyTalking/assets/images/woman.png ADDED

Git LFS Details

  • SHA256: add373b3b48fa76ac760f60da302bcf402bfbb77eccecae6b87b861f7d0825de
  • Pointer size: 132 Bytes
  • Size of remote file: 1.1 MB
FantasyTalking/assets/overview.png ADDED

Git LFS Details

  • SHA256: b7eb9cfe91e7be5d175afa8d6c464b0e64638813271062ee0429370bf757a555
  • Pointer size: 132 Bytes
  • Size of remote file: 1.67 MB
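
The .wav file above is stored as a Git LFS pointer (version, oid, size), and the image entries show the same metadata in their "Git LFS Details" blocks. A small illustrative sketch of reading such a pointer, for anyone inspecting the repo without `git lfs` installed:

```python
# Parse a Git LFS pointer file of the form shown above:
#   version https://git-lfs.github.com/spec/v1
#   oid sha256:<hex digest>
#   size <bytes>
def parse_lfs_pointer(text: str) -> dict:
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    algo, _, digest = fields["oid"].partition(":")
    return {"version": fields["version"], "hash_algo": algo,
            "digest": digest, "size_bytes": int(fields["size"])}


pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:e08584293621824d039c264132d90b654bede740f67d9384979544e3e2abfacc
size 1765454"""
print(parse_lfs_pointer(pointer))  # the ~1.7 MB woman.wav referenced by its sha256
```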
FantasyTalking/diffsynth/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .data import *
2
+ from .models import *
3
+ from .pipelines import *
4
+ from .prompters import *
5
+ from .schedulers import *
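
This package `__init__.py` re-exports everything from the data, models, pipelines, prompters, and schedulers subpackages, so downstream code such as `infer.py` can import from the top-level `diffsynth` namespace. A quick way to see what actually gets exported (a sketch; the concrete names depend on each subpackage's `__all__`):

```python
# Inspect what the aggregated diffsynth namespace exposes after the star imports.
import diffsynth

public_names = sorted(n for n in dir(diffsynth) if not n.startswith("_"))
print(public_names)  # e.g. model, pipeline, and scheduler classes, depending on __all__
```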
FantasyTalking/diffsynth/configs/__init__.py ADDED
File without changes
FantasyTalking/diffsynth/configs/model_config.py ADDED
@@ -0,0 +1,1577 @@
1
+ from typing_extensions import Literal, TypeAlias
2
+
3
+ from ..models.wan_video_dit import WanModel
4
+ from ..models.wan_video_image_encoder import WanImageEncoder
5
+ from ..models.wan_video_text_encoder import WanTextEncoder
6
+ from ..models.wan_video_vae import WanVideoVAE
7
+
8
+ model_loader_configs = [
9
+ # These configs are provided for detecting model type automatically.
10
+ # The format is (state_dict_keys_hash, state_dict_keys_hash_with_shape, model_names, model_classes, model_resource)
11
+ (
12
+ None,
13
+ "9269f8db9040a9d860eaca435be61814",
14
+ ["wan_video_dit"],
15
+ [WanModel],
16
+ "civitai",
17
+ ),
18
+ (
19
+ None,
20
+ "aafcfd9672c3a2456dc46e1cb6e52c70",
21
+ ["wan_video_dit"],
22
+ [WanModel],
23
+ "civitai",
24
+ ),
25
+ (
26
+ None,
27
+ "6bfcfb3b342cb286ce886889d519a77e",
28
+ ["wan_video_dit"],
29
+ [WanModel],
30
+ "civitai",
31
+ ),
32
+ (
33
+ None,
34
+ "9c8818c2cbea55eca56c7b447df170da",
35
+ ["wan_video_text_encoder"],
36
+ [WanTextEncoder],
37
+ "civitai",
38
+ ),
39
+ (
40
+ None,
41
+ "5941c53e207d62f20f9025686193c40b",
42
+ ["wan_video_image_encoder"],
43
+ [WanImageEncoder],
44
+ "civitai",
45
+ ),
46
+ (
47
+ None,
48
+ "1378ea763357eea97acdef78e65d6d96",
49
+ ["wan_video_vae"],
50
+ [WanVideoVAE],
51
+ "civitai",
52
+ ),
53
+ (
54
+ None,
55
+ "ccc42284ea13e1ad04693284c7a09be6",
56
+ ["wan_video_vae"],
57
+ [WanVideoVAE],
58
+ "civitai",
59
+ ),
60
+ ]
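
Per the comment at the top of `model_loader_configs`, each entry pairs a 32-character hash of a checkpoint's state-dict keys (with or without shapes) with the model classes to instantiate, so the loader can detect the model type from the weights alone. A hedged sketch of that matching step follows; the real logic lives in `model_manager.py` (not shown in this excerpt), and the exact key serialization used to compute the hashes is an assumption here.

```python
# Hedged sketch of how a state dict might be matched against model_loader_configs:
# hash the sorted parameter names and compare against the stored key hashes.
import hashlib


def hash_state_dict_keys(state_dict: dict, with_shape: bool = True) -> str:
    # Assumed serialization: "name:shape" pairs joined by commas, then MD5.
    keys = []
    for name, tensor in sorted(state_dict.items()):
        keys.append(f"{name}:{tuple(tensor.shape)}" if with_shape else name)
    return hashlib.md5(",".join(keys).encode("utf-8")).hexdigest()


def detect_model(state_dict: dict, configs) -> list:
    digest = hash_state_dict_keys(state_dict)
    matches = []
    for keys_hash, keys_hash_with_shape, names, classes, resource in configs:
        if digest in (keys_hash, keys_hash_with_shape):
            matches.append((names, classes, resource))
    return matches
```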
61
+ huggingface_model_loader_configs = [
62
+ # These configs are provided for detecting model type automatically.
63
+ # The format is (architecture_in_huggingface_config, huggingface_lib, model_name, redirected_architecture)
64
+ (
65
+ "ChatGLMModel",
66
+ "diffsynth.models.kolors_text_encoder",
67
+ "kolors_text_encoder",
68
+ None,
69
+ ),
70
+ ("MarianMTModel", "transformers.models.marian.modeling_marian", "translator", None),
71
+ (
72
+ "BloomForCausalLM",
73
+ "transformers.models.bloom.modeling_bloom",
74
+ "beautiful_prompt",
75
+ None,
76
+ ),
77
+ (
78
+ "Qwen2ForCausalLM",
79
+ "transformers.models.qwen2.modeling_qwen2",
80
+ "qwen_prompt",
81
+ None,
82
+ ),
83
+ # ("LlamaForCausalLM", "transformers.models.llama.modeling_llama", "omost_prompt", None),
84
+ (
85
+ "T5EncoderModel",
86
+ "diffsynth.models.flux_text_encoder",
87
+ "flux_text_encoder_2",
88
+ "FluxTextEncoder2",
89
+ ),
90
+ ("CogVideoXTransformer3DModel", "diffsynth.models.cog_dit", "cog_dit", "CogDiT"),
91
+ (
92
+ "SiglipModel",
93
+ "transformers.models.siglip.modeling_siglip",
94
+ "siglip_vision_model",
95
+ "SiglipVisionModel",
96
+ ),
97
+ (
98
+ "LlamaForCausalLM",
99
+ "diffsynth.models.hunyuan_video_text_encoder",
100
+ "hunyuan_video_text_encoder_2",
101
+ "HunyuanVideoLLMEncoder",
102
+ ),
103
+ (
104
+ "Step1Model",
105
+ "diffsynth.models.stepvideo_text_encoder",
106
+ "stepvideo_text_encoder_2",
107
+ "STEP1TextEncoder",
108
+ ),
109
+ ]
110
+ patch_model_loader_configs = [
111
+ # These configs are provided for detecting model type automatically.
112
+ # The format is (state_dict_keys_hash_with_shape, model_name, model_class, extra_kwargs)
113
+ # ("9a4ab6869ac9b7d6e31f9854e397c867", ["svd_unet"], [SVDUNet], {"add_positional_conv": 128}),
114
+ ]
115
+
116
+ preset_models_on_huggingface = {
117
+ "HunyuanDiT": [
118
+ (
119
+ "Tencent-Hunyuan/HunyuanDiT",
120
+ "t2i/clip_text_encoder/pytorch_model.bin",
121
+ "models/HunyuanDiT/t2i/clip_text_encoder",
122
+ ),
123
+ (
124
+ "Tencent-Hunyuan/HunyuanDiT",
125
+ "t2i/mt5/pytorch_model.bin",
126
+ "models/HunyuanDiT/t2i/mt5",
127
+ ),
128
+ (
129
+ "Tencent-Hunyuan/HunyuanDiT",
130
+ "t2i/model/pytorch_model_ema.pt",
131
+ "models/HunyuanDiT/t2i/model",
132
+ ),
133
+ (
134
+ "Tencent-Hunyuan/HunyuanDiT",
135
+ "t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin",
136
+ "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix",
137
+ ),
138
+ ],
139
+ "stable-video-diffusion-img2vid-xt": [
140
+ (
141
+ "stabilityai/stable-video-diffusion-img2vid-xt",
142
+ "svd_xt.safetensors",
143
+ "models/stable_video_diffusion",
144
+ ),
145
+ ],
146
+ "ExVideo-SVD-128f-v1": [
147
+ (
148
+ "ECNU-CILab/ExVideo-SVD-128f-v1",
149
+ "model.fp16.safetensors",
150
+ "models/stable_video_diffusion",
151
+ ),
152
+ ],
153
+ # Stable Diffusion
154
+ "StableDiffusion_v15": [
155
+ (
156
+ "benjamin-paine/stable-diffusion-v1-5",
157
+ "v1-5-pruned-emaonly.safetensors",
158
+ "models/stable_diffusion",
159
+ ),
160
+ ],
161
+ "DreamShaper_8": [
162
+ ("Yntec/Dreamshaper8", "dreamshaper_8.safetensors", "models/stable_diffusion"),
163
+ ],
164
+ # Textual Inversion
165
+ "TextualInversion_VeryBadImageNegative_v1.3": [
166
+ (
167
+ "gemasai/verybadimagenegative_v1.3",
168
+ "verybadimagenegative_v1.3.pt",
169
+ "models/textual_inversion",
170
+ ),
171
+ ],
172
+ # Stable Diffusion XL
173
+ "StableDiffusionXL_v1": [
174
+ (
175
+ "stabilityai/stable-diffusion-xl-base-1.0",
176
+ "sd_xl_base_1.0.safetensors",
177
+ "models/stable_diffusion_xl",
178
+ ),
179
+ ],
180
+ "BluePencilXL_v200": [
181
+ (
182
+ "frankjoshua/bluePencilXL_v200",
183
+ "bluePencilXL_v200.safetensors",
184
+ "models/stable_diffusion_xl",
185
+ ),
186
+ ],
187
+ "StableDiffusionXL_Turbo": [
188
+ (
189
+ "stabilityai/sdxl-turbo",
190
+ "sd_xl_turbo_1.0_fp16.safetensors",
191
+ "models/stable_diffusion_xl_turbo",
192
+ ),
193
+ ],
194
+ # Stable Diffusion 3
195
+ "StableDiffusion3": [
196
+ (
197
+ "stabilityai/stable-diffusion-3-medium",
198
+ "sd3_medium_incl_clips_t5xxlfp16.safetensors",
199
+ "models/stable_diffusion_3",
200
+ ),
201
+ ],
202
+ "StableDiffusion3_without_T5": [
203
+ (
204
+ "stabilityai/stable-diffusion-3-medium",
205
+ "sd3_medium_incl_clips.safetensors",
206
+ "models/stable_diffusion_3",
207
+ ),
208
+ ],
209
+ # ControlNet
210
+ "ControlNet_v11f1p_sd15_depth": [
211
+ (
212
+ "lllyasviel/ControlNet-v1-1",
213
+ "control_v11f1p_sd15_depth.pth",
214
+ "models/ControlNet",
215
+ ),
216
+ ("lllyasviel/Annotators", "dpt_hybrid-midas-501f0c75.pt", "models/Annotators"),
217
+ ],
218
+ "ControlNet_v11p_sd15_softedge": [
219
+ (
220
+ "lllyasviel/ControlNet-v1-1",
221
+ "control_v11p_sd15_softedge.pth",
222
+ "models/ControlNet",
223
+ ),
224
+ ("lllyasviel/Annotators", "ControlNetHED.pth", "models/Annotators"),
225
+ ],
226
+ "ControlNet_v11f1e_sd15_tile": [
227
+ (
228
+ "lllyasviel/ControlNet-v1-1",
229
+ "control_v11f1e_sd15_tile.pth",
230
+ "models/ControlNet",
231
+ )
232
+ ],
233
+ "ControlNet_v11p_sd15_lineart": [
234
+ (
235
+ "lllyasviel/ControlNet-v1-1",
236
+ "control_v11p_sd15_lineart.pth",
237
+ "models/ControlNet",
238
+ ),
239
+ ("lllyasviel/Annotators", "sk_model.pth", "models/Annotators"),
240
+ ("lllyasviel/Annotators", "sk_model2.pth", "models/Annotators"),
241
+ ],
242
+ "ControlNet_union_sdxl_promax": [
243
+ (
244
+ "xinsir/controlnet-union-sdxl-1.0",
245
+ "diffusion_pytorch_model_promax.safetensors",
246
+ "models/ControlNet/controlnet_union",
247
+ ),
248
+ ("lllyasviel/Annotators", "dpt_hybrid-midas-501f0c75.pt", "models/Annotators"),
249
+ ],
250
+ # AnimateDiff
251
+ "AnimateDiff_v2": [
252
+ ("guoyww/animatediff", "mm_sd_v15_v2.ckpt", "models/AnimateDiff"),
253
+ ],
254
+ "AnimateDiff_xl_beta": [
255
+ ("guoyww/animatediff", "mm_sdxl_v10_beta.ckpt", "models/AnimateDiff"),
256
+ ],
257
+ # Qwen Prompt
258
+ "QwenPrompt": [
259
+ (
260
+ "Qwen/Qwen2-1.5B-Instruct",
261
+ "config.json",
262
+ "models/QwenPrompt/qwen2-1.5b-instruct",
263
+ ),
264
+ (
265
+ "Qwen/Qwen2-1.5B-Instruct",
266
+ "generation_config.json",
267
+ "models/QwenPrompt/qwen2-1.5b-instruct",
268
+ ),
269
+ (
270
+ "Qwen/Qwen2-1.5B-Instruct",
271
+ "model.safetensors",
272
+ "models/QwenPrompt/qwen2-1.5b-instruct",
273
+ ),
274
+ (
275
+ "Qwen/Qwen2-1.5B-Instruct",
276
+ "special_tokens_map.json",
277
+ "models/QwenPrompt/qwen2-1.5b-instruct",
278
+ ),
279
+ (
280
+ "Qwen/Qwen2-1.5B-Instruct",
281
+ "tokenizer.json",
282
+ "models/QwenPrompt/qwen2-1.5b-instruct",
283
+ ),
284
+ (
285
+ "Qwen/Qwen2-1.5B-Instruct",
286
+ "tokenizer_config.json",
287
+ "models/QwenPrompt/qwen2-1.5b-instruct",
288
+ ),
289
+ (
290
+ "Qwen/Qwen2-1.5B-Instruct",
291
+ "merges.txt",
292
+ "models/QwenPrompt/qwen2-1.5b-instruct",
293
+ ),
294
+ (
295
+ "Qwen/Qwen2-1.5B-Instruct",
296
+ "vocab.json",
297
+ "models/QwenPrompt/qwen2-1.5b-instruct",
298
+ ),
299
+ ],
300
+ # Beautiful Prompt
301
+ "BeautifulPrompt": [
302
+ (
303
+ "alibaba-pai/pai-bloom-1b1-text2prompt-sd",
304
+ "config.json",
305
+ "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd",
306
+ ),
307
+ (
308
+ "alibaba-pai/pai-bloom-1b1-text2prompt-sd",
309
+ "generation_config.json",
310
+ "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd",
311
+ ),
312
+ (
313
+ "alibaba-pai/pai-bloom-1b1-text2prompt-sd",
314
+ "model.safetensors",
315
+ "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd",
316
+ ),
317
+ (
318
+ "alibaba-pai/pai-bloom-1b1-text2prompt-sd",
319
+ "special_tokens_map.json",
320
+ "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd",
321
+ ),
322
+ (
323
+ "alibaba-pai/pai-bloom-1b1-text2prompt-sd",
324
+ "tokenizer.json",
325
+ "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd",
326
+ ),
327
+ (
328
+ "alibaba-pai/pai-bloom-1b1-text2prompt-sd",
329
+ "tokenizer_config.json",
330
+ "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd",
331
+ ),
332
+ ],
333
+ # Omost prompt
334
+ "OmostPrompt": [
335
+ (
336
+ "lllyasviel/omost-llama-3-8b-4bits",
337
+ "model-00001-of-00002.safetensors",
338
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
339
+ ),
340
+ (
341
+ "lllyasviel/omost-llama-3-8b-4bits",
342
+ "model-00002-of-00002.safetensors",
343
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
344
+ ),
345
+ (
346
+ "lllyasviel/omost-llama-3-8b-4bits",
347
+ "tokenizer.json",
348
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
349
+ ),
350
+ (
351
+ "lllyasviel/omost-llama-3-8b-4bits",
352
+ "tokenizer_config.json",
353
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
354
+ ),
355
+ (
356
+ "lllyasviel/omost-llama-3-8b-4bits",
357
+ "config.json",
358
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
359
+ ),
360
+ (
361
+ "lllyasviel/omost-llama-3-8b-4bits",
362
+ "generation_config.json",
363
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
364
+ ),
365
+ (
366
+ "lllyasviel/omost-llama-3-8b-4bits",
367
+ "model.safetensors.index.json",
368
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
369
+ ),
370
+ (
371
+ "lllyasviel/omost-llama-3-8b-4bits",
372
+ "special_tokens_map.json",
373
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
374
+ ),
375
+ ],
376
+ # Translator
377
+ "opus-mt-zh-en": [
378
+ (
379
+ "Helsinki-NLP/opus-mt-zh-en",
380
+ "config.json",
381
+ "models/translator/opus-mt-zh-en",
382
+ ),
383
+ (
384
+ "Helsinki-NLP/opus-mt-zh-en",
385
+ "generation_config.json",
386
+ "models/translator/opus-mt-zh-en",
387
+ ),
388
+ (
389
+ "Helsinki-NLP/opus-mt-zh-en",
390
+ "metadata.json",
391
+ "models/translator/opus-mt-zh-en",
392
+ ),
393
+ (
394
+ "Helsinki-NLP/opus-mt-zh-en",
395
+ "pytorch_model.bin",
396
+ "models/translator/opus-mt-zh-en",
397
+ ),
398
+ ("Helsinki-NLP/opus-mt-zh-en", "source.spm", "models/translator/opus-mt-zh-en"),
399
+ ("Helsinki-NLP/opus-mt-zh-en", "target.spm", "models/translator/opus-mt-zh-en"),
400
+ (
401
+ "Helsinki-NLP/opus-mt-zh-en",
402
+ "tokenizer_config.json",
403
+ "models/translator/opus-mt-zh-en",
404
+ ),
405
+ ("Helsinki-NLP/opus-mt-zh-en", "vocab.json", "models/translator/opus-mt-zh-en"),
406
+ ],
407
+ # IP-Adapter
408
+ "IP-Adapter-SD": [
409
+ (
410
+ "h94/IP-Adapter",
411
+ "models/image_encoder/model.safetensors",
412
+ "models/IpAdapter/stable_diffusion/image_encoder",
413
+ ),
414
+ (
415
+ "h94/IP-Adapter",
416
+ "models/ip-adapter_sd15.bin",
417
+ "models/IpAdapter/stable_diffusion",
418
+ ),
419
+ ],
420
+ "IP-Adapter-SDXL": [
421
+ (
422
+ "h94/IP-Adapter",
423
+ "sdxl_models/image_encoder/model.safetensors",
424
+ "models/IpAdapter/stable_diffusion_xl/image_encoder",
425
+ ),
426
+ (
427
+ "h94/IP-Adapter",
428
+ "sdxl_models/ip-adapter_sdxl.bin",
429
+ "models/IpAdapter/stable_diffusion_xl",
430
+ ),
431
+ ],
432
+ "SDXL-vae-fp16-fix": [
433
+ (
434
+ "madebyollin/sdxl-vae-fp16-fix",
435
+ "diffusion_pytorch_model.safetensors",
436
+ "models/sdxl-vae-fp16-fix",
437
+ )
438
+ ],
439
+ # Kolors
440
+ "Kolors": [
441
+ (
442
+ "Kwai-Kolors/Kolors",
443
+ "text_encoder/config.json",
444
+ "models/kolors/Kolors/text_encoder",
445
+ ),
446
+ (
447
+ "Kwai-Kolors/Kolors",
448
+ "text_encoder/pytorch_model.bin.index.json",
449
+ "models/kolors/Kolors/text_encoder",
450
+ ),
451
+ (
452
+ "Kwai-Kolors/Kolors",
453
+ "text_encoder/pytorch_model-00001-of-00007.bin",
454
+ "models/kolors/Kolors/text_encoder",
455
+ ),
456
+ (
457
+ "Kwai-Kolors/Kolors",
458
+ "text_encoder/pytorch_model-00002-of-00007.bin",
459
+ "models/kolors/Kolors/text_encoder",
460
+ ),
461
+ (
462
+ "Kwai-Kolors/Kolors",
463
+ "text_encoder/pytorch_model-00003-of-00007.bin",
464
+ "models/kolors/Kolors/text_encoder",
465
+ ),
466
+ (
467
+ "Kwai-Kolors/Kolors",
468
+ "text_encoder/pytorch_model-00004-of-00007.bin",
469
+ "models/kolors/Kolors/text_encoder",
470
+ ),
471
+ (
472
+ "Kwai-Kolors/Kolors",
473
+ "text_encoder/pytorch_model-00005-of-00007.bin",
474
+ "models/kolors/Kolors/text_encoder",
475
+ ),
476
+ (
477
+ "Kwai-Kolors/Kolors",
478
+ "text_encoder/pytorch_model-00006-of-00007.bin",
479
+ "models/kolors/Kolors/text_encoder",
480
+ ),
481
+ (
482
+ "Kwai-Kolors/Kolors",
483
+ "text_encoder/pytorch_model-00007-of-00007.bin",
484
+ "models/kolors/Kolors/text_encoder",
485
+ ),
486
+ (
487
+ "Kwai-Kolors/Kolors",
488
+ "unet/diffusion_pytorch_model.safetensors",
489
+ "models/kolors/Kolors/unet",
490
+ ),
491
+ (
492
+ "Kwai-Kolors/Kolors",
493
+ "vae/diffusion_pytorch_model.safetensors",
494
+ "models/kolors/Kolors/vae",
495
+ ),
496
+ ],
497
+ # FLUX
498
+ "FLUX.1-dev": [
499
+ (
500
+ "black-forest-labs/FLUX.1-dev",
501
+ "text_encoder/model.safetensors",
502
+ "models/FLUX/FLUX.1-dev/text_encoder",
503
+ ),
504
+ (
505
+ "black-forest-labs/FLUX.1-dev",
506
+ "text_encoder_2/config.json",
507
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
508
+ ),
509
+ (
510
+ "black-forest-labs/FLUX.1-dev",
511
+ "text_encoder_2/model-00001-of-00002.safetensors",
512
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
513
+ ),
514
+ (
515
+ "black-forest-labs/FLUX.1-dev",
516
+ "text_encoder_2/model-00002-of-00002.safetensors",
517
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
518
+ ),
519
+ (
520
+ "black-forest-labs/FLUX.1-dev",
521
+ "text_encoder_2/model.safetensors.index.json",
522
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
523
+ ),
524
+ ("black-forest-labs/FLUX.1-dev", "ae.safetensors", "models/FLUX/FLUX.1-dev"),
525
+ (
526
+ "black-forest-labs/FLUX.1-dev",
527
+ "flux1-dev.safetensors",
528
+ "models/FLUX/FLUX.1-dev",
529
+ ),
530
+ ],
531
+ "InstantX/FLUX.1-dev-IP-Adapter": {
532
+ "file_list": [
533
+ (
534
+ "InstantX/FLUX.1-dev-IP-Adapter",
535
+ "ip-adapter.bin",
536
+ "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter",
537
+ ),
538
+ (
539
+ "google/siglip-so400m-patch14-384",
540
+ "model.safetensors",
541
+ "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/image_encoder",
542
+ ),
543
+ (
544
+ "google/siglip-so400m-patch14-384",
545
+ "config.json",
546
+ "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/image_encoder",
547
+ ),
548
+ ],
549
+ "load_path": [
550
+ "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/ip-adapter.bin",
551
+ "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/image_encoder",
552
+ ],
553
+ },
554
+ # RIFE
555
+ "RIFE": [
556
+ ("AlexWortega/RIFE", "flownet.pkl", "models/RIFE"),
557
+ ],
558
+ # CogVideo
559
+ "CogVideoX-5B": [
560
+ (
561
+ "THUDM/CogVideoX-5b",
562
+ "text_encoder/config.json",
563
+ "models/CogVideo/CogVideoX-5b/text_encoder",
564
+ ),
565
+ (
566
+ "THUDM/CogVideoX-5b",
567
+ "text_encoder/model.safetensors.index.json",
568
+ "models/CogVideo/CogVideoX-5b/text_encoder",
569
+ ),
570
+ (
571
+ "THUDM/CogVideoX-5b",
572
+ "text_encoder/model-00001-of-00002.safetensors",
573
+ "models/CogVideo/CogVideoX-5b/text_encoder",
574
+ ),
575
+ (
576
+ "THUDM/CogVideoX-5b",
577
+ "text_encoder/model-00002-of-00002.safetensors",
578
+ "models/CogVideo/CogVideoX-5b/text_encoder",
579
+ ),
580
+ (
581
+ "THUDM/CogVideoX-5b",
582
+ "transformer/config.json",
583
+ "models/CogVideo/CogVideoX-5b/transformer",
584
+ ),
585
+ (
586
+ "THUDM/CogVideoX-5b",
587
+ "transformer/diffusion_pytorch_model.safetensors.index.json",
588
+ "models/CogVideo/CogVideoX-5b/transformer",
589
+ ),
590
+ (
591
+ "THUDM/CogVideoX-5b",
592
+ "transformer/diffusion_pytorch_model-00001-of-00002.safetensors",
593
+ "models/CogVideo/CogVideoX-5b/transformer",
594
+ ),
595
+ (
596
+ "THUDM/CogVideoX-5b",
597
+ "transformer/diffusion_pytorch_model-00002-of-00002.safetensors",
598
+ "models/CogVideo/CogVideoX-5b/transformer",
599
+ ),
600
+ (
601
+ "THUDM/CogVideoX-5b",
602
+ "vae/diffusion_pytorch_model.safetensors",
603
+ "models/CogVideo/CogVideoX-5b/vae",
604
+ ),
605
+ ],
606
+ # Stable Diffusion 3.5
607
+ "StableDiffusion3.5-large": [
608
+ (
609
+ "stabilityai/stable-diffusion-3.5-large",
610
+ "sd3.5_large.safetensors",
611
+ "models/stable_diffusion_3",
612
+ ),
613
+ (
614
+ "stabilityai/stable-diffusion-3.5-large",
615
+ "text_encoders/clip_l.safetensors",
616
+ "models/stable_diffusion_3/text_encoders",
617
+ ),
618
+ (
619
+ "stabilityai/stable-diffusion-3.5-large",
620
+ "text_encoders/clip_g.safetensors",
621
+ "models/stable_diffusion_3/text_encoders",
622
+ ),
623
+ (
624
+ "stabilityai/stable-diffusion-3.5-large",
625
+ "text_encoders/t5xxl_fp16.safetensors",
626
+ "models/stable_diffusion_3/text_encoders",
627
+ ),
628
+ ],
629
+ }
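
Each list-shaped preset above maps a name to `(repo_id, filename, local_dir)` triples. A hedged sketch of how a downloader might consume one such preset with `huggingface_hub`; the repo's actual logic is in `diffsynth/models/downloader.py`, which is not shown in this excerpt.

```python
# Hedged sketch: fetch every (repo_id, filename, local_dir) triple of a preset
# with huggingface_hub. The real downloader.py may differ.
from huggingface_hub import hf_hub_download


def download_preset(preset: list[tuple[str, str, str]]) -> None:
    for repo_id, filename, local_dir in preset:
        hf_hub_download(repo_id=repo_id, filename=filename, local_dir=local_dir)


# Example with a single illustrative entry:
download_preset(
    [("Wan-AI/Wan2.1-I2V-14B-720P", "config.json", "models/Wan2.1-I2V-14B-720P")]
)
```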
630
+ preset_models_on_modelscope = {
631
+ # Hunyuan DiT
632
+ "HunyuanDiT": [
633
+ (
634
+ "modelscope/HunyuanDiT",
635
+ "t2i/clip_text_encoder/pytorch_model.bin",
636
+ "models/HunyuanDiT/t2i/clip_text_encoder",
637
+ ),
638
+ (
639
+ "modelscope/HunyuanDiT",
640
+ "t2i/mt5/pytorch_model.bin",
641
+ "models/HunyuanDiT/t2i/mt5",
642
+ ),
643
+ (
644
+ "modelscope/HunyuanDiT",
645
+ "t2i/model/pytorch_model_ema.pt",
646
+ "models/HunyuanDiT/t2i/model",
647
+ ),
648
+ (
649
+ "modelscope/HunyuanDiT",
650
+ "t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin",
651
+ "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix",
652
+ ),
653
+ ],
654
+ # Stable Video Diffusion
655
+ "stable-video-diffusion-img2vid-xt": [
656
+ (
657
+ "AI-ModelScope/stable-video-diffusion-img2vid-xt",
658
+ "svd_xt.safetensors",
659
+ "models/stable_video_diffusion",
660
+ ),
661
+ ],
662
+ # ExVideo
663
+ "ExVideo-SVD-128f-v1": [
664
+ (
665
+ "ECNU-CILab/ExVideo-SVD-128f-v1",
666
+ "model.fp16.safetensors",
667
+ "models/stable_video_diffusion",
668
+ ),
669
+ ],
670
+ "ExVideo-CogVideoX-LoRA-129f-v1": [
671
+ (
672
+ "ECNU-CILab/ExVideo-CogVideoX-LoRA-129f-v1",
673
+ "ExVideo-CogVideoX-LoRA-129f-v1.safetensors",
674
+ "models/lora",
675
+ ),
676
+ ],
677
+ # Stable Diffusion
678
+ "StableDiffusion_v15": [
679
+ (
680
+ "AI-ModelScope/stable-diffusion-v1-5",
681
+ "v1-5-pruned-emaonly.safetensors",
682
+ "models/stable_diffusion",
683
+ ),
684
+ ],
685
+ "DreamShaper_8": [
686
+ (
687
+ "sd_lora/dreamshaper_8",
688
+ "dreamshaper_8.safetensors",
689
+ "models/stable_diffusion",
690
+ ),
691
+ ],
692
+ "AingDiffusion_v12": [
693
+ (
694
+ "sd_lora/aingdiffusion_v12",
695
+ "aingdiffusion_v12.safetensors",
696
+ "models/stable_diffusion",
697
+ ),
698
+ ],
699
+ "Flat2DAnimerge_v45Sharp": [
700
+ (
701
+ "sd_lora/Flat-2D-Animerge",
702
+ "flat2DAnimerge_v45Sharp.safetensors",
703
+ "models/stable_diffusion",
704
+ ),
705
+ ],
706
+ # Textual Inversion
707
+ "TextualInversion_VeryBadImageNegative_v1.3": [
708
+ (
709
+ "sd_lora/verybadimagenegative_v1.3",
710
+ "verybadimagenegative_v1.3.pt",
711
+ "models/textual_inversion",
712
+ ),
713
+ ],
714
+ # Stable Diffusion XL
715
+ "StableDiffusionXL_v1": [
716
+ (
717
+ "AI-ModelScope/stable-diffusion-xl-base-1.0",
718
+ "sd_xl_base_1.0.safetensors",
719
+ "models/stable_diffusion_xl",
720
+ ),
721
+ ],
722
+ "BluePencilXL_v200": [
723
+ (
724
+ "sd_lora/bluePencilXL_v200",
725
+ "bluePencilXL_v200.safetensors",
726
+ "models/stable_diffusion_xl",
727
+ ),
728
+ ],
729
+ "StableDiffusionXL_Turbo": [
730
+ (
731
+ "AI-ModelScope/sdxl-turbo",
732
+ "sd_xl_turbo_1.0_fp16.safetensors",
733
+ "models/stable_diffusion_xl_turbo",
734
+ ),
735
+ ],
736
+ "SDXL_lora_zyd232_ChineseInkStyle_SDXL_v1_0": [
737
+ (
738
+ "sd_lora/zyd232_ChineseInkStyle_SDXL_v1_0",
739
+ "zyd232_ChineseInkStyle_SDXL_v1_0.safetensors",
740
+ "models/lora",
741
+ ),
742
+ ],
743
+ # Stable Diffusion 3
744
+ "StableDiffusion3": [
745
+ (
746
+ "AI-ModelScope/stable-diffusion-3-medium",
747
+ "sd3_medium_incl_clips_t5xxlfp16.safetensors",
748
+ "models/stable_diffusion_3",
749
+ ),
750
+ ],
751
+ "StableDiffusion3_without_T5": [
752
+ (
753
+ "AI-ModelScope/stable-diffusion-3-medium",
754
+ "sd3_medium_incl_clips.safetensors",
755
+ "models/stable_diffusion_3",
756
+ ),
757
+ ],
758
+ # ControlNet
759
+ "ControlNet_v11f1p_sd15_depth": [
760
+ (
761
+ "AI-ModelScope/ControlNet-v1-1",
762
+ "control_v11f1p_sd15_depth.pth",
763
+ "models/ControlNet",
764
+ ),
765
+ ("sd_lora/Annotators", "dpt_hybrid-midas-501f0c75.pt", "models/Annotators"),
766
+ ],
767
+ "ControlNet_v11p_sd15_softedge": [
768
+ (
769
+ "AI-ModelScope/ControlNet-v1-1",
770
+ "control_v11p_sd15_softedge.pth",
771
+ "models/ControlNet",
772
+ ),
773
+ ("sd_lora/Annotators", "ControlNetHED.pth", "models/Annotators"),
774
+ ],
775
+ "ControlNet_v11f1e_sd15_tile": [
776
+ (
777
+ "AI-ModelScope/ControlNet-v1-1",
778
+ "control_v11f1e_sd15_tile.pth",
779
+ "models/ControlNet",
780
+ )
781
+ ],
782
+ "ControlNet_v11p_sd15_lineart": [
783
+ (
784
+ "AI-ModelScope/ControlNet-v1-1",
785
+ "control_v11p_sd15_lineart.pth",
786
+ "models/ControlNet",
787
+ ),
788
+ ("sd_lora/Annotators", "sk_model.pth", "models/Annotators"),
789
+ ("sd_lora/Annotators", "sk_model2.pth", "models/Annotators"),
790
+ ],
791
+ "ControlNet_union_sdxl_promax": [
792
+ (
793
+ "AI-ModelScope/controlnet-union-sdxl-1.0",
794
+ "diffusion_pytorch_model_promax.safetensors",
795
+ "models/ControlNet/controlnet_union",
796
+ ),
797
+ ("sd_lora/Annotators", "dpt_hybrid-midas-501f0c75.pt", "models/Annotators"),
798
+ ],
799
+ "Annotators:Depth": [
800
+ ("sd_lora/Annotators", "dpt_hybrid-midas-501f0c75.pt", "models/Annotators"),
801
+ ],
802
+ "Annotators:Softedge": [
803
+ ("sd_lora/Annotators", "ControlNetHED.pth", "models/Annotators"),
804
+ ],
805
+ "Annotators:Lineart": [
806
+ ("sd_lora/Annotators", "sk_model.pth", "models/Annotators"),
807
+ ("sd_lora/Annotators", "sk_model2.pth", "models/Annotators"),
808
+ ],
809
+ "Annotators:Normal": [
810
+ ("sd_lora/Annotators", "scannet.pt", "models/Annotators"),
811
+ ],
812
+ "Annotators:Openpose": [
813
+ ("sd_lora/Annotators", "body_pose_model.pth", "models/Annotators"),
814
+ ("sd_lora/Annotators", "facenet.pth", "models/Annotators"),
815
+ ("sd_lora/Annotators", "hand_pose_model.pth", "models/Annotators"),
816
+ ],
817
+ # AnimateDiff
818
+ "AnimateDiff_v2": [
819
+ (
820
+ "Shanghai_AI_Laboratory/animatediff",
821
+ "mm_sd_v15_v2.ckpt",
822
+ "models/AnimateDiff",
823
+ ),
824
+ ],
825
+ "AnimateDiff_xl_beta": [
826
+ (
827
+ "Shanghai_AI_Laboratory/animatediff",
828
+ "mm_sdxl_v10_beta.ckpt",
829
+ "models/AnimateDiff",
830
+ ),
831
+ ],
832
+ # RIFE
833
+ "RIFE": [
834
+ ("Damo_XR_Lab/cv_rife_video-frame-interpolation", "flownet.pkl", "models/RIFE"),
835
+ ],
836
+ # Qwen Prompt
837
+ "QwenPrompt": {
838
+ "file_list": [
839
+ (
840
+ "qwen/Qwen2-1.5B-Instruct",
841
+ "config.json",
842
+ "models/QwenPrompt/qwen2-1.5b-instruct",
843
+ ),
844
+ (
845
+ "qwen/Qwen2-1.5B-Instruct",
846
+ "generation_config.json",
847
+ "models/QwenPrompt/qwen2-1.5b-instruct",
848
+ ),
849
+ (
850
+ "qwen/Qwen2-1.5B-Instruct",
851
+ "model.safetensors",
852
+ "models/QwenPrompt/qwen2-1.5b-instruct",
853
+ ),
854
+ (
855
+ "qwen/Qwen2-1.5B-Instruct",
856
+ "special_tokens_map.json",
857
+ "models/QwenPrompt/qwen2-1.5b-instruct",
858
+ ),
859
+ (
860
+ "qwen/Qwen2-1.5B-Instruct",
861
+ "tokenizer.json",
862
+ "models/QwenPrompt/qwen2-1.5b-instruct",
863
+ ),
864
+ (
865
+ "qwen/Qwen2-1.5B-Instruct",
866
+ "tokenizer_config.json",
867
+ "models/QwenPrompt/qwen2-1.5b-instruct",
868
+ ),
869
+ (
870
+ "qwen/Qwen2-1.5B-Instruct",
871
+ "merges.txt",
872
+ "models/QwenPrompt/qwen2-1.5b-instruct",
873
+ ),
874
+ (
875
+ "qwen/Qwen2-1.5B-Instruct",
876
+ "vocab.json",
877
+ "models/QwenPrompt/qwen2-1.5b-instruct",
878
+ ),
879
+ ],
880
+ "load_path": [
881
+ "models/QwenPrompt/qwen2-1.5b-instruct",
882
+ ],
883
+ },
884
+ # Beautiful Prompt
885
+ "BeautifulPrompt": {
886
+ "file_list": [
887
+ (
888
+ "AI-ModelScope/pai-bloom-1b1-text2prompt-sd",
889
+ "config.json",
890
+ "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd",
891
+ ),
892
+ (
893
+ "AI-ModelScope/pai-bloom-1b1-text2prompt-sd",
894
+ "generation_config.json",
895
+ "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd",
896
+ ),
897
+ (
898
+ "AI-ModelScope/pai-bloom-1b1-text2prompt-sd",
899
+ "model.safetensors",
900
+ "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd",
901
+ ),
902
+ (
903
+ "AI-ModelScope/pai-bloom-1b1-text2prompt-sd",
904
+ "special_tokens_map.json",
905
+ "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd",
906
+ ),
907
+ (
908
+ "AI-ModelScope/pai-bloom-1b1-text2prompt-sd",
909
+ "tokenizer.json",
910
+ "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd",
911
+ ),
912
+ (
913
+ "AI-ModelScope/pai-bloom-1b1-text2prompt-sd",
914
+ "tokenizer_config.json",
915
+ "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd",
916
+ ),
917
+ ],
918
+ "load_path": [
919
+ "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd",
920
+ ],
921
+ },
922
+ # Omost prompt
923
+ "OmostPrompt": {
924
+ "file_list": [
925
+ (
926
+ "Omost/omost-llama-3-8b-4bits",
927
+ "model-00001-of-00002.safetensors",
928
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
929
+ ),
930
+ (
931
+ "Omost/omost-llama-3-8b-4bits",
932
+ "model-00002-of-00002.safetensors",
933
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
934
+ ),
935
+ (
936
+ "Omost/omost-llama-3-8b-4bits",
937
+ "tokenizer.json",
938
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
939
+ ),
940
+ (
941
+ "Omost/omost-llama-3-8b-4bits",
942
+ "tokenizer_config.json",
943
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
944
+ ),
945
+ (
946
+ "Omost/omost-llama-3-8b-4bits",
947
+ "config.json",
948
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
949
+ ),
950
+ (
951
+ "Omost/omost-llama-3-8b-4bits",
952
+ "generation_config.json",
953
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
954
+ ),
955
+ (
956
+ "Omost/omost-llama-3-8b-4bits",
957
+ "model.safetensors.index.json",
958
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
959
+ ),
960
+ (
961
+ "Omost/omost-llama-3-8b-4bits",
962
+ "special_tokens_map.json",
963
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
964
+ ),
965
+ ],
966
+ "load_path": [
967
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
968
+ ],
969
+ },
970
+ # Translator
971
+ "opus-mt-zh-en": {
972
+ "file_list": [
973
+ ("moxying/opus-mt-zh-en", "config.json", "models/translator/opus-mt-zh-en"),
974
+ (
975
+ "moxying/opus-mt-zh-en",
976
+ "generation_config.json",
977
+ "models/translator/opus-mt-zh-en",
978
+ ),
979
+ (
980
+ "moxying/opus-mt-zh-en",
981
+ "metadata.json",
982
+ "models/translator/opus-mt-zh-en",
983
+ ),
984
+ (
985
+ "moxying/opus-mt-zh-en",
986
+ "pytorch_model.bin",
987
+ "models/translator/opus-mt-zh-en",
988
+ ),
989
+ ("moxying/opus-mt-zh-en", "source.spm", "models/translator/opus-mt-zh-en"),
990
+ ("moxying/opus-mt-zh-en", "target.spm", "models/translator/opus-mt-zh-en"),
991
+ (
992
+ "moxying/opus-mt-zh-en",
993
+ "tokenizer_config.json",
994
+ "models/translator/opus-mt-zh-en",
995
+ ),
996
+ ("moxying/opus-mt-zh-en", "vocab.json", "models/translator/opus-mt-zh-en"),
997
+ ],
998
+ "load_path": [
999
+ "models/translator/opus-mt-zh-en",
1000
+ ],
1001
+ },
1002
+ # IP-Adapter
1003
+ "IP-Adapter-SD": [
1004
+ (
1005
+ "AI-ModelScope/IP-Adapter",
1006
+ "models/image_encoder/model.safetensors",
1007
+ "models/IpAdapter/stable_diffusion/image_encoder",
1008
+ ),
1009
+ (
1010
+ "AI-ModelScope/IP-Adapter",
1011
+ "models/ip-adapter_sd15.bin",
1012
+ "models/IpAdapter/stable_diffusion",
1013
+ ),
1014
+ ],
1015
+ "IP-Adapter-SDXL": [
1016
+ (
1017
+ "AI-ModelScope/IP-Adapter",
1018
+ "sdxl_models/image_encoder/model.safetensors",
1019
+ "models/IpAdapter/stable_diffusion_xl/image_encoder",
1020
+ ),
1021
+ (
1022
+ "AI-ModelScope/IP-Adapter",
1023
+ "sdxl_models/ip-adapter_sdxl.bin",
1024
+ "models/IpAdapter/stable_diffusion_xl",
1025
+ ),
1026
+ ],
1027
+ # Kolors
1028
+ "Kolors": {
1029
+ "file_list": [
1030
+ (
1031
+ "Kwai-Kolors/Kolors",
1032
+ "text_encoder/config.json",
1033
+ "models/kolors/Kolors/text_encoder",
1034
+ ),
1035
+ (
1036
+ "Kwai-Kolors/Kolors",
1037
+ "text_encoder/pytorch_model.bin.index.json",
1038
+ "models/kolors/Kolors/text_encoder",
1039
+ ),
1040
+ (
1041
+ "Kwai-Kolors/Kolors",
1042
+ "text_encoder/pytorch_model-00001-of-00007.bin",
1043
+ "models/kolors/Kolors/text_encoder",
1044
+ ),
1045
+ (
1046
+ "Kwai-Kolors/Kolors",
1047
+ "text_encoder/pytorch_model-00002-of-00007.bin",
1048
+ "models/kolors/Kolors/text_encoder",
1049
+ ),
1050
+ (
1051
+ "Kwai-Kolors/Kolors",
1052
+ "text_encoder/pytorch_model-00003-of-00007.bin",
1053
+ "models/kolors/Kolors/text_encoder",
1054
+ ),
1055
+ (
1056
+ "Kwai-Kolors/Kolors",
1057
+ "text_encoder/pytorch_model-00004-of-00007.bin",
1058
+ "models/kolors/Kolors/text_encoder",
1059
+ ),
1060
+ (
1061
+ "Kwai-Kolors/Kolors",
1062
+ "text_encoder/pytorch_model-00005-of-00007.bin",
1063
+ "models/kolors/Kolors/text_encoder",
1064
+ ),
1065
+ (
1066
+ "Kwai-Kolors/Kolors",
1067
+ "text_encoder/pytorch_model-00006-of-00007.bin",
1068
+ "models/kolors/Kolors/text_encoder",
1069
+ ),
1070
+ (
1071
+ "Kwai-Kolors/Kolors",
1072
+ "text_encoder/pytorch_model-00007-of-00007.bin",
1073
+ "models/kolors/Kolors/text_encoder",
1074
+ ),
1075
+ (
1076
+ "Kwai-Kolors/Kolors",
1077
+ "unet/diffusion_pytorch_model.safetensors",
1078
+ "models/kolors/Kolors/unet",
1079
+ ),
1080
+ (
1081
+ "Kwai-Kolors/Kolors",
1082
+ "vae/diffusion_pytorch_model.safetensors",
1083
+ "models/kolors/Kolors/vae",
1084
+ ),
1085
+ ],
1086
+ "load_path": [
1087
+ "models/kolors/Kolors/text_encoder",
1088
+ "models/kolors/Kolors/unet/diffusion_pytorch_model.safetensors",
1089
+ "models/kolors/Kolors/vae/diffusion_pytorch_model.safetensors",
1090
+ ],
1091
+ },
1092
+ "SDXL-vae-fp16-fix": [
1093
+ (
1094
+ "AI-ModelScope/sdxl-vae-fp16-fix",
1095
+ "diffusion_pytorch_model.safetensors",
1096
+ "models/sdxl-vae-fp16-fix",
1097
+ )
1098
+ ],
1099
+ # FLUX
1100
+ "FLUX.1-dev": {
1101
+ "file_list": [
1102
+ (
1103
+ "AI-ModelScope/FLUX.1-dev",
1104
+ "text_encoder/model.safetensors",
1105
+ "models/FLUX/FLUX.1-dev/text_encoder",
1106
+ ),
1107
+ (
1108
+ "AI-ModelScope/FLUX.1-dev",
1109
+ "text_encoder_2/config.json",
1110
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
1111
+ ),
1112
+ (
1113
+ "AI-ModelScope/FLUX.1-dev",
1114
+ "text_encoder_2/model-00001-of-00002.safetensors",
1115
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
1116
+ ),
1117
+ (
1118
+ "AI-ModelScope/FLUX.1-dev",
1119
+ "text_encoder_2/model-00002-of-00002.safetensors",
1120
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
1121
+ ),
1122
+ (
1123
+ "AI-ModelScope/FLUX.1-dev",
1124
+ "text_encoder_2/model.safetensors.index.json",
1125
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
1126
+ ),
1127
+ ("AI-ModelScope/FLUX.1-dev", "ae.safetensors", "models/FLUX/FLUX.1-dev"),
1128
+ (
1129
+ "AI-ModelScope/FLUX.1-dev",
1130
+ "flux1-dev.safetensors",
1131
+ "models/FLUX/FLUX.1-dev",
1132
+ ),
1133
+ ],
1134
+ "load_path": [
1135
+ "models/FLUX/FLUX.1-dev/text_encoder/model.safetensors",
1136
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
1137
+ "models/FLUX/FLUX.1-dev/ae.safetensors",
1138
+ "models/FLUX/FLUX.1-dev/flux1-dev.safetensors",
1139
+ ],
1140
+ },
1141
+ "FLUX.1-schnell": {
1142
+ "file_list": [
1143
+ (
1144
+ "AI-ModelScope/FLUX.1-dev",
1145
+ "text_encoder/model.safetensors",
1146
+ "models/FLUX/FLUX.1-dev/text_encoder",
1147
+ ),
1148
+ (
1149
+ "AI-ModelScope/FLUX.1-dev",
1150
+ "text_encoder_2/config.json",
1151
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
1152
+ ),
1153
+ (
1154
+ "AI-ModelScope/FLUX.1-dev",
1155
+ "text_encoder_2/model-00001-of-00002.safetensors",
1156
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
1157
+ ),
1158
+ (
1159
+ "AI-ModelScope/FLUX.1-dev",
1160
+ "text_encoder_2/model-00002-of-00002.safetensors",
1161
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
1162
+ ),
1163
+ (
1164
+ "AI-ModelScope/FLUX.1-dev",
1165
+ "text_encoder_2/model.safetensors.index.json",
1166
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
1167
+ ),
1168
+ ("AI-ModelScope/FLUX.1-dev", "ae.safetensors", "models/FLUX/FLUX.1-dev"),
1169
+ (
1170
+ "AI-ModelScope/FLUX.1-schnell",
1171
+ "flux1-schnell.safetensors",
1172
+ "models/FLUX/FLUX.1-schnell",
1173
+ ),
1174
+ ],
1175
+ "load_path": [
1176
+ "models/FLUX/FLUX.1-dev/text_encoder/model.safetensors",
1177
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
1178
+ "models/FLUX/FLUX.1-dev/ae.safetensors",
1179
+ "models/FLUX/FLUX.1-schnell/flux1-schnell.safetensors",
1180
+ ],
1181
+ },
1182
+ "InstantX/FLUX.1-dev-Controlnet-Union-alpha": [
1183
+ (
1184
+ "InstantX/FLUX.1-dev-Controlnet-Union-alpha",
1185
+ "diffusion_pytorch_model.safetensors",
1186
+ "models/ControlNet/InstantX/FLUX.1-dev-Controlnet-Union-alpha",
1187
+ ),
1188
+ ],
1189
+ "jasperai/Flux.1-dev-Controlnet-Depth": [
1190
+ (
1191
+ "jasperai/Flux.1-dev-Controlnet-Depth",
1192
+ "diffusion_pytorch_model.safetensors",
1193
+ "models/ControlNet/jasperai/Flux.1-dev-Controlnet-Depth",
1194
+ ),
1195
+ ],
1196
+ "jasperai/Flux.1-dev-Controlnet-Surface-Normals": [
1197
+ (
1198
+ "jasperai/Flux.1-dev-Controlnet-Surface-Normals",
1199
+ "diffusion_pytorch_model.safetensors",
1200
+ "models/ControlNet/jasperai/Flux.1-dev-Controlnet-Surface-Normals",
1201
+ ),
1202
+ ],
1203
+ "jasperai/Flux.1-dev-Controlnet-Upscaler": [
1204
+ (
1205
+ "jasperai/Flux.1-dev-Controlnet-Upscaler",
1206
+ "diffusion_pytorch_model.safetensors",
1207
+ "models/ControlNet/jasperai/Flux.1-dev-Controlnet-Upscaler",
1208
+ ),
1209
+ ],
1210
+ "alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Alpha": [
1211
+ (
1212
+ "alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Alpha",
1213
+ "diffusion_pytorch_model.safetensors",
1214
+ "models/ControlNet/alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Alpha",
1215
+ ),
1216
+ ],
1217
+ "alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta": [
1218
+ (
1219
+ "alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta",
1220
+ "diffusion_pytorch_model.safetensors",
1221
+ "models/ControlNet/alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta",
1222
+ ),
1223
+ ],
1224
+ "Shakker-Labs/FLUX.1-dev-ControlNet-Depth": [
1225
+ (
1226
+ "Shakker-Labs/FLUX.1-dev-ControlNet-Depth",
1227
+ "diffusion_pytorch_model.safetensors",
1228
+ "models/ControlNet/Shakker-Labs/FLUX.1-dev-ControlNet-Depth",
1229
+ ),
1230
+ ],
1231
+ "Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro": [
1232
+ (
1233
+ "Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro",
1234
+ "diffusion_pytorch_model.safetensors",
1235
+ "models/ControlNet/Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro",
1236
+ ),
1237
+ ],
1238
+ "InstantX/FLUX.1-dev-IP-Adapter": {
1239
+ "file_list": [
1240
+ (
1241
+ "InstantX/FLUX.1-dev-IP-Adapter",
1242
+ "ip-adapter.bin",
1243
+ "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter",
1244
+ ),
1245
+ (
1246
+ "AI-ModelScope/siglip-so400m-patch14-384",
1247
+ "model.safetensors",
1248
+ "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/image_encoder",
1249
+ ),
1250
+ (
1251
+ "AI-ModelScope/siglip-so400m-patch14-384",
1252
+ "config.json",
1253
+ "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/image_encoder",
1254
+ ),
1255
+ ],
1256
+ "load_path": [
1257
+ "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/ip-adapter.bin",
1258
+ "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/image_encoder",
1259
+ ],
1260
+ },
1261
+ # ESRGAN
1262
+ "ESRGAN_x4": [
1263
+ ("AI-ModelScope/Real-ESRGAN", "RealESRGAN_x4.pth", "models/ESRGAN"),
1264
+ ],
1265
+ # RIFE
1266
+ "RIFE": [
1267
+ ("AI-ModelScope/RIFE", "flownet.pkl", "models/RIFE"),
1268
+ ],
1269
+ # Omnigen
1270
+ "OmniGen-v1": {
1271
+ "file_list": [
1272
+ (
1273
+ "BAAI/OmniGen-v1",
1274
+ "vae/diffusion_pytorch_model.safetensors",
1275
+ "models/OmniGen/OmniGen-v1/vae",
1276
+ ),
1277
+ ("BAAI/OmniGen-v1", "model.safetensors", "models/OmniGen/OmniGen-v1"),
1278
+ ("BAAI/OmniGen-v1", "config.json", "models/OmniGen/OmniGen-v1"),
1279
+ ("BAAI/OmniGen-v1", "special_tokens_map.json", "models/OmniGen/OmniGen-v1"),
1280
+ ("BAAI/OmniGen-v1", "tokenizer_config.json", "models/OmniGen/OmniGen-v1"),
1281
+ ("BAAI/OmniGen-v1", "tokenizer.json", "models/OmniGen/OmniGen-v1"),
1282
+ ],
1283
+ "load_path": [
1284
+ "models/OmniGen/OmniGen-v1/vae/diffusion_pytorch_model.safetensors",
1285
+ "models/OmniGen/OmniGen-v1/model.safetensors",
1286
+ ],
1287
+ },
1288
+ # CogVideo
1289
+ "CogVideoX-5B": {
1290
+ "file_list": [
1291
+ (
1292
+ "ZhipuAI/CogVideoX-5b",
1293
+ "text_encoder/config.json",
1294
+ "models/CogVideo/CogVideoX-5b/text_encoder",
1295
+ ),
1296
+ (
1297
+ "ZhipuAI/CogVideoX-5b",
1298
+ "text_encoder/model.safetensors.index.json",
1299
+ "models/CogVideo/CogVideoX-5b/text_encoder",
1300
+ ),
1301
+ (
1302
+ "ZhipuAI/CogVideoX-5b",
1303
+ "text_encoder/model-00001-of-00002.safetensors",
1304
+ "models/CogVideo/CogVideoX-5b/text_encoder",
1305
+ ),
1306
+ (
1307
+ "ZhipuAI/CogVideoX-5b",
1308
+ "text_encoder/model-00002-of-00002.safetensors",
1309
+ "models/CogVideo/CogVideoX-5b/text_encoder",
1310
+ ),
1311
+ (
1312
+ "ZhipuAI/CogVideoX-5b",
1313
+ "transformer/config.json",
1314
+ "models/CogVideo/CogVideoX-5b/transformer",
1315
+ ),
1316
+ (
1317
+ "ZhipuAI/CogVideoX-5b",
1318
+ "transformer/diffusion_pytorch_model.safetensors.index.json",
1319
+ "models/CogVideo/CogVideoX-5b/transformer",
1320
+ ),
1321
+ (
1322
+ "ZhipuAI/CogVideoX-5b",
1323
+ "transformer/diffusion_pytorch_model-00001-of-00002.safetensors",
1324
+ "models/CogVideo/CogVideoX-5b/transformer",
1325
+ ),
1326
+ (
1327
+ "ZhipuAI/CogVideoX-5b",
1328
+ "transformer/diffusion_pytorch_model-00002-of-00002.safetensors",
1329
+ "models/CogVideo/CogVideoX-5b/transformer",
1330
+ ),
1331
+ (
1332
+ "ZhipuAI/CogVideoX-5b",
1333
+ "vae/diffusion_pytorch_model.safetensors",
1334
+ "models/CogVideo/CogVideoX-5b/vae",
1335
+ ),
1336
+ ],
1337
+ "load_path": [
1338
+ "models/CogVideo/CogVideoX-5b/text_encoder",
1339
+ "models/CogVideo/CogVideoX-5b/transformer",
1340
+ "models/CogVideo/CogVideoX-5b/vae/diffusion_pytorch_model.safetensors",
1341
+ ],
1342
+ },
1343
+ # Stable Diffusion 3.5
1344
+ "StableDiffusion3.5-large": [
1345
+ (
1346
+ "AI-ModelScope/stable-diffusion-3.5-large",
1347
+ "sd3.5_large.safetensors",
1348
+ "models/stable_diffusion_3",
1349
+ ),
1350
+ (
1351
+ "AI-ModelScope/stable-diffusion-3.5-large",
1352
+ "text_encoders/clip_l.safetensors",
1353
+ "models/stable_diffusion_3/text_encoders",
1354
+ ),
1355
+ (
1356
+ "AI-ModelScope/stable-diffusion-3.5-large",
1357
+ "text_encoders/clip_g.safetensors",
1358
+ "models/stable_diffusion_3/text_encoders",
1359
+ ),
1360
+ (
1361
+ "AI-ModelScope/stable-diffusion-3.5-large",
1362
+ "text_encoders/t5xxl_fp16.safetensors",
1363
+ "models/stable_diffusion_3/text_encoders",
1364
+ ),
1365
+ ],
1366
+ "StableDiffusion3.5-medium": [
1367
+ (
1368
+ "AI-ModelScope/stable-diffusion-3.5-medium",
1369
+ "sd3.5_medium.safetensors",
1370
+ "models/stable_diffusion_3",
1371
+ ),
1372
+ (
1373
+ "AI-ModelScope/stable-diffusion-3.5-large",
1374
+ "text_encoders/clip_l.safetensors",
1375
+ "models/stable_diffusion_3/text_encoders",
1376
+ ),
1377
+ (
1378
+ "AI-ModelScope/stable-diffusion-3.5-large",
1379
+ "text_encoders/clip_g.safetensors",
1380
+ "models/stable_diffusion_3/text_encoders",
1381
+ ),
1382
+ (
1383
+ "AI-ModelScope/stable-diffusion-3.5-large",
1384
+ "text_encoders/t5xxl_fp16.safetensors",
1385
+ "models/stable_diffusion_3/text_encoders",
1386
+ ),
1387
+ ],
1388
+ "StableDiffusion3.5-large-turbo": [
1389
+ (
1390
+ "AI-ModelScope/stable-diffusion-3.5-large-turbo",
1391
+ "sd3.5_large_turbo.safetensors",
1392
+ "models/stable_diffusion_3",
1393
+ ),
1394
+ (
1395
+ "AI-ModelScope/stable-diffusion-3.5-large",
1396
+ "text_encoders/clip_l.safetensors",
1397
+ "models/stable_diffusion_3/text_encoders",
1398
+ ),
1399
+ (
1400
+ "AI-ModelScope/stable-diffusion-3.5-large",
1401
+ "text_encoders/clip_g.safetensors",
1402
+ "models/stable_diffusion_3/text_encoders",
1403
+ ),
1404
+ (
1405
+ "AI-ModelScope/stable-diffusion-3.5-large",
1406
+ "text_encoders/t5xxl_fp16.safetensors",
1407
+ "models/stable_diffusion_3/text_encoders",
1408
+ ),
1409
+ ],
1410
+ "HunyuanVideo": {
1411
+ "file_list": [
1412
+ (
1413
+ "AI-ModelScope/clip-vit-large-patch14",
1414
+ "model.safetensors",
1415
+ "models/HunyuanVideo/text_encoder",
1416
+ ),
1417
+ (
1418
+ "DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder",
1419
+ "model-00001-of-00004.safetensors",
1420
+ "models/HunyuanVideo/text_encoder_2",
1421
+ ),
1422
+ (
1423
+ "DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder",
1424
+ "model-00002-of-00004.safetensors",
1425
+ "models/HunyuanVideo/text_encoder_2",
1426
+ ),
1427
+ (
1428
+ "DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder",
1429
+ "model-00003-of-00004.safetensors",
1430
+ "models/HunyuanVideo/text_encoder_2",
1431
+ ),
1432
+ (
1433
+ "DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder",
1434
+ "model-00004-of-00004.safetensors",
1435
+ "models/HunyuanVideo/text_encoder_2",
1436
+ ),
1437
+ (
1438
+ "DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder",
1439
+ "config.json",
1440
+ "models/HunyuanVideo/text_encoder_2",
1441
+ ),
1442
+ (
1443
+ "DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder",
1444
+ "model.safetensors.index.json",
1445
+ "models/HunyuanVideo/text_encoder_2",
1446
+ ),
1447
+ (
1448
+ "AI-ModelScope/HunyuanVideo",
1449
+ "hunyuan-video-t2v-720p/vae/pytorch_model.pt",
1450
+ "models/HunyuanVideo/vae",
1451
+ ),
1452
+ (
1453
+ "AI-ModelScope/HunyuanVideo",
1454
+ "hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states.pt",
1455
+ "models/HunyuanVideo/transformers",
1456
+ ),
1457
+ ],
1458
+ "load_path": [
1459
+ "models/HunyuanVideo/text_encoder/model.safetensors",
1460
+ "models/HunyuanVideo/text_encoder_2",
1461
+ "models/HunyuanVideo/vae/pytorch_model.pt",
1462
+ "models/HunyuanVideo/transformers/mp_rank_00_model_states.pt",
1463
+ ],
1464
+ },
1465
+ "HunyuanVideo-fp8": {
1466
+ "file_list": [
1467
+ (
1468
+ "AI-ModelScope/clip-vit-large-patch14",
1469
+ "model.safetensors",
1470
+ "models/HunyuanVideo/text_encoder",
1471
+ ),
1472
+ (
1473
+ "DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder",
1474
+ "model-00001-of-00004.safetensors",
1475
+ "models/HunyuanVideo/text_encoder_2",
1476
+ ),
1477
+ (
1478
+ "DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder",
1479
+ "model-00002-of-00004.safetensors",
1480
+ "models/HunyuanVideo/text_encoder_2",
1481
+ ),
1482
+ (
1483
+ "DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder",
1484
+ "model-00003-of-00004.safetensors",
1485
+ "models/HunyuanVideo/text_encoder_2",
1486
+ ),
1487
+ (
1488
+ "DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder",
1489
+ "model-00004-of-00004.safetensors",
1490
+ "models/HunyuanVideo/text_encoder_2",
1491
+ ),
1492
+ (
1493
+ "DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder",
1494
+ "config.json",
1495
+ "models/HunyuanVideo/text_encoder_2",
1496
+ ),
1497
+ (
1498
+ "DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder",
1499
+ "model.safetensors.index.json",
1500
+ "models/HunyuanVideo/text_encoder_2",
1501
+ ),
1502
+ (
1503
+ "AI-ModelScope/HunyuanVideo",
1504
+ "hunyuan-video-t2v-720p/vae/pytorch_model.pt",
1505
+ "models/HunyuanVideo/vae",
1506
+ ),
1507
+ (
1508
+ "DiffSynth-Studio/HunyuanVideo-safetensors",
1509
+ "model.fp8.safetensors",
1510
+ "models/HunyuanVideo/transformers",
1511
+ ),
1512
+ ],
1513
+ "load_path": [
1514
+ "models/HunyuanVideo/text_encoder/model.safetensors",
1515
+ "models/HunyuanVideo/text_encoder_2",
1516
+ "models/HunyuanVideo/vae/pytorch_model.pt",
1517
+ "models/HunyuanVideo/transformers/model.fp8.safetensors",
1518
+ ],
1519
+ },
1520
+ }
1521
+ Preset_model_id: TypeAlias = Literal[
1522
+ "HunyuanDiT",
1523
+ "stable-video-diffusion-img2vid-xt",
1524
+ "ExVideo-SVD-128f-v1",
1525
+ "ExVideo-CogVideoX-LoRA-129f-v1",
1526
+ "StableDiffusion_v15",
1527
+ "DreamShaper_8",
1528
+ "AingDiffusion_v12",
1529
+ "Flat2DAnimerge_v45Sharp",
1530
+ "TextualInversion_VeryBadImageNegative_v1.3",
1531
+ "StableDiffusionXL_v1",
1532
+ "BluePencilXL_v200",
1533
+ "StableDiffusionXL_Turbo",
1534
+ "ControlNet_v11f1p_sd15_depth",
1535
+ "ControlNet_v11p_sd15_softedge",
1536
+ "ControlNet_v11f1e_sd15_tile",
1537
+ "ControlNet_v11p_sd15_lineart",
1538
+ "AnimateDiff_v2",
1539
+ "AnimateDiff_xl_beta",
1540
+ "RIFE",
1541
+ "BeautifulPrompt",
1542
+ "opus-mt-zh-en",
1543
+ "IP-Adapter-SD",
1544
+ "IP-Adapter-SDXL",
1545
+ "StableDiffusion3",
1546
+ "StableDiffusion3_without_T5",
1547
+ "Kolors",
1548
+ "SDXL-vae-fp16-fix",
1549
+ "ControlNet_union_sdxl_promax",
1550
+ "FLUX.1-dev",
1551
+ "FLUX.1-schnell",
1552
+ "InstantX/FLUX.1-dev-Controlnet-Union-alpha",
1553
+ "jasperai/Flux.1-dev-Controlnet-Depth",
1554
+ "jasperai/Flux.1-dev-Controlnet-Surface-Normals",
1555
+ "jasperai/Flux.1-dev-Controlnet-Upscaler",
1556
+ "alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Alpha",
1557
+ "alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta",
1558
+ "Shakker-Labs/FLUX.1-dev-ControlNet-Depth",
1559
+ "Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro",
1560
+ "InstantX/FLUX.1-dev-IP-Adapter",
1561
+ "SDXL_lora_zyd232_ChineseInkStyle_SDXL_v1_0",
1562
+ "QwenPrompt",
1563
+ "OmostPrompt",
1564
+ "ESRGAN_x4",
1565
+ "RIFE",
1566
+ "OmniGen-v1",
1567
+ "CogVideoX-5B",
1568
+ "Annotators:Depth",
1569
+ "Annotators:Softedge",
1570
+ "Annotators:Lineart",
1571
+ "Annotators:Normal",
1572
+ "Annotators:Openpose",
1573
+ "StableDiffusion3.5-large",
1574
+ "StableDiffusion3.5-medium",
1575
+ "HunyuanVideo",
1576
+ "HunyuanVideo-fp8",
1577
+ ]
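Each preset name above maps either to a list of (model_id, origin_file_path, local_dir) tuples or to a dict with a "file_list" of such tuples plus a "load_path" that is handed back to the model manager. A minimal sketch of walking one entry, assuming the FantasyTalking directory is on the Python path and using the "RIFE" preset purely as an illustration:

    from diffsynth.configs.model_config import preset_models_on_modelscope

    entry = preset_models_on_modelscope["RIFE"]
    # presets are either a plain list of tuples or a dict carrying "file_list"
    file_data = entry if isinstance(entry, list) else entry.get("file_list", [])
    for model_id, origin_file_path, local_dir in file_data:
        print(f"{model_id}: {origin_file_path} -> {local_dir}")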
FantasyTalking/diffsynth/data/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .video import VideoData, save_frames, save_video
FantasyTalking/diffsynth/data/video.py ADDED
@@ -0,0 +1,188 @@
1
+ import os
2
+
3
+ import imageio
4
+ import numpy as np
5
+ from PIL import Image
6
+ from tqdm import tqdm
7
+
8
+
9
+ class LowMemoryVideo:
10
+ def __init__(self, file_name):
11
+ self.reader = imageio.get_reader(file_name)
12
+
13
+ def __len__(self):
14
+ return self.reader.count_frames()
15
+
16
+ def __getitem__(self, item):
17
+ return Image.fromarray(np.array(self.reader.get_data(item))).convert("RGB")
18
+
19
+ def __del__(self):
20
+ self.reader.close()
21
+
22
+
23
+ def split_file_name(file_name):
24
+ result = []
25
+ number = -1
26
+ for i in file_name:
27
+ if ord(i) >= ord("0") and ord(i) <= ord("9"):
28
+ if number == -1:
29
+ number = 0
30
+ number = number * 10 + ord(i) - ord("0")
31
+ else:
32
+ if number != -1:
33
+ result.append(number)
34
+ number = -1
35
+ result.append(i)
36
+ if number != -1:
37
+ result.append(number)
38
+ result = tuple(result)
39
+ return result
40
+
41
+
42
+ def search_for_images(folder):
43
+ file_list = [
44
+ i for i in os.listdir(folder) if i.endswith(".jpg") or i.endswith(".png")
45
+ ]
46
+ file_list = [(split_file_name(file_name), file_name) for file_name in file_list]
47
+ file_list = [i[1] for i in sorted(file_list)]
48
+ file_list = [os.path.join(folder, i) for i in file_list]
49
+ return file_list
50
+
51
+
52
+ class LowMemoryImageFolder:
53
+ def __init__(self, folder, file_list=None):
54
+ if file_list is None:
55
+ self.file_list = search_for_images(folder)
56
+ else:
57
+ self.file_list = [
58
+ os.path.join(folder, file_name) for file_name in file_list
59
+ ]
60
+
61
+ def __len__(self):
62
+ return len(self.file_list)
63
+
64
+ def __getitem__(self, item):
65
+ return Image.open(self.file_list[item]).convert("RGB")
66
+
67
+ def __del__(self):
68
+ pass
69
+
70
+
71
+ def crop_and_resize(image, height, width):
72
+ image = np.array(image)
73
+ image_height, image_width, _ = image.shape
74
+ if image_height / image_width < height / width:
75
+ croped_width = int(image_height / height * width)
76
+ left = (image_width - croped_width) // 2
77
+ image = image[:, left : left + croped_width]
78
+ image = Image.fromarray(image).resize((width, height))
79
+ else:
80
+ croped_height = int(image_width / width * height)
81
+ left = (image_height - croped_height) // 2
82
+ image = image[left : left + croped_height, :]
83
+ image = Image.fromarray(image).resize((width, height))
84
+ return image
85
+
86
+
87
+ class VideoData:
88
+ def __init__(
89
+ self, video_file=None, image_folder=None, height=None, width=None, **kwargs
90
+ ):
91
+ if video_file is not None:
92
+ self.data_type = "video"
93
+ self.data = LowMemoryVideo(video_file, **kwargs)
94
+ elif image_folder is not None:
95
+ self.data_type = "images"
96
+ self.data = LowMemoryImageFolder(image_folder, **kwargs)
97
+ else:
98
+ raise ValueError("Cannot open video or image folder")
99
+ self.length = None
100
+ self.set_shape(height, width)
101
+
102
+ def raw_data(self):
103
+ frames = []
104
+ for i in range(self.__len__()):
105
+ frames.append(self.__getitem__(i))
106
+ return frames
107
+
108
+ def set_length(self, length):
109
+ self.length = length
110
+
111
+ def set_shape(self, height, width):
112
+ self.height = height
113
+ self.width = width
114
+
115
+ def __len__(self):
116
+ if self.length is None:
117
+ return len(self.data)
118
+ else:
119
+ return self.length
120
+
121
+ def shape(self):
122
+ if self.height is not None and self.width is not None:
123
+ return self.height, self.width
124
+ else:
125
+ width, height = self.__getitem__(0).size  # frames are PIL images, which expose .size, not .shape
126
+ return height, width
127
+
128
+ def __getitem__(self, item):
129
+ frame = self.data.__getitem__(item)
130
+ width, height = frame.size
131
+ if self.height is not None and self.width is not None:
132
+ if self.height != height or self.width != width:
133
+ frame = crop_and_resize(frame, self.height, self.width)
134
+ return frame
135
+
136
+ def __del__(self):
137
+ pass
138
+
139
+ def save_images(self, folder):
140
+ os.makedirs(folder, exist_ok=True)
141
+ for i in tqdm(range(self.__len__()), desc="Saving images"):
142
+ frame = self.__getitem__(i)
143
+ frame.save(os.path.join(folder, f"{i}.png"))
144
+
145
+
146
+ def save_video(frames, save_path, fps, quality=9, ffmpeg_params=None):
147
+ writer = imageio.get_writer(
148
+ save_path, fps=fps, quality=quality, ffmpeg_params=ffmpeg_params
149
+ )
150
+ for frame in tqdm(frames, desc="Saving video"):
151
+ frame = np.array(frame)
152
+ writer.append_data(frame)
153
+ writer.close()
154
+
155
+
156
+ # def save_video(frames, save_path, fps, quality=9, ffmpeg_params=None):
157
+ # writer = imageio.get_writer(save_path, fps=fps, quality=quality, ffmpeg_params=["-crf", "0", "-preset", "veryslow"])
158
+ # for frame in tqdm(frames, desc="Saving video"):
159
+ # frame = np.array(frame)
160
+ # writer.append_data(frame)
161
+ # writer.close()
162
+
163
+ # def save_video_h264(frames, save_path, fps, ffmpeg_params=None):
164
+ # import imageio.v3 as iio
165
+ # from tqdm import tqdm
166
+ # import numpy as np
167
+
168
+ # if ffmpeg_params is None:
169
+ # ffmpeg_params = ["-crf", "0", "-preset", "ultrafast"]  # lossless H.264
170
+
171
+ # writer = iio.get_writer(save_path, fps=fps, codec="libx264", ffmpeg_params=ffmpeg_params)
172
+ # for frame in tqdm(frames, desc="Saving video"):
173
+ # writer.append_data(np.array(frame))
174
+ # writer.close()
175
+
176
+
177
+ def save_frames(frames, save_path):
178
+ os.makedirs(save_path, exist_ok=True)
179
+ for i, frame in enumerate(tqdm(frames, desc="Saving images")):
180
+ frame.save(os.path.join(save_path, f"{i}.png"))
181
+
182
+
183
+ if __name__ == "__main__":
184
+ frames = [
185
+ Image.fromarray(np.random.randint(0, 256, (512, 512, 3), dtype=np.uint8))
186
+ for i in range(81)
187
+ ]
188
+ save_video(frames, "haha.mp4", 23, 5)
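A minimal usage sketch for the video helpers above; the file names are placeholders, not files shipped with the repository:

    from diffsynth.data.video import VideoData, save_video

    video = VideoData(video_file="input.mp4", height=480, width=832)  # frames are cropped/resized lazily on access
    frames = [video[i] for i in range(len(video))]
    save_video(frames, "output.mp4", fps=25, quality=7)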
FantasyTalking/diffsynth/models/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .model_manager import *
FantasyTalking/diffsynth/models/downloader.py ADDED
@@ -0,0 +1,124 @@
1
+ import os
2
+ import shutil
3
+ from typing import List
4
+
5
+ from huggingface_hub import hf_hub_download
6
+ from modelscope import snapshot_download
7
+ from typing_extensions import Literal, TypeAlias
8
+
9
+ from ..configs.model_config import (Preset_model_id,
10
+ preset_models_on_huggingface,
11
+ preset_models_on_modelscope)
12
+
13
+
14
+ def download_from_modelscope(model_id, origin_file_path, local_dir):
15
+ os.makedirs(local_dir, exist_ok=True)
16
+ file_name = os.path.basename(origin_file_path)
17
+ if file_name in os.listdir(local_dir):
18
+ print(f" {file_name} has been already in {local_dir}.")
19
+ else:
20
+ print(f" Start downloading {os.path.join(local_dir, file_name)}")
21
+ snapshot_download(
22
+ model_id, allow_file_pattern=origin_file_path, local_dir=local_dir
23
+ )
24
+ downloaded_file_path = os.path.join(local_dir, origin_file_path)
25
+ target_file_path = os.path.join(local_dir, os.path.split(origin_file_path)[-1])
26
+ if downloaded_file_path != target_file_path:
27
+ shutil.move(downloaded_file_path, target_file_path)
28
+ shutil.rmtree(os.path.join(local_dir, origin_file_path.split("/")[0]))
29
+
30
+
31
+ def download_from_huggingface(model_id, origin_file_path, local_dir):
32
+ os.makedirs(local_dir, exist_ok=True)
33
+ file_name = os.path.basename(origin_file_path)
34
+ if file_name in os.listdir(local_dir):
35
+ print(f" {file_name} has been already in {local_dir}.")
36
+ else:
37
+ print(f" Start downloading {os.path.join(local_dir, file_name)}")
38
+ hf_hub_download(model_id, origin_file_path, local_dir=local_dir)
39
+ downloaded_file_path = os.path.join(local_dir, origin_file_path)
40
+ target_file_path = os.path.join(local_dir, file_name)
41
+ if downloaded_file_path != target_file_path:
42
+ shutil.move(downloaded_file_path, target_file_path)
43
+ shutil.rmtree(os.path.join(local_dir, origin_file_path.split("/")[0]))
44
+
45
+
46
+ Preset_model_website: TypeAlias = Literal[
47
+ "HuggingFace",
48
+ "ModelScope",
49
+ ]
50
+ website_to_preset_models = {
51
+ "HuggingFace": preset_models_on_huggingface,
52
+ "ModelScope": preset_models_on_modelscope,
53
+ }
54
+ website_to_download_fn = {
55
+ "HuggingFace": download_from_huggingface,
56
+ "ModelScope": download_from_modelscope,
57
+ }
58
+
59
+
60
+ def download_customized_models(
61
+ model_id,
62
+ origin_file_path,
63
+ local_dir,
64
+ downloading_priority: List[Preset_model_website] = ["ModelScope", "HuggingFace"],
65
+ ):
66
+ downloaded_files = []
67
+ for website in downloading_priority:
68
+ # Check if the file is downloaded.
69
+ file_to_download = os.path.join(local_dir, os.path.basename(origin_file_path))
70
+ if file_to_download in downloaded_files:
71
+ continue
72
+ # Download
73
+ website_to_download_fn[website](model_id, origin_file_path, local_dir)
74
+ if os.path.basename(origin_file_path) in os.listdir(local_dir):
75
+ downloaded_files.append(file_to_download)
76
+ return downloaded_files
77
+
78
+
79
+ def download_models(
80
+ model_id_list: List[Preset_model_id] = [],
81
+ downloading_priority: List[Preset_model_website] = ["ModelScope", "HuggingFace"],
82
+ ):
83
+ print(f"Downloading models: {model_id_list}")
84
+ downloaded_files = []
85
+ load_files = []
86
+
87
+ for model_id in model_id_list:
88
+ for website in downloading_priority:
89
+ if model_id in website_to_preset_models[website]:
90
+ # Parse model metadata
91
+ model_metadata = website_to_preset_models[website][model_id]
92
+ if isinstance(model_metadata, list):
93
+ file_data = model_metadata
94
+ else:
95
+ file_data = model_metadata.get("file_list", [])
96
+
97
+ # Try downloading the model from this website.
98
+ model_files = []
99
+ for model_id, origin_file_path, local_dir in file_data:
100
+ # Check if the file is downloaded.
101
+ file_to_download = os.path.join(
102
+ local_dir, os.path.basename(origin_file_path)
103
+ )
104
+ if file_to_download in downloaded_files:
105
+ continue
106
+ # Download
107
+ website_to_download_fn[website](
108
+ model_id, origin_file_path, local_dir
109
+ )
110
+ if os.path.basename(origin_file_path) in os.listdir(local_dir):
111
+ downloaded_files.append(file_to_download)
112
+ model_files.append(file_to_download)
113
+
114
+ # If the model is successfully downloaded, break.
115
+ if len(model_files) > 0:
116
+ if (
117
+ isinstance(model_metadata, dict)
118
+ and "load_path" in model_metadata
119
+ ):
120
+ model_files = model_metadata["load_path"]
121
+ load_files.extend(model_files)
122
+ break
123
+
124
+ return load_files
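A minimal sketch of driving the downloader with a preset name; the preset must exist in the tables in model_config.py, and ModelScope is queried before Hugging Face by default:

    from diffsynth.models.downloader import download_models

    load_files = download_models(["RIFE"])
    print(load_files)  # file paths (or the preset's "load_path" entries) to hand to the model manager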
FantasyTalking/diffsynth/models/model_manager.py ADDED
@@ -0,0 +1,582 @@
1
+ import importlib
2
+ import json
3
+ import os
4
+ from typing import List
5
+
6
+ import torch
7
+
8
+ from ..configs.model_config import (huggingface_model_loader_configs,
9
+ model_loader_configs,
10
+ patch_model_loader_configs)
11
+ from .downloader import (Preset_model_id, Preset_model_website,
12
+ download_customized_models, download_models)
13
+ from .utils import (hash_state_dict_keys, init_weights_on_device,
14
+ load_state_dict, split_state_dict_with_prefix)
15
+
16
+
17
+ def load_model_from_single_file(
18
+ state_dict, model_names, model_classes, model_resource, torch_dtype, device
19
+ ):
20
+ loaded_model_names, loaded_models = [], []
21
+ for model_name, model_class in zip(model_names, model_classes):
22
+ print(f" model_name: {model_name} model_class: {model_class.__name__}")
23
+ state_dict_converter = model_class.state_dict_converter()
24
+ if model_resource == "civitai":
25
+ state_dict_results = state_dict_converter.from_civitai(state_dict)
26
+ elif model_resource == "diffusers":
27
+ state_dict_results = state_dict_converter.from_diffusers(state_dict)
28
+ if isinstance(state_dict_results, tuple):
29
+ model_state_dict, extra_kwargs = state_dict_results
30
+ print(
31
+ f" This model is initialized with extra kwargs: {extra_kwargs}"
32
+ )
33
+ else:
34
+ model_state_dict, extra_kwargs = state_dict_results, {}
35
+ torch_dtype = (
36
+ torch.float32
37
+ if extra_kwargs.get("upcast_to_float32", False)
38
+ else torch_dtype
39
+ )
40
+ with init_weights_on_device():
41
+ model = model_class(**extra_kwargs)
42
+ if hasattr(model, "eval"):
43
+ model = model.eval()
44
+ model.load_state_dict(model_state_dict, assign=True)
45
+ model = model.to(dtype=torch_dtype, device=device)
46
+ loaded_model_names.append(model_name)
47
+ loaded_models.append(model)
48
+ return loaded_model_names, loaded_models
49
+
50
+
51
+ def load_model_from_huggingface_folder(
52
+ file_path, model_names, model_classes, torch_dtype, device
53
+ ):
54
+ loaded_model_names, loaded_models = [], []
55
+ for model_name, model_class in zip(model_names, model_classes):
56
+ if torch_dtype in [torch.float32, torch.float16, torch.bfloat16]:
57
+ model = model_class.from_pretrained(
58
+ file_path, torch_dtype=torch_dtype
59
+ ).eval()
60
+ else:
61
+ model = model_class.from_pretrained(file_path).eval().to(dtype=torch_dtype)
62
+ if torch_dtype == torch.float16 and hasattr(model, "half"):
63
+ model = model.half()
64
+ try:
65
+ model = model.to(device=device)
66
+ except:
67
+ pass
68
+ loaded_model_names.append(model_name)
69
+ loaded_models.append(model)
70
+ return loaded_model_names, loaded_models
71
+
72
+
73
+ def load_single_patch_model_from_single_file(
74
+ state_dict, model_name, model_class, base_model, extra_kwargs, torch_dtype, device
75
+ ):
76
+ print(
77
+ f" model_name: {model_name} model_class: {model_class.__name__} extra_kwargs: {extra_kwargs}"
78
+ )
79
+ base_state_dict = base_model.state_dict()
80
+ base_model.to("cpu")
81
+ del base_model
82
+ model = model_class(**extra_kwargs)
83
+ model.load_state_dict(base_state_dict, strict=False)
84
+ model.load_state_dict(state_dict, strict=False)
85
+ model.to(dtype=torch_dtype, device=device)
86
+ return model
87
+
88
+
89
+ def load_patch_model_from_single_file(
90
+ state_dict,
91
+ model_names,
92
+ model_classes,
93
+ extra_kwargs,
94
+ model_manager,
95
+ torch_dtype,
96
+ device,
97
+ ):
98
+ loaded_model_names, loaded_models = [], []
99
+ for model_name, model_class in zip(model_names, model_classes):
100
+ while True:
101
+ for model_id in range(len(model_manager.model)):
102
+ base_model_name = model_manager.model_name[model_id]
103
+ if base_model_name == model_name:
104
+ base_model_path = model_manager.model_path[model_id]
105
+ base_model = model_manager.model[model_id]
106
+ print(
107
+ f" Adding patch model to {base_model_name} ({base_model_path})"
108
+ )
109
+ patched_model = load_single_patch_model_from_single_file(
110
+ state_dict,
111
+ model_name,
112
+ model_class,
113
+ base_model,
114
+ extra_kwargs,
115
+ torch_dtype,
116
+ device,
117
+ )
118
+ loaded_model_names.append(base_model_name)
119
+ loaded_models.append(patched_model)
120
+ model_manager.model.pop(model_id)
121
+ model_manager.model_path.pop(model_id)
122
+ model_manager.model_name.pop(model_id)
123
+ break
124
+ else:
125
+ break
126
+ return loaded_model_names, loaded_models
127
+
128
+
129
+ class ModelDetectorTemplate:
130
+ def __init__(self):
131
+ pass
132
+
133
+ def match(self, file_path="", state_dict={}):
134
+ return False
135
+
136
+ def load(
137
+ self,
138
+ file_path="",
139
+ state_dict={},
140
+ device="cuda",
141
+ torch_dtype=torch.float16,
142
+ **kwargs,
143
+ ):
144
+ return [], []
145
+
146
+
147
+ class ModelDetectorFromSingleFile:
148
+ def __init__(self, model_loader_configs=[]):
149
+ self.keys_hash_with_shape_dict = {}
150
+ self.keys_hash_dict = {}
151
+ for metadata in model_loader_configs:
152
+ self.add_model_metadata(*metadata)
153
+
154
+ def add_model_metadata(
155
+ self,
156
+ keys_hash,
157
+ keys_hash_with_shape,
158
+ model_names,
159
+ model_classes,
160
+ model_resource,
161
+ ):
162
+ self.keys_hash_with_shape_dict[keys_hash_with_shape] = (
163
+ model_names,
164
+ model_classes,
165
+ model_resource,
166
+ )
167
+ if keys_hash is not None:
168
+ self.keys_hash_dict[keys_hash] = (
169
+ model_names,
170
+ model_classes,
171
+ model_resource,
172
+ )
173
+
174
+ def match(self, file_path="", state_dict={}):
175
+ if isinstance(file_path, str) and os.path.isdir(file_path):
176
+ return False
177
+ if len(state_dict) == 0:
178
+ state_dict = load_state_dict(file_path)
179
+ keys_hash_with_shape = hash_state_dict_keys(state_dict, with_shape=True)
180
+ if keys_hash_with_shape in self.keys_hash_with_shape_dict:
181
+ return True
182
+ keys_hash = hash_state_dict_keys(state_dict, with_shape=False)
183
+ if keys_hash in self.keys_hash_dict:
184
+ return True
185
+ return False
186
+
187
+ def load(
188
+ self,
189
+ file_path="",
190
+ state_dict={},
191
+ device="cuda",
192
+ torch_dtype=torch.float16,
193
+ **kwargs,
194
+ ):
195
+ if len(state_dict) == 0:
196
+ state_dict = load_state_dict(file_path)
197
+
198
+ # Load models with strict matching
199
+ keys_hash_with_shape = hash_state_dict_keys(state_dict, with_shape=True)
200
+ if keys_hash_with_shape in self.keys_hash_with_shape_dict:
201
+ model_names, model_classes, model_resource = self.keys_hash_with_shape_dict[
202
+ keys_hash_with_shape
203
+ ]
204
+ loaded_model_names, loaded_models = load_model_from_single_file(
205
+ state_dict,
206
+ model_names,
207
+ model_classes,
208
+ model_resource,
209
+ torch_dtype,
210
+ device,
211
+ )
212
+ return loaded_model_names, loaded_models
213
+
214
+ # Load models without strict matching
215
+ # (the shape of parameters may be inconsistent, and the state_dict_converter will modify the model architecture)
216
+ keys_hash = hash_state_dict_keys(state_dict, with_shape=False)
217
+ if keys_hash in self.keys_hash_dict:
218
+ model_names, model_classes, model_resource = self.keys_hash_dict[keys_hash]
219
+ loaded_model_names, loaded_models = load_model_from_single_file(
220
+ state_dict,
221
+ model_names,
222
+ model_classes,
223
+ model_resource,
224
+ torch_dtype,
225
+ device,
226
+ )
227
+ return loaded_model_names, loaded_models
228
+
229
+ return [], []  # no key hash matched, so nothing was loaded
230
+
231
+
232
+ class ModelDetectorFromSplitedSingleFile(ModelDetectorFromSingleFile):
233
+ def __init__(self, model_loader_configs=[]):
234
+ super().__init__(model_loader_configs)
235
+
236
+ def match(self, file_path="", state_dict={}):
237
+ if isinstance(file_path, str) and os.path.isdir(file_path):
238
+ return False
239
+ if len(state_dict) == 0:
240
+ state_dict = load_state_dict(file_path)
241
+ splited_state_dict = split_state_dict_with_prefix(state_dict)
242
+ for sub_state_dict in splited_state_dict:
243
+ if super().match(file_path, sub_state_dict):
244
+ return True
245
+ return False
246
+
247
+ def load(
248
+ self,
249
+ file_path="",
250
+ state_dict={},
251
+ device="cuda",
252
+ torch_dtype=torch.float16,
253
+ **kwargs,
254
+ ):
255
+ # Split the state_dict and load from each component
256
+ splited_state_dict = split_state_dict_with_prefix(state_dict)
257
+ valid_state_dict = {}
258
+ for sub_state_dict in splited_state_dict:
259
+ if super().match(file_path, sub_state_dict):
260
+ valid_state_dict.update(sub_state_dict)
261
+ if super().match(file_path, valid_state_dict):
262
+ loaded_model_names, loaded_models = super().load(
263
+ file_path, valid_state_dict, device, torch_dtype
264
+ )
265
+ else:
266
+ loaded_model_names, loaded_models = [], []
267
+ for sub_state_dict in splited_state_dict:
268
+ if super().match(file_path, sub_state_dict):
269
+ loaded_model_names_, loaded_models_ = super().load(
270
+ file_path, sub_state_dict, device, torch_dtype
271
+ )
272
+ loaded_model_names += loaded_model_names_
273
+ loaded_models += loaded_models_
274
+ return loaded_model_names, loaded_models
275
+
276
+
277
+ class ModelDetectorFromHuggingfaceFolder:
278
+ def __init__(self, model_loader_configs=[]):
279
+ self.architecture_dict = {}
280
+ for metadata in model_loader_configs:
281
+ self.add_model_metadata(*metadata)
282
+
283
+ def add_model_metadata(
284
+ self, architecture, huggingface_lib, model_name, redirected_architecture
285
+ ):
286
+ self.architecture_dict[architecture] = (
287
+ huggingface_lib,
288
+ model_name,
289
+ redirected_architecture,
290
+ )
291
+
292
+ def match(self, file_path="", state_dict={}):
293
+ if not isinstance(file_path, str) or os.path.isfile(file_path):
294
+ return False
295
+ file_list = os.listdir(file_path)
296
+ if "config.json" not in file_list:
297
+ return False
298
+ with open(os.path.join(file_path, "config.json"), "r") as f:
299
+ config = json.load(f)
300
+ if "architectures" not in config and "_class_name" not in config:
301
+ return False
302
+ return True
303
+
304
+ def load(
305
+ self,
306
+ file_path="",
307
+ state_dict={},
308
+ device="cuda",
309
+ torch_dtype=torch.float16,
310
+ **kwargs,
311
+ ):
312
+ with open(os.path.join(file_path, "config.json"), "r") as f:
313
+ config = json.load(f)
314
+ loaded_model_names, loaded_models = [], []
315
+ architectures = (
316
+ config["architectures"]
317
+ if "architectures" in config
318
+ else [config["_class_name"]]
319
+ )
320
+ for architecture in architectures:
321
+ (
322
+ huggingface_lib,
323
+ model_name,
324
+ redirected_architecture,
325
+ ) = self.architecture_dict[architecture]
326
+ if redirected_architecture is not None:
327
+ architecture = redirected_architecture
328
+ model_class = importlib.import_module(huggingface_lib).__getattribute__(
329
+ architecture
330
+ )
331
+ loaded_model_names_, loaded_models_ = load_model_from_huggingface_folder(
332
+ file_path, [model_name], [model_class], torch_dtype, device
333
+ )
334
+ loaded_model_names += loaded_model_names_
335
+ loaded_models += loaded_models_
336
+ return loaded_model_names, loaded_models
337
+
338
+
339
+ class ModelDetectorFromPatchedSingleFile:
340
+ def __init__(self, model_loader_configs=[]):
341
+ self.keys_hash_with_shape_dict = {}
342
+ for metadata in model_loader_configs:
343
+ self.add_model_metadata(*metadata)
344
+
345
+ def add_model_metadata(
346
+ self, keys_hash_with_shape, model_name, model_class, extra_kwargs
347
+ ):
348
+ self.keys_hash_with_shape_dict[keys_hash_with_shape] = (
349
+ model_name,
350
+ model_class,
351
+ extra_kwargs,
352
+ )
353
+
354
+ def match(self, file_path="", state_dict={}):
355
+ if not isinstance(file_path, str) or os.path.isdir(file_path):
356
+ return False
357
+ if len(state_dict) == 0:
358
+ state_dict = load_state_dict(file_path)
359
+ keys_hash_with_shape = hash_state_dict_keys(state_dict, with_shape=True)
360
+ if keys_hash_with_shape in self.keys_hash_with_shape_dict:
361
+ return True
362
+ return False
363
+
364
+ def load(
365
+ self,
366
+ file_path="",
367
+ state_dict={},
368
+ device="cuda",
369
+ torch_dtype=torch.float16,
370
+ model_manager=None,
371
+ **kwargs,
372
+ ):
373
+ if len(state_dict) == 0:
374
+ state_dict = load_state_dict(file_path)
375
+
376
+ # Load models with strict matching
377
+ loaded_model_names, loaded_models = [], []
378
+ keys_hash_with_shape = hash_state_dict_keys(state_dict, with_shape=True)
379
+ if keys_hash_with_shape in self.keys_hash_with_shape_dict:
380
+ model_names, model_classes, extra_kwargs = self.keys_hash_with_shape_dict[
381
+ keys_hash_with_shape
382
+ ]
383
+ loaded_model_names_, loaded_models_ = load_patch_model_from_single_file(
384
+ state_dict,
385
+ model_names,
386
+ model_classes,
387
+ extra_kwargs,
388
+ model_manager,
389
+ torch_dtype,
390
+ device,
391
+ )
392
+ loaded_model_names += loaded_model_names_
393
+ loaded_models += loaded_models_
394
+ return loaded_model_names, loaded_models
395
+
396
+
397
+ class ModelManager:
398
+ def __init__(
399
+ self,
400
+ torch_dtype=torch.float16,
401
+ device="cuda",
402
+ model_id_list: List[Preset_model_id] = [],
403
+ downloading_priority: List[Preset_model_website] = [
404
+ "ModelScope",
405
+ "HuggingFace",
406
+ ],
407
+ file_path_list: List[str] = [],
408
+ ):
409
+ self.torch_dtype = torch_dtype
410
+ self.device = device
411
+ self.model = []
412
+ self.model_path = []
413
+ self.model_name = []
414
+ downloaded_files = (
415
+ download_models(model_id_list, downloading_priority)
416
+ if len(model_id_list) > 0
417
+ else []
418
+ )
419
+ self.model_detector = [
420
+ ModelDetectorFromSingleFile(model_loader_configs),
421
+ ModelDetectorFromSplitedSingleFile(model_loader_configs),
422
+ ModelDetectorFromHuggingfaceFolder(huggingface_model_loader_configs),
423
+ ModelDetectorFromPatchedSingleFile(patch_model_loader_configs),
424
+ ]
425
+ self.load_models(downloaded_files + file_path_list)
426
+
427
+ def load_model_from_single_file(
428
+ self,
429
+ file_path="",
430
+ state_dict={},
431
+ model_names=[],
432
+ model_classes=[],
433
+ model_resource=None,
434
+ ):
435
+ print(f"Loading models from file: {file_path}")
436
+ if len(state_dict) == 0:
437
+ state_dict = load_state_dict(file_path)
438
+ model_names, models = load_model_from_single_file(
439
+ state_dict,
440
+ model_names,
441
+ model_classes,
442
+ model_resource,
443
+ self.torch_dtype,
444
+ self.device,
445
+ )
446
+ for model_name, model in zip(model_names, models):
447
+ self.model.append(model)
448
+ self.model_path.append(file_path)
449
+ self.model_name.append(model_name)
450
+ print(f" The following models are loaded: {model_names}.")
451
+
452
+ def load_model_from_huggingface_folder(
453
+ self, file_path="", model_names=[], model_classes=[]
454
+ ):
455
+ print(f"Loading models from folder: {file_path}")
456
+ model_names, models = load_model_from_huggingface_folder(
457
+ file_path, model_names, model_classes, self.torch_dtype, self.device
458
+ )
459
+ for model_name, model in zip(model_names, models):
460
+ self.model.append(model)
461
+ self.model_path.append(file_path)
462
+ self.model_name.append(model_name)
463
+ print(f" The following models are loaded: {model_names}.")
464
+
465
+ def load_patch_model_from_single_file(
466
+ self,
467
+ file_path="",
468
+ state_dict={},
469
+ model_names=[],
470
+ model_classes=[],
471
+ extra_kwargs={},
472
+ ):
473
+ print(f"Loading patch models from file: {file_path}")
474
+ model_names, models = load_patch_model_from_single_file(
475
+ state_dict,
476
+ model_names,
477
+ model_classes,
478
+ extra_kwargs,
479
+ self,
480
+ self.torch_dtype,
481
+ self.device,
482
+ )
483
+ for model_name, model in zip(model_names, models):
484
+ self.model.append(model)
485
+ self.model_path.append(file_path)
486
+ self.model_name.append(model_name)
487
+ print(f" The following patched models are loaded: {model_names}.")
488
+
489
+ def load_lora(self, file_path="", state_dict={}, lora_alpha=1.0):
490
+ if isinstance(file_path, list):
491
+ for file_path_ in file_path:
492
+ self.load_lora(file_path_, state_dict=state_dict, lora_alpha=lora_alpha)
493
+ else:
494
+ print(f"Loading LoRA models from file: {file_path}")
495
+ if len(state_dict) == 0:
496
+ state_dict = load_state_dict(file_path)
497
+ for model_name, model, model_path in zip(
498
+ self.model_name, self.model, self.model_path
499
+ ):
500
+ for lora in get_lora_loaders():
501
+ match_results = lora.match(model, state_dict)
502
+ if match_results is not None:
503
+ print(f" Adding LoRA to {model_name} ({model_path}).")
504
+ lora_prefix, model_resource = match_results
505
+ lora.load(
506
+ model,
507
+ state_dict,
508
+ lora_prefix,
509
+ alpha=lora_alpha,
510
+ model_resource=model_resource,
511
+ )
512
+ break
513
+
514
+ def load_model(self, file_path, model_names=None, device=None, torch_dtype=None):
515
+ print(f"Loading models from: {file_path}")
516
+ if device is None:
517
+ device = self.device
518
+ if torch_dtype is None:
519
+ torch_dtype = self.torch_dtype
520
+ if isinstance(file_path, list):
521
+ state_dict = {}
522
+ for path in file_path:
523
+ state_dict.update(load_state_dict(path))
524
+ elif os.path.isfile(file_path):
525
+ state_dict = load_state_dict(file_path)
526
+ else:
527
+ state_dict = None
528
+ for model_detector in self.model_detector:
529
+ if model_detector.match(file_path, state_dict):
530
+ model_names, models = model_detector.load(
531
+ file_path,
532
+ state_dict,
533
+ device=device,
534
+ torch_dtype=torch_dtype,
535
+ allowed_model_names=model_names,
536
+ model_manager=self,
537
+ )
538
+ for model_name, model in zip(model_names, models):
539
+ self.model.append(model)
540
+ self.model_path.append(file_path)
541
+ self.model_name.append(model_name)
542
+ print(f" The following models are loaded: {model_names}.")
543
+ break
544
+ else:
545
+ print(f" We cannot detect the model type. No models are loaded.")
546
+
547
+ def load_models(
548
+ self, file_path_list, model_names=None, device=None, torch_dtype=None
549
+ ):
550
+ for file_path in file_path_list:
551
+ self.load_model(
552
+ file_path, model_names, device=device, torch_dtype=torch_dtype
553
+ )
554
+
555
+ def fetch_model(self, model_name, file_path=None, require_model_path=False):
556
+ fetched_models = []
557
+ fetched_model_paths = []
558
+ for model, model_path, model_name_ in zip(
559
+ self.model, self.model_path, self.model_name
560
+ ):
561
+ if file_path is not None and file_path != model_path:
562
+ continue
563
+ if model_name == model_name_:
564
+ fetched_models.append(model)
565
+ fetched_model_paths.append(model_path)
566
+ if len(fetched_models) == 0:
567
+ print(f"No {model_name} models available.")
568
+ return None
569
+ if len(fetched_models) == 1:
570
+ print(f"Using {model_name} from {fetched_model_paths[0]}.")
571
+ else:
572
+ print(
573
+ f"More than one {model_name} models are loaded in model manager: {fetched_model_paths}. Using {model_name} from {fetched_model_paths[0]}."
574
+ )
575
+ if require_model_path:
576
+ return fetched_models[0], fetched_model_paths[0]
577
+ else:
578
+ return fetched_models[0]
579
+
580
+ def to(self, device):
581
+ for model in self.model:
582
+ model.to(device)
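
For orientation, a minimal usage sketch of the manager class above, assuming it is exported as ModelManager from diffsynth/models/model_manager.py; the checkpoint path, the LoRA path, and the registered name "wan_video_dit" are illustrative placeholders rather than files shipped in this commit.

import torch

from diffsynth.models.model_manager import ModelManager

# Build the manager; nothing is downloaded or loaded at this point.
model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")

# Each file is routed through the registered detectors (single file, split file,
# Huggingface folder, patched single file) and loaded by whichever one matches.
model_manager.load_models(["models/wan/diffusion_pytorch_model.safetensors"])

# Retrieve a loaded model by its registered name; returns None (with a message) if absent.
dit = model_manager.fetch_model("wan_video_dit")

# Optionally merge a LoRA into any matching loaded model, then move everything to GPU.
model_manager.load_lora("models/lora/example_lora.safetensors", lora_alpha=1.0)
model_manager.to("cuda")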
FantasyTalking/diffsynth/models/utils.py ADDED
@@ -0,0 +1,217 @@
1
+ import hashlib
2
+ import os
3
+ from contextlib import contextmanager
4
+
5
+ import torch
6
+ from safetensors import safe_open
7
+
8
+
9
+ @contextmanager
10
+ def init_weights_on_device(device=torch.device("meta"), include_buffers: bool = False):
11
+ old_register_parameter = torch.nn.Module.register_parameter
12
+ if include_buffers:
13
+ old_register_buffer = torch.nn.Module.register_buffer
14
+
15
+ def register_empty_parameter(module, name, param):
16
+ old_register_parameter(module, name, param)
17
+ if param is not None:
18
+ param_cls = type(module._parameters[name])
19
+ kwargs = module._parameters[name].__dict__
20
+ kwargs["requires_grad"] = param.requires_grad
21
+ module._parameters[name] = param_cls(
22
+ module._parameters[name].to(device), **kwargs
23
+ )
24
+
25
+ def register_empty_buffer(module, name, buffer, persistent=True):
26
+ old_register_buffer(module, name, buffer, persistent=persistent)
27
+ if buffer is not None:
28
+ module._buffers[name] = module._buffers[name].to(device)
29
+
30
+ def patch_tensor_constructor(fn):
31
+ def wrapper(*args, **kwargs):
32
+ kwargs["device"] = device
33
+ return fn(*args, **kwargs)
34
+
35
+ return wrapper
36
+
37
+ if include_buffers:
38
+ tensor_constructors_to_patch = {
39
+ torch_function_name: getattr(torch, torch_function_name)
40
+ for torch_function_name in ["empty", "zeros", "ones", "full"]
41
+ }
42
+ else:
43
+ tensor_constructors_to_patch = {}
44
+
45
+ try:
46
+ torch.nn.Module.register_parameter = register_empty_parameter
47
+ if include_buffers:
48
+ torch.nn.Module.register_buffer = register_empty_buffer
49
+ for torch_function_name in tensor_constructors_to_patch.keys():
50
+ setattr(
51
+ torch,
52
+ torch_function_name,
53
+ patch_tensor_constructor(getattr(torch, torch_function_name)),
54
+ )
55
+ yield
56
+ finally:
57
+ torch.nn.Module.register_parameter = old_register_parameter
58
+ if include_buffers:
59
+ torch.nn.Module.register_buffer = old_register_buffer
60
+ for (
61
+ torch_function_name,
62
+ old_torch_function,
63
+ ) in tensor_constructors_to_patch.items():
64
+ setattr(torch, torch_function_name, old_torch_function)
65
+
66
+
67
+ def load_state_dict_from_folder(file_path, torch_dtype=None):
68
+ state_dict = {}
69
+ for file_name in os.listdir(file_path):
70
+ if "." in file_name and file_name.split(".")[-1] in [
71
+ "safetensors",
72
+ "bin",
73
+ "ckpt",
74
+ "pth",
75
+ "pt",
76
+ ]:
77
+ state_dict.update(
78
+ load_state_dict(
79
+ os.path.join(file_path, file_name), torch_dtype=torch_dtype
80
+ )
81
+ )
82
+ return state_dict
83
+
84
+
85
+ def load_state_dict(file_path, torch_dtype=None):
86
+ if file_path.endswith(".safetensors"):
87
+ return load_state_dict_from_safetensors(file_path, torch_dtype=torch_dtype)
88
+ else:
89
+ return load_state_dict_from_bin(file_path, torch_dtype=torch_dtype)
90
+
91
+
92
+ def load_state_dict_from_safetensors(file_path, torch_dtype=None):
93
+ state_dict = {}
94
+ with safe_open(file_path, framework="pt", device="cpu") as f:
95
+ for k in f.keys():
96
+ state_dict[k] = f.get_tensor(k)
97
+ if torch_dtype is not None:
98
+ state_dict[k] = state_dict[k].to(torch_dtype)
99
+ return state_dict
100
+
101
+
102
+ def load_state_dict_from_bin(file_path, torch_dtype=None):
103
+ state_dict = torch.load(file_path, map_location="cpu", weights_only=True)
104
+ if torch_dtype is not None:
105
+ for i in state_dict:
106
+ if isinstance(state_dict[i], torch.Tensor):
107
+ state_dict[i] = state_dict[i].to(torch_dtype)
108
+ return state_dict
109
+
110
+
111
+ def search_for_embeddings(state_dict):
112
+ embeddings = []
113
+ for k in state_dict:
114
+ if isinstance(state_dict[k], torch.Tensor):
115
+ embeddings.append(state_dict[k])
116
+ elif isinstance(state_dict[k], dict):
117
+ embeddings += search_for_embeddings(state_dict[k])
118
+ return embeddings
119
+
120
+
121
+ def search_parameter(param, state_dict):
122
+ for name, param_ in state_dict.items():
123
+ if param.numel() == param_.numel():
124
+ if param.shape == param_.shape:
125
+ if torch.dist(param, param_) < 1e-3:
126
+ return name
127
+ else:
128
+ if torch.dist(param.flatten(), param_.flatten()) < 1e-3:
129
+ return name
130
+ return None
131
+
132
+
133
+ def build_rename_dict(source_state_dict, target_state_dict, split_qkv=False):
134
+ matched_keys = set()
135
+ with torch.no_grad():
136
+ for name in source_state_dict:
137
+ rename = search_parameter(source_state_dict[name], target_state_dict)
138
+ if rename is not None:
139
+ print(f'"{name}": "{rename}",')
140
+ matched_keys.add(rename)
141
+ elif (
142
+ split_qkv
143
+ and len(source_state_dict[name].shape) >= 1
144
+ and source_state_dict[name].shape[0] % 3 == 0
145
+ ):
146
+ length = source_state_dict[name].shape[0] // 3
147
+ rename = []
148
+ for i in range(3):
149
+ rename.append(
150
+ search_parameter(
151
+ source_state_dict[name][i * length : i * length + length],
152
+ target_state_dict,
153
+ )
154
+ )
155
+ if None not in rename:
156
+ print(f'"{name}": {rename},')
157
+ for rename_ in rename:
158
+ matched_keys.add(rename_)
159
+ for name in target_state_dict:
160
+ if name not in matched_keys:
161
+ print("Cannot find", name, target_state_dict[name].shape)
162
+
163
+
164
+ def search_for_files(folder, extensions):
165
+ files = []
166
+ if os.path.isdir(folder):
167
+ for file in sorted(os.listdir(folder)):
168
+ files += search_for_files(os.path.join(folder, file), extensions)
169
+ elif os.path.isfile(folder):
170
+ for extension in extensions:
171
+ if folder.endswith(extension):
172
+ files.append(folder)
173
+ break
174
+ return files
175
+
176
+
177
+ def convert_state_dict_keys_to_single_str(state_dict, with_shape=True):
178
+ keys = []
179
+ for key, value in state_dict.items():
180
+ if isinstance(key, str):
181
+ if isinstance(value, torch.Tensor):
182
+ if with_shape:
183
+ shape = "_".join(map(str, list(value.shape)))
184
+ keys.append(key + ":" + shape)
185
+ keys.append(key)
186
+ elif isinstance(value, dict):
187
+ keys.append(
188
+ key
189
+ + "|"
190
+ + convert_state_dict_keys_to_single_str(
191
+ value, with_shape=with_shape
192
+ )
193
+ )
194
+ keys.sort()
195
+ keys_str = ",".join(keys)
196
+ return keys_str
197
+
198
+
199
+ def split_state_dict_with_prefix(state_dict):
200
+ keys = sorted([key for key in state_dict if isinstance(key, str)])
201
+ prefix_dict = {}
202
+ for key in keys:
203
+ prefix = key if "." not in key else key.split(".")[0]
204
+ if prefix not in prefix_dict:
205
+ prefix_dict[prefix] = []
206
+ prefix_dict[prefix].append(key)
207
+ state_dicts = []
208
+ for prefix, keys in prefix_dict.items():
209
+ sub_state_dict = {key: state_dict[key] for key in keys}
210
+ state_dicts.append(sub_state_dict)
211
+ return state_dicts
212
+
213
+
214
+ def hash_state_dict_keys(state_dict, with_shape=True):
215
+ keys_str = convert_state_dict_keys_to_single_str(state_dict, with_shape=with_shape)
216
+ keys_str = keys_str.encode(encoding="UTF-8")
217
+ return hashlib.md5(keys_str).hexdigest()
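
For orientation, a short sketch of how the helpers above combine; the checkpoint path is a placeholder. hash_state_dict_keys is the fingerprint that the WanModel state-dict converter further down uses to recognise checkpoint variants.

import torch

from diffsynth.models.utils import (
    hash_state_dict_keys,
    init_weights_on_device,
    load_state_dict,
)

# Load weights from a .safetensors (or torch-pickled) checkpoint, optionally casting tensors.
state_dict = load_state_dict("models/example_checkpoint.safetensors", torch_dtype=torch.bfloat16)

# MD5 over the sorted "key:shape" strings of the state dict.
print(hash_state_dict_keys(state_dict))

# Construct modules with meta (shape-only) parameters so large models cost no host memory up front.
with init_weights_on_device():
    layer = torch.nn.Linear(4096, 4096)
print(layer.weight.device)  # meta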
FantasyTalking/diffsynth/models/wan_video_dit.py ADDED
@@ -0,0 +1,998 @@
1
+ import math
2
+
3
+ import torch
4
+ import torch.amp as amp
5
+ import torch.nn as nn
6
+ from tqdm import tqdm
7
+
8
+ from .utils import hash_state_dict_keys
9
+
10
+ try:
11
+ import flash_attn_interface
12
+
13
+ FLASH_ATTN_3_AVAILABLE = True
14
+ except ModuleNotFoundError:
15
+ FLASH_ATTN_3_AVAILABLE = False
16
+
17
+ try:
18
+ import flash_attn
19
+
20
+ FLASH_ATTN_2_AVAILABLE = True
21
+ except ModuleNotFoundError:
22
+ FLASH_ATTN_2_AVAILABLE = False
23
+
24
+ try:
25
+ from sageattention import sageattn
26
+
27
+ SAGE_ATTN_AVAILABLE = True
28
+ except ModuleNotFoundError:
29
+ SAGE_ATTN_AVAILABLE = False
30
+
31
+ import warnings
32
+
33
+ __all__ = ["WanModel"]
34
+
35
+ def attention(
36
+ q,
37
+ k,
38
+ v,
39
+ q_lens=None,
40
+ k_lens=None,
41
+ dropout_p=0.0,
42
+ softmax_scale=None,
43
+ q_scale=None,
44
+ causal=False,
45
+ window_size=(-1, -1),
46
+ deterministic=False,
47
+ dtype=torch.bfloat16,
48
+ version=None):
49
+ if (version is None or version == 3) and FLASH_ATTN_3_AVAILABLE:
50
+ x = flash_attention(
51
+ q=q,
52
+ k=k,
53
+ v=v,
54
+ q_lens=q_lens,
55
+ k_lens=k_lens,
56
+ dropout_p=dropout_p,
57
+ softmax_scale=softmax_scale,
58
+ q_scale=q_scale,
59
+ causal=causal,
60
+ window_size=window_size,
61
+ deterministic=deterministic,
62
+ dtype=dtype,
63
+ version=version,)
64
+ elif FLASH_ATTN_2_AVAILABLE:
65
+ x = flash_attention(
66
+ q=q,
67
+ k=k,
68
+ v=v,
69
+ q_lens=q_lens,
70
+ k_lens=k_lens,
71
+ dropout_p=dropout_p,
72
+ softmax_scale=softmax_scale,
73
+ q_scale=q_scale,
74
+ causal=causal,
75
+ window_size=window_size,
76
+ deterministic=deterministic,
77
+ dtype=dtype,
78
+ version=version,)
79
+ elif SAGE_ATTN_AVAILABLE:
80
+ q = q.unsqueeze(0).transpose(1, 2).to(dtype)
81
+ k = k.unsqueeze(0).transpose(1, 2).to(dtype)
82
+ v = v.unsqueeze(0).transpose(1, 2).to(dtype)
83
+ x = sageattn(q, k, v, dropout_p=dropout_p, is_causal=causal)
84
+ x = x.transpose(1, 2).contiguous()
85
+ else:
86
+ q = q.unsqueeze(0).transpose(1, 2).to(dtype)
87
+ k = k.unsqueeze(0).transpose(1, 2).to(dtype)
88
+ v = v.unsqueeze(0).transpose(1, 2).to(dtype)
89
+ x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
90
+ x = x.transpose(1, 2).contiguous()
91
+ # output
92
+ return x
93
+
94
+
95
+
96
+ def flash_attention(
97
+ q,
98
+ k,
99
+ v,
100
+ q_lens=None,
101
+ k_lens=None,
102
+ dropout_p=0.0,
103
+ softmax_scale=None,
104
+ q_scale=None,
105
+ causal=False,
106
+ window_size=(-1, -1),
107
+ deterministic=False,
108
+ dtype=torch.bfloat16,
109
+ version=None,
110
+ ):
111
+ """
112
+ q: [B, Lq, Nq, C1].
113
+ k: [B, Lk, Nk, C1].
114
+ v: [B, Lk, Nk, C2]. Nq must be divisible by Nk.
115
+ q_lens: [B].
116
+ k_lens: [B].
117
+ dropout_p: float. Dropout probability.
118
+ softmax_scale: float. The scaling of QK^T before applying softmax.
119
+ causal: bool. Whether to apply causal attention mask.
120
+ window_size: (left, right). If not (-1, -1), apply sliding-window local attention.
121
+ deterministic: bool. If True, slightly slower and uses more memory.
122
+ dtype: torch.dtype. Used when the dtype of q/k/v is not float16/bfloat16.
123
+ """
124
+ half_dtypes = (torch.float16, torch.bfloat16)
125
+ assert dtype in half_dtypes
126
+ assert q.device.type == "cuda" and q.size(-1) <= 256
127
+
128
+ # params
129
+ b, lq, lk, out_dtype = q.size(0), q.size(1), k.size(1), q.dtype
130
+
131
+ def half(x):
132
+ return x if x.dtype in half_dtypes else x.to(dtype)
133
+
134
+ # preprocess query
135
+ if q_lens is None:
136
+ q = half(q.flatten(0, 1))
137
+ q_lens = torch.tensor([lq] * b, dtype=torch.int32).to(
138
+ device=q.device, non_blocking=True
139
+ )
140
+ else:
141
+ q = half(torch.cat([u[:v] for u, v in zip(q, q_lens)]))
142
+
143
+ # preprocess key, value
144
+ if k_lens is None:
145
+ k = half(k.flatten(0, 1))
146
+ v = half(v.flatten(0, 1))
147
+ k_lens = torch.tensor([lk] * b, dtype=torch.int32).to(
148
+ device=k.device, non_blocking=True
149
+ )
150
+ else:
151
+ k = half(torch.cat([u[:v] for u, v in zip(k, k_lens)]))
152
+ v = half(torch.cat([u[:v] for u, v in zip(v, k_lens)]))
153
+
154
+ q = q.to(v.dtype)
155
+ k = k.to(v.dtype)
156
+
157
+ if q_scale is not None:
158
+ q = q * q_scale
159
+
160
+ if version is not None and version == 3 and not FLASH_ATTN_3_AVAILABLE:
161
+ warnings.warn(
162
+ "Flash attention 3 is not available, use flash attention 2 instead."
163
+ )
164
+
165
+ # apply attention
166
+ if (version is None or version == 3) and FLASH_ATTN_3_AVAILABLE:
167
+ # Note: dropout_p, window_size are not supported in FA3 now.
168
+ x = flash_attn_interface.flash_attn_varlen_func(
169
+ q=q,
170
+ k=k,
171
+ v=v,
172
+ cu_seqlens_q=torch.cat([q_lens.new_zeros([1]), q_lens])
173
+ .cumsum(0, dtype=torch.int32)
174
+ .to(q.device, non_blocking=True),
175
+ cu_seqlens_k=torch.cat([k_lens.new_zeros([1]), k_lens])
176
+ .cumsum(0, dtype=torch.int32)
177
+ .to(q.device, non_blocking=True),
178
+ seqused_q=None,
179
+ seqused_k=None,
180
+ max_seqlen_q=lq,
181
+ max_seqlen_k=lk,
182
+ softmax_scale=softmax_scale,
183
+ causal=causal,
184
+ deterministic=deterministic,
185
+ )[0].unflatten(0, (b, lq))
186
+ elif FLASH_ATTN_2_AVAILABLE:
187
+ x = flash_attn.flash_attn_varlen_func(
188
+ q=q,
189
+ k=k,
190
+ v=v,
191
+ cu_seqlens_q=torch.cat([q_lens.new_zeros([1]), q_lens])
192
+ .cumsum(0, dtype=torch.int32)
193
+ .to(q.device, non_blocking=True),
194
+ cu_seqlens_k=torch.cat([k_lens.new_zeros([1]), k_lens])
195
+ .cumsum(0, dtype=torch.int32)
196
+ .to(q.device, non_blocking=True),
197
+ max_seqlen_q=lq,
198
+ max_seqlen_k=lk,
199
+ dropout_p=dropout_p,
200
+ softmax_scale=softmax_scale,
201
+ causal=causal,
202
+ window_size=window_size,
203
+ deterministic=deterministic,
204
+ ).unflatten(0, (b, lq))
205
+ elif SAGE_ATTN_AVAILABLE:
206
+ q = q.unsqueeze(0).transpose(1, 2).to(dtype)
207
+ k = k.unsqueeze(0).transpose(1, 2).to(dtype)
208
+ v = v.unsqueeze(0).transpose(1, 2).to(dtype)
209
+ x = sageattn(q, k, v, dropout_p=dropout_p, is_causal=causal)
210
+ x = x.transpose(1, 2).contiguous()
211
+ else:
212
+ q = q.unsqueeze(0).transpose(1, 2).to(dtype)
213
+ k = k.unsqueeze(0).transpose(1, 2).to(dtype)
214
+ v = v.unsqueeze(0).transpose(1, 2).to(dtype)
215
+ x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
216
+ x = x.transpose(1, 2).contiguous()
217
+
218
+ # output
219
+ return x.type(out_dtype)
220
+
221
+
222
+ def create_sdpa_mask(q, k, q_lens, k_lens, causal=False):
223
+ b, lq, lk = q.size(0), q.size(1), k.size(1)
224
+ if q_lens is None:
225
+ q_lens = torch.tensor([lq] * b, dtype=torch.int32)
226
+ if k_lens is None:
227
+ k_lens = torch.tensor([lk] * b, dtype=torch.int32)
228
+ attn_mask = torch.zeros((b, lq, lk), dtype=torch.bool)
229
+ for i in range(b):
230
+ q_len, k_len = q_lens[i], k_lens[i]
231
+ attn_mask[i, q_len:, :] = True
232
+ attn_mask[i, :, k_len:] = True
233
+
234
+ if causal:
235
+ causal_mask = torch.triu(torch.ones((lq, lk), dtype=torch.bool), diagonal=1)
236
+ attn_mask[i, :, :] = torch.logical_or(attn_mask[i, :, :], causal_mask)
237
+
238
+ attn_mask = attn_mask.logical_not().to(q.device, non_blocking=True)
239
+ return attn_mask
240
+
241
+
242
+ def attention(
243
+ q,
244
+ k,
245
+ v,
246
+ q_lens=None,
247
+ k_lens=None,
248
+ dropout_p=0.0,
249
+ softmax_scale=None,
250
+ q_scale=None,
251
+ causal=False,
252
+ window_size=(-1, -1),
253
+ deterministic=False,
254
+ dtype=torch.bfloat16,
255
+ fa_version=None,
256
+ ):
257
+ if FLASH_ATTN_2_AVAILABLE or FLASH_ATTN_3_AVAILABLE:
258
+ return flash_attention(
259
+ q=q,
260
+ k=k,
261
+ v=v,
262
+ q_lens=q_lens,
263
+ k_lens=k_lens,
264
+ dropout_p=dropout_p,
265
+ softmax_scale=softmax_scale,
266
+ q_scale=q_scale,
267
+ causal=causal,
268
+ window_size=window_size,
269
+ deterministic=deterministic,
270
+ dtype=dtype,
271
+ version=fa_version,
272
+ )
273
+ else:
274
+ if q_lens is not None or k_lens is not None:
275
+ warnings.warn(
276
+ "Padding mask is disabled when using scaled_dot_product_attention. It can have a significant impact on performance."
277
+ )
278
+ attn_mask = None
279
+
280
+ q = q.transpose(1, 2).to(dtype)
281
+ k = k.transpose(1, 2).to(dtype)
282
+ v = v.transpose(1, 2).to(dtype)
283
+
284
+ out = torch.nn.functional.scaled_dot_product_attention(
285
+ q, k, v, attn_mask=attn_mask, is_causal=causal, dropout_p=dropout_p
286
+ )
287
+
288
+ out = out.transpose(1, 2).contiguous()
289
+ return out
290
+
291
+
292
+ def sinusoidal_embedding_1d(dim, position):
293
+ # preprocess
294
+ assert dim % 2 == 0
295
+ half = dim // 2
296
+ position = position.type(torch.float64)
297
+
298
+ # calculation
299
+ sinusoid = torch.outer(
300
+ position, torch.pow(10000, -torch.arange(half).to(position).div(half))
301
+ )
302
+ x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1)
303
+ return x
304
+
305
+
306
+ @amp.autocast(enabled=False, device_type="cuda")
307
+ def rope_params(max_seq_len, dim, theta=10000):
308
+ assert dim % 2 == 0
309
+ freqs = torch.outer(
310
+ torch.arange(max_seq_len),
311
+ 1.0 / torch.pow(theta, torch.arange(0, dim, 2).to(torch.float64).div(dim)),
312
+ )
313
+ freqs = torch.polar(torch.ones_like(freqs), freqs)
314
+ return freqs
315
+
316
+
317
+ @amp.autocast(enabled=False, device_type="cuda")
318
+ def rope_apply(x, grid_sizes, freqs):
319
+ n, c = x.size(2), x.size(3) // 2
320
+
321
+ # split freqs
322
+ freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
323
+
324
+ # loop over samples
325
+ output = []
326
+ for i, (f, h, w) in enumerate(grid_sizes.tolist()):
327
+ seq_len = f * h * w
328
+
329
+ # precompute multipliers
330
+ x_i = torch.view_as_complex(
331
+ x[i, :seq_len].to(torch.float64).reshape(seq_len, n, -1, 2)
332
+ )
333
+ freqs_i = torch.cat(
334
+ [
335
+ freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
336
+ freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
337
+ freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1),
338
+ ],
339
+ dim=-1,
340
+ ).reshape(seq_len, 1, -1)
341
+
342
+ # apply rotary embedding
343
+ x_i = torch.view_as_real(x_i * freqs_i).flatten(2)
344
+ x_i = torch.cat([x_i, x[i, seq_len:]])
345
+
346
+ # append to collection
347
+ output.append(x_i)
348
+ return torch.stack(output).float()
349
+
350
+
351
+ class WanRMSNorm(nn.Module):
352
+ def __init__(self, dim, eps=1e-5):
353
+ super().__init__()
354
+ self.dim = dim
355
+ self.eps = eps
356
+ self.weight = nn.Parameter(torch.ones(dim))
357
+
358
+ def forward(self, x):
359
+ return self._norm(x.float()).type_as(x) * self.weight
360
+
361
+ def _norm(self, x):
362
+ return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
363
+
364
+
365
+ class WanLayerNorm(nn.LayerNorm):
366
+ def __init__(self, dim, eps=1e-6, elementwise_affine=False):
367
+ super().__init__(dim, elementwise_affine=elementwise_affine, eps=eps)
368
+
369
+ def forward(self, x):
370
+ return super().forward(x.float()).type_as(x)
371
+
372
+
373
+ class WanSelfAttention(nn.Module):
374
+ def __init__(self, dim, num_heads, window_size=(-1, -1), qk_norm=True, eps=1e-6):
375
+ assert dim % num_heads == 0
376
+ super().__init__()
377
+ self.dim = dim
378
+ self.num_heads = num_heads
379
+ self.head_dim = dim // num_heads
380
+ self.window_size = window_size
381
+ self.qk_norm = qk_norm
382
+ self.eps = eps
383
+
384
+ # layers
385
+ self.q = nn.Linear(dim, dim)
386
+ self.k = nn.Linear(dim, dim)
387
+ self.v = nn.Linear(dim, dim)
388
+ self.o = nn.Linear(dim, dim)
389
+ self.norm_q = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
390
+ self.norm_k = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
391
+
392
+ def forward(self, x, seq_lens, grid_sizes, freqs):
393
+ b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
394
+
395
+ # query, key, value function
396
+ def qkv_fn(x):
397
+ q = self.norm_q(self.q(x)).view(b, s, n, d)
398
+ k = self.norm_k(self.k(x)).view(b, s, n, d)
399
+ v = self.v(x).view(b, s, n, d)
400
+ return q, k, v
401
+
402
+ q, k, v = qkv_fn(x)
403
+
404
+ x = flash_attention(
405
+ q=rope_apply(q, grid_sizes, freqs),
406
+ k=rope_apply(k, grid_sizes, freqs),
407
+ v=v,
408
+ k_lens=seq_lens,
409
+ window_size=self.window_size,
410
+ )
411
+
412
+ # output
413
+ x = x.flatten(2)
414
+ x = self.o(x)
415
+ return x
416
+
417
+
418
+ class WanT2VCrossAttention(WanSelfAttention):
419
+ def forward(self, x, context, context_lens):
420
+ """
421
+ x: [B, L1, C].
422
+ context: [B, L2, C].
423
+ context_lens: [B].
424
+ """
425
+ b, n, d = x.size(0), self.num_heads, self.head_dim
426
+
427
+ # compute query, key, value
428
+ q = self.norm_q(self.q(x)).view(b, -1, n, d)
429
+ k = self.norm_k(self.k(context)).view(b, -1, n, d)
430
+ v = self.v(context).view(b, -1, n, d)
431
+
432
+ # compute attention
433
+ x = flash_attention(q, k, v, k_lens=context_lens)
434
+
435
+ # output
436
+ x = x.flatten(2)
437
+ x = self.o(x)
438
+ return x
439
+
440
+
441
+ class WanI2VCrossAttentionProcessor:
442
+ def __call__(self, attn, x, context, context_lens) -> torch.Tensor:
443
+ """
444
+ x: [B, L1, C].
445
+ context: [B, L2, C].
446
+ context_lens: [B].
447
+ """
448
+ context_img = context[:, :257]
449
+ context = context[:, 257:]
450
+ b, n, d = x.size(0), attn.num_heads, attn.head_dim
451
+
452
+ # compute query, key, value
453
+ q = attn.norm_q(attn.q(x)).view(b, -1, n, d)
454
+ k = attn.norm_k(attn.k(context)).view(b, -1, n, d)
455
+ v = attn.v(context).view(b, -1, n, d)
456
+ k_img = attn.norm_k_img(attn.k_img(context_img)).view(b, -1, n, d)
457
+ v_img = attn.v_img(context_img).view(b, -1, n, d)
458
+ img_x = flash_attention(q, k_img, v_img, k_lens=None)
459
+ # compute attention
460
+ x = flash_attention(q, k, v, k_lens=context_lens)
461
+
462
+ # output
463
+ x = x.flatten(2)
464
+ img_x = img_x.flatten(2)
465
+ x = x + img_x
466
+ x = attn.o(x)
467
+ return x
468
+
469
+
470
+ class WanI2VCrossAttention(WanSelfAttention):
471
+ def __init__(self, dim, num_heads, window_size=(-1, -1), qk_norm=True, eps=1e-6):
472
+ super().__init__(dim, num_heads, window_size, qk_norm, eps)
473
+
474
+ self.k_img = nn.Linear(dim, dim)
475
+ self.v_img = nn.Linear(dim, dim)
476
+ # self.alpha = nn.Parameter(torch.zeros((1, )))
477
+ self.norm_k_img = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
478
+
479
+ processor = WanI2VCrossAttentionProcessor()
480
+ self.set_processor(processor)
481
+
482
+ def set_processor(self, processor) -> None:
483
+ self.processor = processor
484
+
485
+ def get_processor(self):
486
+ return self.processor
487
+
488
+ def forward(
489
+ self,
490
+ x,
491
+ context,
492
+ context_lens,
493
+ audio_proj,
494
+ audio_context_lens,
495
+ latents_num_frames,
496
+ audio_scale: float = 1.0,
497
+ **kwargs,
498
+ ):
499
+ """
500
+ x: [B, L1, C].
501
+ context: [B, L2, C].
502
+ context_lens: [B].
503
+ """
504
+ if audio_proj is None:
505
+ return self.processor(self, x, context, context_lens)
506
+ else:
507
+ return self.processor(
508
+ self,
509
+ x,
510
+ context,
511
+ context_lens,
512
+ audio_proj,
513
+ audio_context_lens,
514
+ latents_num_frames,
515
+ audio_scale,
516
+ )
517
+
518
+
519
+ WANX_CROSSATTENTION_CLASSES = {
520
+ "t2v_cross_attn": WanT2VCrossAttention,
521
+ "i2v_cross_attn": WanI2VCrossAttention,
522
+ }
523
+
524
+
525
+ class WanAttentionBlock(nn.Module):
526
+ def __init__(
527
+ self,
528
+ cross_attn_type,
529
+ dim,
530
+ ffn_dim,
531
+ num_heads,
532
+ window_size=(-1, -1),
533
+ qk_norm=True,
534
+ cross_attn_norm=False,
535
+ eps=1e-6,
536
+ ):
537
+ super().__init__()
538
+ self.dim = dim
539
+ self.ffn_dim = ffn_dim
540
+ self.num_heads = num_heads
541
+ self.window_size = window_size
542
+ self.qk_norm = qk_norm
543
+ self.cross_attn_norm = cross_attn_norm
544
+ self.eps = eps
545
+
546
+ # layers
547
+ self.norm1 = WanLayerNorm(dim, eps)
548
+ self.self_attn = WanSelfAttention(dim, num_heads, window_size, qk_norm, eps)
549
+ self.norm3 = (
550
+ WanLayerNorm(dim, eps, elementwise_affine=True)
551
+ if cross_attn_norm
552
+ else nn.Identity()
553
+ )
554
+ self.cross_attn = WANX_CROSSATTENTION_CLASSES[cross_attn_type](
555
+ dim, num_heads, (-1, -1), qk_norm, eps
556
+ )
557
+ self.norm2 = WanLayerNorm(dim, eps)
558
+ self.ffn = nn.Sequential(
559
+ nn.Linear(dim, ffn_dim),
560
+ nn.GELU(approximate="tanh"),
561
+ nn.Linear(ffn_dim, dim),
562
+ )
563
+
564
+ # modulation
565
+ self.modulation = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)
566
+
567
+ def forward(
568
+ self,
569
+ x,
570
+ e,
571
+ seq_lens,
572
+ grid_sizes,
573
+ freqs,
574
+ context,
575
+ context_lens,
576
+ **kwargs,
577
+ ):
578
+ assert e.dtype == torch.float32
579
+ with amp.autocast(dtype=torch.float32, device_type="cuda"):
580
+ e = (self.modulation.to(dtype=e.dtype, device=e.device) + e).chunk(6, dim=1)
581
+ assert e[0].dtype == torch.float32
582
+
583
+ # self-attention
584
+ y = self.self_attn(
585
+ self.norm1(x).float() * (1 + e[1]) + e[0], seq_lens, grid_sizes, freqs
586
+ )
587
+ with amp.autocast(dtype=torch.float32, device_type="cuda"):
588
+ x = x + y * e[2]
589
+
590
+ # cross-attention & ffn function
591
+ def cross_attn_ffn(x, context, context_lens, e, **kwargs):
592
+ x = x + self.cross_attn(self.norm3(x), context, context_lens, **kwargs)
593
+ y = self.ffn(self.norm2(x).float() * (1 + e[4]) + e[3])
594
+ with amp.autocast(dtype=torch.float32, device_type="cuda"):
595
+ x = x + y * e[5]
596
+ return x
597
+
598
+ x = cross_attn_ffn(x, context, context_lens, e, **kwargs)
599
+ return x
600
+
601
+
602
+ class Head(nn.Module):
603
+ def __init__(self, dim, out_dim, patch_size, eps=1e-6):
604
+ super().__init__()
605
+ self.dim = dim
606
+ self.out_dim = out_dim
607
+ self.patch_size = patch_size
608
+ self.eps = eps
609
+
610
+ # layers
611
+ out_dim = math.prod(patch_size) * out_dim
612
+ self.norm = WanLayerNorm(dim, eps)
613
+ self.head = nn.Linear(dim, out_dim)
614
+
615
+ # modulation
616
+ self.modulation = nn.Parameter(torch.randn(1, 2, dim) / dim**0.5)
617
+
618
+ def forward(self, x, e):
619
+ assert e.dtype == torch.float32
620
+ with amp.autocast(dtype=torch.float32, device_type="cuda"):
621
+ e = (
622
+ self.modulation.to(dtype=e.dtype, device=e.device) + e.unsqueeze(1)
623
+ ).chunk(2, dim=1)
624
+ x = self.head(self.norm(x) * (1 + e[1]) + e[0])
625
+ return x
626
+
627
+
628
+ class MLPProj(torch.nn.Module):
629
+ def __init__(self, in_dim, out_dim):
630
+ super().__init__()
631
+
632
+ self.proj = torch.nn.Sequential(
633
+ torch.nn.LayerNorm(in_dim),
634
+ torch.nn.Linear(in_dim, in_dim),
635
+ torch.nn.GELU(),
636
+ torch.nn.Linear(in_dim, out_dim),
637
+ torch.nn.LayerNorm(out_dim),
638
+ )
639
+
640
+ def forward(self, image_embeds):
641
+ clip_extra_context_tokens = self.proj(image_embeds)
642
+ return clip_extra_context_tokens
643
+
644
+
645
+ class WanModel(nn.Module):
646
+ def __init__(
647
+ self,
648
+ model_type="t2v",
649
+ patch_size=(1, 2, 2),
650
+ text_len=512,
651
+ in_dim=16,
652
+ dim=2048,
653
+ ffn_dim=8192,
654
+ freq_dim=256,
655
+ text_dim=4096,
656
+ out_dim=16,
657
+ num_heads=16,
658
+ num_layers=32,
659
+ window_size=(-1, -1),
660
+ qk_norm=True,
661
+ cross_attn_norm=False,
662
+ eps=1e-6,
663
+ ):
664
+ super().__init__()
665
+
666
+ assert model_type in ["t2v", "i2v"]
667
+ self.model_type = model_type
668
+
669
+ self.patch_size = patch_size
670
+ self.text_len = text_len
671
+ self.in_dim = in_dim
672
+ self.dim = dim
673
+ self.ffn_dim = ffn_dim
674
+ self.freq_dim = freq_dim
675
+ self.text_dim = text_dim
676
+ self.out_dim = out_dim
677
+ self.num_heads = num_heads
678
+ self.num_layers = num_layers
679
+ self.window_size = window_size
680
+ self.qk_norm = qk_norm
681
+ self.cross_attn_norm = cross_attn_norm
682
+ self.eps = eps
683
+
684
+ # embeddings
685
+ self.patch_embedding = nn.Conv3d(
686
+ in_dim, dim, kernel_size=patch_size, stride=patch_size
687
+ )
688
+ self.text_embedding = nn.Sequential(
689
+ nn.Linear(text_dim, dim), nn.GELU(approximate="tanh"), nn.Linear(dim, dim)
690
+ )
691
+
692
+ self.time_embedding = nn.Sequential(
693
+ nn.Linear(freq_dim, dim), nn.SiLU(), nn.Linear(dim, dim)
694
+ )
695
+ self.time_projection = nn.Sequential(nn.SiLU(), nn.Linear(dim, dim * 6))
696
+
697
+ # blocks
698
+ cross_attn_type = "t2v_cross_attn" if model_type == "t2v" else "i2v_cross_attn"
699
+ self.blocks = nn.ModuleList(
700
+ [
701
+ WanAttentionBlock(
702
+ cross_attn_type,
703
+ dim,
704
+ ffn_dim,
705
+ num_heads,
706
+ window_size,
707
+ qk_norm,
708
+ cross_attn_norm,
709
+ eps,
710
+ )
711
+ for _ in range(num_layers)
712
+ ]
713
+ )
714
+
715
+ # head
716
+ self.head = Head(dim, out_dim, patch_size, eps)
717
+
718
+ # buffers (don't use register_buffer otherwise dtype will be changed in to())
719
+ assert (dim % num_heads) == 0 and (dim // num_heads) % 2 == 0
720
+ d = dim // num_heads
721
+ self.freqs = torch.cat(
722
+ [
723
+ rope_params(1024, d - 4 * (d // 6)),
724
+ rope_params(1024, 2 * (d // 6)),
725
+ rope_params(1024, 2 * (d // 6)),
726
+ ],
727
+ dim=1,
728
+ )
729
+
730
+ if model_type == "i2v":
731
+ self.img_emb = MLPProj(1280, dim)
732
+
733
+ # initialize weights
734
+ self.init_weights()
735
+
736
+ def forward(
737
+ self,
738
+ x,
739
+ timestep,
740
+ context,
741
+ seq_len,
742
+ clip_fea=None,
743
+ y=None,
744
+ use_gradient_checkpointing=False,
745
+ audio_proj=None,
746
+ audio_context_lens=None,
747
+ latents_num_frames=None,
748
+ audio_scale=1.0,
749
+ **kwargs,
750
+ ):
751
+ """
752
+ x: A list of videos each with shape [C, T, H, W].
753
+ timestep: [B].
754
+ context: A list of text embeddings each with shape [L, C].
755
+ """
756
+ if self.model_type == "i2v":
757
+ assert clip_fea is not None and y is not None
758
+ # params
759
+ device = x[0].device
760
+ if self.freqs.device != device:
761
+ self.freqs = self.freqs.to(device)
762
+
763
+ if y is not None:
764
+ x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]
765
+
766
+ # embeddings
767
+ x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
768
+ grid_sizes = torch.stack(
769
+ [torch.tensor(u.shape[2:], dtype=torch.long) for u in x]
770
+ ) # [B, 3]: (f, h, w) per sample
771
+ x = [u.flatten(2).transpose(1, 2) for u in x] # each: [1, f*h*w, dim]
772
+ # print(f"x0.shape:{x[0].shape}")
773
+ seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
774
+ assert seq_lens.max() <= seq_len
775
+ x = torch.cat(
776
+ [
777
+ torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))], dim=1)
778
+ for u in x
779
+ ]
780
+ )
781
+
782
+ # time embeddings
783
+ with amp.autocast(dtype=torch.float32, device_type="cuda"):
784
+ e = self.time_embedding(
785
+ sinusoidal_embedding_1d(self.freq_dim, timestep).float()
786
+ )
787
+ e0 = self.time_projection(e).unflatten(1, (6, self.dim))
788
+ assert e.dtype == torch.float32 and e0.dtype == torch.float32
789
+
790
+ # context
791
+ context_lens = None
792
+ context = self.text_embedding(
793
+ torch.stack(
794
+ [
795
+ torch.cat([u, u.new_zeros(self.text_len - u.size(0), u.size(1))])
796
+ for u in context
797
+ ]
798
+ )
799
+ )
800
+
801
+ if clip_fea is not None:
802
+ context_clip = self.img_emb(clip_fea) # bs x 257 x dim
803
+ context = torch.concat([context_clip, context], dim=1)
804
+
805
+ # arguments
806
+ kwargs = dict(
807
+ e=e0,
808
+ seq_lens=seq_lens,
809
+ grid_sizes=grid_sizes,
810
+ freqs=self.freqs,
811
+ context=context,
812
+ context_lens=context_lens,
813
+ audio_proj=audio_proj,
814
+ audio_context_lens=audio_context_lens,
815
+ latents_num_frames=latents_num_frames,
816
+ audio_scale=audio_scale,
817
+ )
818
+
819
+ def create_custom_forward(module):
820
+ def custom_forward(*inputs, **kwargs):
821
+ return module(*inputs, **kwargs)
822
+
823
+ return custom_forward
824
+
825
+ for block in self.blocks:
826
+ if self.training and use_gradient_checkpointing:
827
+ x = torch.utils.checkpoint.checkpoint(
828
+ create_custom_forward(block),
829
+ x,
830
+ **kwargs,
831
+ use_reentrant=False,
832
+ )
833
+ else:
834
+ x = block(x, **kwargs)
835
+
836
+ # head
837
+ x = self.head(x, e)
838
+
839
+ # unpatchify
840
+ x = self.unpatchify(x, grid_sizes)
841
+ x = torch.stack(x).float()
842
+ return x
843
+
844
+ def unpatchify(self, x, grid_sizes):
845
+ c = self.out_dim
846
+ out = []
847
+ for u, v in zip(x, grid_sizes.tolist()):
848
+ u = u[: math.prod(v)].view(*v, *self.patch_size, c)
849
+ u = torch.einsum("fhwpqrc->cfphqwr", u)
850
+ u = u.reshape(c, *[i * j for i, j in zip(v, self.patch_size)])
851
+ out.append(u)
852
+ return out
853
+
854
+ def init_weights(self):
855
+ # basic init
856
+ for m in self.modules():
857
+ if isinstance(m, nn.Linear):
858
+ nn.init.xavier_uniform_(m.weight)
859
+ if m.bias is not None:
860
+ nn.init.zeros_(m.bias)
861
+
862
+ # init embeddings
863
+ nn.init.xavier_uniform_(self.patch_embedding.weight.flatten(1))
864
+ for m in self.text_embedding.modules():
865
+ if isinstance(m, nn.Linear):
866
+ nn.init.normal_(m.weight, std=0.02)
867
+ for m in self.time_embedding.modules():
868
+ if isinstance(m, nn.Linear):
869
+ nn.init.normal_(m.weight, std=0.02)
870
+
871
+ # init output layer
872
+ nn.init.zeros_(self.head.head.weight)
873
+
874
+ @staticmethod
875
+ def state_dict_converter():
876
+ return WanModelStateDictConverter()
877
+
878
+ @property
879
+ def attn_processors(
880
+ self,
881
+ ): # copy from https://github.com/XLabs-AI/x-flux/blob/main/src/flux/model.py
882
+ # set recursively
883
+ processors = {}
884
+
885
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors):
886
+ if hasattr(module, "set_processor"):
887
+ processors[f"{name}.processor"] = module.processor
888
+
889
+ for sub_name, child in module.named_children():
890
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
891
+
892
+ return processors
893
+
894
+ for name, module in self.named_children():
895
+ fn_recursive_add_processors(name, module, processors)
896
+
897
+ return processors
898
+
899
+ def set_attn_processor(self, processor):
900
+ r"""copy from https://github.com/XLabs-AI/x-flux/blob/main/src/flux/model.py
901
+ Sets the attention processor to use to compute attention.
902
+
903
+ Parameters:
904
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
905
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
906
+ for **all** `Attention` layers.
907
+
908
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
909
+ processor. This is strongly recommended when setting trainable attention processors.
910
+
911
+ """
912
+ count = len(self.attn_processors.keys())
913
+
914
+ if isinstance(processor, dict) and len(processor) != count:
915
+ raise ValueError(
916
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
917
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
918
+ )
919
+
920
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
921
+ if hasattr(module, "set_processor"):
922
+ if not isinstance(processor, dict):
923
+ module.set_processor(processor)
924
+ else:
925
+ module.set_processor(processor.pop(f"{name}.processor"))
926
+
927
+ for sub_name, child in module.named_children():
928
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
929
+
930
+ for name, module in self.named_children():
931
+ fn_recursive_attn_processor(name, module, processor)
932
+
933
+
934
+ class WanModelStateDictConverter:
935
+ def __init__(self):
936
+ pass
937
+
938
+ def from_diffusers(self, state_dict):
939
+ return state_dict
940
+
941
+ def from_civitai(self, state_dict):
942
+ if hash_state_dict_keys(state_dict) == "9269f8db9040a9d860eaca435be61814":
943
+ config = {
944
+ "model_type": "t2v",
945
+ "patch_size": (1, 2, 2),
946
+ "text_len": 512,
947
+ "in_dim": 16,
948
+ "dim": 1536,
949
+ "ffn_dim": 8960,
950
+ "freq_dim": 256,
951
+ "text_dim": 4096,
952
+ "out_dim": 16,
953
+ "num_heads": 12,
954
+ "num_layers": 30,
955
+ "window_size": (-1, -1),
956
+ "qk_norm": True,
957
+ "cross_attn_norm": True,
958
+ "eps": 1e-6,
959
+ }
960
+ elif hash_state_dict_keys(state_dict) == "aafcfd9672c3a2456dc46e1cb6e52c70":
961
+ config = {
962
+ "model_type": "t2v",
963
+ "patch_size": (1, 2, 2),
964
+ "text_len": 512,
965
+ "in_dim": 16,
966
+ "dim": 5120,
967
+ "ffn_dim": 13824,
968
+ "freq_dim": 256,
969
+ "text_dim": 4096,
970
+ "out_dim": 16,
971
+ "num_heads": 40,
972
+ "num_layers": 40,
973
+ "window_size": (-1, -1),
974
+ "qk_norm": True,
975
+ "cross_attn_norm": True,
976
+ "eps": 1e-6,
977
+ }
978
+ elif hash_state_dict_keys(state_dict) == "6bfcfb3b342cb286ce886889d519a77e":
979
+ config = {
980
+ "model_type": "i2v",
981
+ "patch_size": (1, 2, 2),
982
+ "text_len": 512,
983
+ "in_dim": 36,
984
+ "dim": 5120,
985
+ "ffn_dim": 13824,
986
+ "freq_dim": 256,
987
+ "text_dim": 4096,
988
+ "out_dim": 16,
989
+ "num_heads": 40,
990
+ "num_layers": 40,
991
+ "window_size": (-1, -1),
992
+ "qk_norm": True,
993
+ "cross_attn_norm": True,
994
+ "eps": 1e-6,
995
+ }
996
+ else:
997
+ config = {}
998
+ return state_dict, config
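
For orientation, a CPU-only sketch of the positional-embedding helpers above, using the default dim=2048 and num_heads=16 from the WanModel constructor; it only exercises tensor shapes and copies the frequency construction from WanModel.__init__.

import torch

from diffsynth.models.wan_video_dit import rope_params, sinusoidal_embedding_1d

# Timestep embedding: one freq_dim-wide sinusoid per diffusion timestep.
timesteps = torch.tensor([0, 250, 999])
print(sinusoidal_embedding_1d(256, timesteps).shape)  # torch.Size([3, 256])

# RoPE frequency tables as WanModel.__init__ builds them; rope_apply later splits them
# back into the same three chunks for the frame, height and width axes.
dim, num_heads = 2048, 16
d = dim // num_heads  # 128 channels per attention head
freqs = torch.cat(
    [
        rope_params(1024, d - 4 * (d // 6)),  # temporal axis
        rope_params(1024, 2 * (d // 6)),      # height axis
        rope_params(1024, 2 * (d // 6)),      # width axis
    ],
    dim=1,
)
print(freqs.shape)  # torch.Size([1024, 64]), i.e. head_dim // 2 complex frequencies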
FantasyTalking/diffsynth/models/wan_video_image_encoder.py ADDED
@@ -0,0 +1,960 @@
1
+ """
2
+ Concise re-implementation of
3
+ ``https://github.com/openai/CLIP'' and
4
+ ``https://github.com/mlfoundations/open_clip''.
5
+ """
6
+ import math
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ import torchvision.transforms as T
12
+
13
+ from .wan_video_dit import flash_attention
14
+
15
+
16
+ class SelfAttention(nn.Module):
17
+ def __init__(self, dim, num_heads, dropout=0.1, eps=1e-5):
18
+ assert dim % num_heads == 0
19
+ super().__init__()
20
+ self.dim = dim
21
+ self.num_heads = num_heads
22
+ self.head_dim = dim // num_heads
23
+ self.eps = eps
24
+
25
+ # layers
26
+ self.q = nn.Linear(dim, dim)
27
+ self.k = nn.Linear(dim, dim)
28
+ self.v = nn.Linear(dim, dim)
29
+ self.o = nn.Linear(dim, dim)
30
+ self.dropout = nn.Dropout(dropout)
31
+
32
+ def forward(self, x, mask):
33
+ """
34
+ x: [B, L, C].
35
+ """
36
+ b, s, c, n, d = *x.size(), self.num_heads, self.head_dim
37
+
38
+ # compute query, key, value
39
+ q = self.q(x).reshape(b, s, n, d).permute(0, 2, 1, 3)
40
+ k = self.k(x).reshape(b, s, n, d).permute(0, 2, 1, 3)
41
+ v = self.v(x).reshape(b, s, n, d).permute(0, 2, 1, 3)
42
+
43
+ # compute attention
44
+ p = self.dropout.p if self.training else 0.0
45
+ x = F.scaled_dot_product_attention(q, k, v, mask, p)
46
+ x = x.permute(0, 2, 1, 3).reshape(b, s, c)
47
+
48
+ # output
49
+ x = self.o(x)
50
+ x = self.dropout(x)
51
+ return x
52
+
53
+
54
+ class AttentionBlock(nn.Module):
55
+ def __init__(self, dim, num_heads, post_norm, dropout=0.1, eps=1e-5):
56
+ super().__init__()
57
+ self.dim = dim
58
+ self.num_heads = num_heads
59
+ self.post_norm = post_norm
60
+ self.eps = eps
61
+
62
+ # layers
63
+ self.attn = SelfAttention(dim, num_heads, dropout, eps)
64
+ self.norm1 = nn.LayerNorm(dim, eps=eps)
65
+ self.ffn = nn.Sequential(
66
+ nn.Linear(dim, dim * 4),
67
+ nn.GELU(),
68
+ nn.Linear(dim * 4, dim),
69
+ nn.Dropout(dropout),
70
+ )
71
+ self.norm2 = nn.LayerNorm(dim, eps=eps)
72
+
73
+ def forward(self, x, mask):
74
+ if self.post_norm:
75
+ x = self.norm1(x + self.attn(x, mask))
76
+ x = self.norm2(x + self.ffn(x))
77
+ else:
78
+ x = x + self.attn(self.norm1(x), mask)
79
+ x = x + self.ffn(self.norm2(x))
80
+ return x
81
+
82
+
83
+ class XLMRoberta(nn.Module):
84
+ """
85
+ XLMRobertaModel with no pooler and no LM head.
86
+ """
87
+
88
+ def __init__(
89
+ self,
90
+ vocab_size=250002,
91
+ max_seq_len=514,
92
+ type_size=1,
93
+ pad_id=1,
94
+ dim=1024,
95
+ num_heads=16,
96
+ num_layers=24,
97
+ post_norm=True,
98
+ dropout=0.1,
99
+ eps=1e-5,
100
+ ):
101
+ super().__init__()
102
+ self.vocab_size = vocab_size
103
+ self.max_seq_len = max_seq_len
104
+ self.type_size = type_size
105
+ self.pad_id = pad_id
106
+ self.dim = dim
107
+ self.num_heads = num_heads
108
+ self.num_layers = num_layers
109
+ self.post_norm = post_norm
110
+ self.eps = eps
111
+
112
+ # embeddings
113
+ self.token_embedding = nn.Embedding(vocab_size, dim, padding_idx=pad_id)
114
+ self.type_embedding = nn.Embedding(type_size, dim)
115
+ self.pos_embedding = nn.Embedding(max_seq_len, dim, padding_idx=pad_id)
116
+ self.dropout = nn.Dropout(dropout)
117
+
118
+ # blocks
119
+ self.blocks = nn.ModuleList(
120
+ [
121
+ AttentionBlock(dim, num_heads, post_norm, dropout, eps)
122
+ for _ in range(num_layers)
123
+ ]
124
+ )
125
+
126
+ # norm layer
127
+ self.norm = nn.LayerNorm(dim, eps=eps)
128
+
129
+ def forward(self, ids):
130
+ """
131
+ ids: [B, L] of torch.LongTensor.
132
+ """
133
+ b, s = ids.shape
134
+ mask = ids.ne(self.pad_id).long()
135
+
136
+ # embeddings
137
+ x = (
138
+ self.token_embedding(ids)
139
+ + self.type_embedding(torch.zeros_like(ids))
140
+ + self.pos_embedding(self.pad_id + torch.cumsum(mask, dim=1) * mask)
141
+ )
142
+ if self.post_norm:
143
+ x = self.norm(x)
144
+ x = self.dropout(x)
145
+
146
+ # blocks
147
+ mask = torch.where(mask.view(b, 1, 1, s).gt(0), 0.0, torch.finfo(x.dtype).min)
148
+ for block in self.blocks:
149
+ x = block(x, mask)
150
+
151
+ # output
152
+ if not self.post_norm:
153
+ x = self.norm(x)
154
+ return x
155
+
156
+
157
+ def xlm_roberta_large(pretrained=False, return_tokenizer=False, device="cpu", **kwargs):
158
+ """
159
+ XLMRobertaLarge adapted from Huggingface.
160
+ """
161
+ # params
162
+ cfg = dict(
163
+ vocab_size=250002,
164
+ max_seq_len=514,
165
+ type_size=1,
166
+ pad_id=1,
167
+ dim=1024,
168
+ num_heads=16,
169
+ num_layers=24,
170
+ post_norm=True,
171
+ dropout=0.1,
172
+ eps=1e-5,
173
+ )
174
+ cfg.update(**kwargs)
175
+
176
+ # init model
177
+ if pretrained:
178
+ from sora import DOWNLOAD_TO_CACHE
179
+
180
+ # init a meta model
181
+ with torch.device("meta"):
182
+ model = XLMRoberta(**cfg)
183
+
184
+ # load checkpoint
185
+ model.load_state_dict(
186
+ torch.load(
187
+ DOWNLOAD_TO_CACHE("models/xlm_roberta/xlm_roberta_large.pth"),
188
+ map_location=device,
189
+ ),
190
+ assign=True,
191
+ )
192
+ else:
193
+ # init a model on device
194
+ with torch.device(device):
195
+ model = XLMRoberta(**cfg)
196
+
197
+ # init tokenizer
198
+ if return_tokenizer:
199
+ from sora.data import HuggingfaceTokenizer
200
+
201
+ tokenizer = HuggingfaceTokenizer(
202
+ name="xlm-roberta-large", seq_len=model.text_len, clean="whitespace"
203
+ )
204
+ return model, tokenizer
205
+ else:
206
+ return model
207
+
208
+
209
+ def pos_interpolate(pos, seq_len):
210
+ if pos.size(1) == seq_len:
211
+ return pos
212
+ else:
213
+ src_grid = int(math.sqrt(pos.size(1)))
214
+ tar_grid = int(math.sqrt(seq_len))
215
+ n = pos.size(1) - src_grid * src_grid
216
+ return torch.cat(
217
+ [
218
+ pos[:, :n],
219
+ F.interpolate(
220
+ pos[:, n:]
221
+ .float()
222
+ .reshape(1, src_grid, src_grid, -1)
223
+ .permute(0, 3, 1, 2),
224
+ size=(tar_grid, tar_grid),
225
+ mode="bicubic",
226
+ align_corners=False,
227
+ )
228
+ .flatten(2)
229
+ .transpose(1, 2),
230
+ ],
231
+ dim=1,
232
+ )
233
+
234
+
235
+ class QuickGELU(nn.Module):
236
+ def forward(self, x):
237
+ return x * torch.sigmoid(1.702 * x)
238
+
239
+
240
+ class LayerNorm(nn.LayerNorm):
241
+ def forward(self, x):
242
+ return super().forward(x.float()).type_as(x)
243
+
244
+
245
+ class SelfAttention(nn.Module):
246
+ def __init__(
247
+ self, dim, num_heads, causal=False, attn_dropout=0.0, proj_dropout=0.0
248
+ ):
249
+ assert dim % num_heads == 0
250
+ super().__init__()
251
+ self.dim = dim
252
+ self.num_heads = num_heads
253
+ self.head_dim = dim // num_heads
254
+ self.causal = causal
255
+ self.attn_dropout = attn_dropout
256
+ self.proj_dropout = proj_dropout
257
+
258
+ # layers
259
+ self.to_qkv = nn.Linear(dim, dim * 3)
260
+ self.proj = nn.Linear(dim, dim)
261
+
262
+ def forward(self, x):
263
+ """
264
+ x: [B, L, C].
265
+ """
266
+ b, s, c, n, d = *x.size(), self.num_heads, self.head_dim
267
+
268
+ # compute query, key, value
269
+ q, k, v = self.to_qkv(x).view(b, s, 3, n, d).unbind(2)
270
+
271
+ # compute attention
272
+ p = self.attn_dropout if self.training else 0.0
273
+ x = flash_attention(q, k, v, dropout_p=p, causal=self.causal, version=2)
274
+ x = x.reshape(b, s, c)
275
+
276
+ # output
277
+ x = self.proj(x)
278
+ x = F.dropout(x, self.proj_dropout, self.training)
279
+ return x
280
+
281
+
282
+ class SwiGLU(nn.Module):
283
+ def __init__(self, dim, mid_dim):
284
+ super().__init__()
285
+ self.dim = dim
286
+ self.mid_dim = mid_dim
287
+
288
+ # layers
289
+ self.fc1 = nn.Linear(dim, mid_dim)
290
+ self.fc2 = nn.Linear(dim, mid_dim)
291
+ self.fc3 = nn.Linear(mid_dim, dim)
292
+
293
+ def forward(self, x):
294
+ x = F.silu(self.fc1(x)) * self.fc2(x)
295
+ x = self.fc3(x)
296
+ return x
297
+
298
+
299
+ class AttentionBlock(nn.Module):
300
+ def __init__(
301
+ self,
302
+ dim,
303
+ mlp_ratio,
304
+ num_heads,
305
+ post_norm=False,
306
+ causal=False,
307
+ activation="quick_gelu",
308
+ attn_dropout=0.0,
309
+ proj_dropout=0.0,
310
+ norm_eps=1e-5,
311
+ ):
312
+ assert activation in ["quick_gelu", "gelu", "swi_glu"]
313
+ super().__init__()
314
+ self.dim = dim
315
+ self.mlp_ratio = mlp_ratio
316
+ self.num_heads = num_heads
317
+ self.post_norm = post_norm
318
+ self.causal = causal
319
+ self.norm_eps = norm_eps
320
+
321
+ # layers
322
+ self.norm1 = LayerNorm(dim, eps=norm_eps)
323
+ self.attn = SelfAttention(dim, num_heads, causal, attn_dropout, proj_dropout)
324
+ self.norm2 = LayerNorm(dim, eps=norm_eps)
325
+ if activation == "swi_glu":
326
+ self.mlp = SwiGLU(dim, int(dim * mlp_ratio))
327
+ else:
328
+ self.mlp = nn.Sequential(
329
+ nn.Linear(dim, int(dim * mlp_ratio)),
330
+ QuickGELU() if activation == "quick_gelu" else nn.GELU(),
331
+ nn.Linear(int(dim * mlp_ratio), dim),
332
+ nn.Dropout(proj_dropout),
333
+ )
334
+
335
+ def forward(self, x):
336
+ if self.post_norm:
337
+ x = x + self.norm1(self.attn(x))
338
+ x = x + self.norm2(self.mlp(x))
339
+ else:
340
+ x = x + self.attn(self.norm1(x))
341
+ x = x + self.mlp(self.norm2(x))
342
+ return x
343
+
344
+
345
+ class AttentionPool(nn.Module):
346
+ def __init__(
347
+ self,
348
+ dim,
349
+ mlp_ratio,
350
+ num_heads,
351
+ activation="gelu",
352
+ proj_dropout=0.0,
353
+ norm_eps=1e-5,
354
+ ):
355
+ assert dim % num_heads == 0
356
+ super().__init__()
357
+ self.dim = dim
358
+ self.mlp_ratio = mlp_ratio
359
+ self.num_heads = num_heads
360
+ self.head_dim = dim // num_heads
361
+ self.proj_dropout = proj_dropout
362
+ self.norm_eps = norm_eps
363
+
364
+ # layers
365
+ gain = 1.0 / math.sqrt(dim)
366
+ self.cls_embedding = nn.Parameter(gain * torch.randn(1, 1, dim))
367
+ self.to_q = nn.Linear(dim, dim)
368
+ self.to_kv = nn.Linear(dim, dim * 2)
369
+ self.proj = nn.Linear(dim, dim)
370
+ self.norm = LayerNorm(dim, eps=norm_eps)
371
+ self.mlp = nn.Sequential(
372
+ nn.Linear(dim, int(dim * mlp_ratio)),
373
+ QuickGELU() if activation == "quick_gelu" else nn.GELU(),
374
+ nn.Linear(int(dim * mlp_ratio), dim),
375
+ nn.Dropout(proj_dropout),
376
+ )
377
+
378
+ def forward(self, x):
379
+ """
380
+ x: [B, L, C].
381
+ """
382
+ b, s, c, n, d = *x.size(), self.num_heads, self.head_dim
383
+
384
+ # compute query, key, value
385
+ q = self.to_q(self.cls_embedding).view(1, 1, n, d).expand(b, -1, -1, -1)
386
+ k, v = self.to_kv(x).view(b, s, 2, n, d).unbind(2)
387
+
388
+ # compute attention
389
+ x = flash_attention(q, k, v, version=2)
390
+ x = x.reshape(b, 1, c)
391
+
392
+ # output
393
+ x = self.proj(x)
394
+ x = F.dropout(x, self.proj_dropout, self.training)
395
+
396
+ # mlp
397
+ x = x + self.mlp(self.norm(x))
398
+ return x[:, 0]
399
+
400
+
401
+ class VisionTransformer(nn.Module):
402
+ def __init__(
403
+ self,
404
+ image_size=224,
405
+ patch_size=16,
406
+ dim=768,
407
+ mlp_ratio=4,
408
+ out_dim=512,
409
+ num_heads=12,
410
+ num_layers=12,
411
+ pool_type="token",
412
+ pre_norm=True,
413
+ post_norm=False,
414
+ activation="quick_gelu",
415
+ attn_dropout=0.0,
416
+ proj_dropout=0.0,
417
+ embedding_dropout=0.0,
418
+ norm_eps=1e-5,
419
+ ):
420
+ if image_size % patch_size != 0:
421
+ print("[WARNING] image_size is not divisible by patch_size", flush=True)
422
+ assert pool_type in ("token", "token_fc", "attn_pool")
423
+ out_dim = out_dim or dim
424
+ super().__init__()
425
+ self.image_size = image_size
426
+ self.patch_size = patch_size
427
+ self.num_patches = (image_size // patch_size) ** 2
428
+ self.dim = dim
429
+ self.mlp_ratio = mlp_ratio
430
+ self.out_dim = out_dim
431
+ self.num_heads = num_heads
432
+ self.num_layers = num_layers
433
+ self.pool_type = pool_type
434
+ self.post_norm = post_norm
435
+ self.norm_eps = norm_eps
436
+
437
+ # embeddings
438
+ gain = 1.0 / math.sqrt(dim)
439
+ self.patch_embedding = nn.Conv2d(
440
+ 3, dim, kernel_size=patch_size, stride=patch_size, bias=not pre_norm
441
+ )
442
+ if pool_type in ("token", "token_fc"):
443
+ self.cls_embedding = nn.Parameter(gain * torch.randn(1, 1, dim))
444
+ self.pos_embedding = nn.Parameter(
445
+ gain
446
+ * torch.randn(
447
+ 1,
448
+ self.num_patches + (1 if pool_type in ("token", "token_fc") else 0),
449
+ dim,
450
+ )
451
+ )
452
+ self.dropout = nn.Dropout(embedding_dropout)
453
+
454
+ # transformer
455
+ self.pre_norm = LayerNorm(dim, eps=norm_eps) if pre_norm else None
456
+ self.transformer = nn.Sequential(
457
+ *[
458
+ AttentionBlock(
459
+ dim,
460
+ mlp_ratio,
461
+ num_heads,
462
+ post_norm,
463
+ False,
464
+ activation,
465
+ attn_dropout,
466
+ proj_dropout,
467
+ norm_eps,
468
+ )
469
+ for _ in range(num_layers)
470
+ ]
471
+ )
472
+ self.post_norm = LayerNorm(dim, eps=norm_eps)
473
+
474
+ # head
475
+ if pool_type == "token":
476
+ self.head = nn.Parameter(gain * torch.randn(dim, out_dim))
477
+ elif pool_type == "token_fc":
478
+ self.head = nn.Linear(dim, out_dim)
479
+ elif pool_type == "attn_pool":
480
+ self.head = AttentionPool(
481
+ dim, mlp_ratio, num_heads, activation, proj_dropout, norm_eps
482
+ )
483
+
484
+ def forward(self, x, interpolation=False, use_31_block=False):
485
+ b = x.size(0)
486
+
487
+ # embeddings
488
+ x = self.patch_embedding(x).flatten(2).permute(0, 2, 1)
489
+ if self.pool_type in ("token", "token_fc"):
490
+ x = torch.cat(
491
+ [
492
+ self.cls_embedding.expand(b, -1, -1).to(
493
+ dtype=x.dtype, device=x.device
494
+ ),
495
+ x,
496
+ ],
497
+ dim=1,
498
+ )
499
+ if interpolation:
500
+ e = pos_interpolate(self.pos_embedding, x.size(1))
501
+ else:
502
+ e = self.pos_embedding
503
+ e = e.to(dtype=x.dtype, device=x.device)
504
+ x = self.dropout(x + e)
505
+ if self.pre_norm is not None:
506
+ x = self.pre_norm(x)
507
+
508
+ # transformer
509
+ if use_31_block:
510
+ x = self.transformer[:-1](x)
511
+ return x
512
+ else:
513
+ x = self.transformer(x)
514
+ return x
515
+
516
+
517
+ class CLIP(nn.Module):
518
+ def __init__(
519
+ self,
520
+ embed_dim=512,
521
+ image_size=224,
522
+ patch_size=16,
523
+ vision_dim=768,
524
+ vision_mlp_ratio=4,
525
+ vision_heads=12,
526
+ vision_layers=12,
527
+ vision_pool="token",
528
+ vision_pre_norm=True,
529
+ vision_post_norm=False,
530
+ vocab_size=49408,
531
+ text_len=77,
532
+ text_dim=512,
533
+ text_mlp_ratio=4,
534
+ text_heads=8,
535
+ text_layers=12,
536
+ text_causal=True,
537
+ text_pool="argmax",
538
+ text_head_bias=False,
539
+ logit_bias=None,
540
+ activation="quick_gelu",
541
+ attn_dropout=0.0,
542
+ proj_dropout=0.0,
543
+ embedding_dropout=0.0,
544
+ norm_eps=1e-5,
545
+ ):
546
+ super().__init__()
547
+ self.embed_dim = embed_dim
548
+ self.image_size = image_size
549
+ self.patch_size = patch_size
550
+ self.vision_dim = vision_dim
551
+ self.vision_mlp_ratio = vision_mlp_ratio
552
+ self.vision_heads = vision_heads
553
+ self.vision_layers = vision_layers
554
+ self.vision_pool = vision_pool
555
+ self.vision_pre_norm = vision_pre_norm
556
+ self.vision_post_norm = vision_post_norm
557
+ self.vocab_size = vocab_size
558
+ self.text_len = text_len
559
+ self.text_dim = text_dim
560
+ self.text_mlp_ratio = text_mlp_ratio
561
+ self.text_heads = text_heads
562
+ self.text_layers = text_layers
563
+ self.text_causal = text_causal
564
+ self.text_pool = text_pool
565
+ self.text_head_bias = text_head_bias
566
+ self.norm_eps = norm_eps
567
+
568
+ # models
569
+ self.visual = VisionTransformer(
570
+ image_size=image_size,
571
+ patch_size=patch_size,
572
+ dim=vision_dim,
573
+ mlp_ratio=vision_mlp_ratio,
574
+ out_dim=embed_dim,
575
+ num_heads=vision_heads,
576
+ num_layers=vision_layers,
577
+ pool_type=vision_pool,
578
+ pre_norm=vision_pre_norm,
579
+ post_norm=vision_post_norm,
580
+ activation=activation,
581
+ attn_dropout=attn_dropout,
582
+ proj_dropout=proj_dropout,
583
+ embedding_dropout=embedding_dropout,
584
+ norm_eps=norm_eps,
585
+ )
586
+ self.textual = TextTransformer(
587
+ vocab_size=vocab_size,
588
+ text_len=text_len,
589
+ dim=text_dim,
590
+ mlp_ratio=text_mlp_ratio,
591
+ out_dim=embed_dim,
592
+ num_heads=text_heads,
593
+ num_layers=text_layers,
594
+ causal=text_causal,
595
+ pool_type=text_pool,
596
+ head_bias=text_head_bias,
597
+ activation=activation,
598
+ attn_dropout=attn_dropout,
599
+ proj_dropout=proj_dropout,
600
+ embedding_dropout=embedding_dropout,
601
+ norm_eps=norm_eps,
602
+ )
603
+ self.log_scale = nn.Parameter(math.log(1 / 0.07) * torch.ones([]))
604
+ if logit_bias is not None:
605
+ self.logit_bias = nn.Parameter(logit_bias * torch.ones([]))
606
+
607
+ # initialize weights
608
+ self.init_weights()
609
+
610
+ def forward(self, imgs, txt_ids):
611
+ """
612
+ imgs: [B, 3, H, W] of torch.float32.
613
+ - mean: [0.48145466, 0.4578275, 0.40821073]
614
+ - std: [0.26862954, 0.26130258, 0.27577711]
615
+ txt_ids: [B, L] of torch.long. Encoded by data.CLIPTokenizer.
616
+ """
617
+ xi = self.visual(imgs)
618
+ xt = self.textual(txt_ids)
619
+ return xi, xt
620
+
621
+ def init_weights(self):
622
+ # embeddings
623
+ nn.init.normal_(self.textual.token_embedding.weight, std=0.02)
624
+ nn.init.normal_(self.visual.patch_embedding.weight, std=0.1)
625
+
626
+ # attentions
627
+ for modality in ["visual", "textual"]:
628
+ dim = self.vision_dim if modality == "visual" else self.text_dim
629
+ transformer = getattr(self, modality).transformer
630
+ proj_gain = (1.0 / math.sqrt(dim)) * (1.0 / math.sqrt(2 * len(transformer)))
631
+ attn_gain = 1.0 / math.sqrt(dim)
632
+ mlp_gain = 1.0 / math.sqrt(2.0 * dim)
633
+ for block in transformer:
634
+ nn.init.normal_(block.attn.to_qkv.weight, std=attn_gain)
635
+ nn.init.normal_(block.attn.proj.weight, std=proj_gain)
636
+ nn.init.normal_(block.mlp[0].weight, std=mlp_gain)
637
+ nn.init.normal_(block.mlp[2].weight, std=proj_gain)
638
+
639
+ def param_groups(self):
640
+ groups = [
641
+ {
642
+ "params": [
643
+ p
644
+ for n, p in self.named_parameters()
645
+ if "norm" in n or n.endswith("bias")
646
+ ],
647
+ "weight_decay": 0.0,
648
+ },
649
+ {
650
+ "params": [
651
+ p
652
+ for n, p in self.named_parameters()
653
+ if not ("norm" in n or n.endswith("bias"))
654
+ ]
655
+ },
656
+ ]
657
+ return groups
658
+
659
+
660
+ class XLMRobertaWithHead(XLMRoberta):
661
+ def __init__(self, **kwargs):
662
+ self.out_dim = kwargs.pop("out_dim")
663
+ super().__init__(**kwargs)
664
+
665
+ # head
666
+ mid_dim = (self.dim + self.out_dim) // 2
667
+ self.head = nn.Sequential(
668
+ nn.Linear(self.dim, mid_dim, bias=False),
669
+ nn.GELU(),
670
+ nn.Linear(mid_dim, self.out_dim, bias=False),
671
+ )
672
+
673
+ def forward(self, ids):
674
+ # xlm-roberta
675
+ x = super().forward(ids)
676
+
677
+ # average pooling
678
+ mask = ids.ne(self.pad_id).unsqueeze(-1).to(x)
679
+ x = (x * mask).sum(dim=1) / mask.sum(dim=1)
680
+
681
+ # head
682
+ x = self.head(x)
683
+ return x
684
+
685
+
686
+ class XLMRobertaCLIP(nn.Module):
687
+ def __init__(
688
+ self,
689
+ embed_dim=1024,
690
+ image_size=224,
691
+ patch_size=14,
692
+ vision_dim=1280,
693
+ vision_mlp_ratio=4,
694
+ vision_heads=16,
695
+ vision_layers=32,
696
+ vision_pool="token",
697
+ vision_pre_norm=True,
698
+ vision_post_norm=False,
699
+ activation="gelu",
700
+ vocab_size=250002,
701
+ max_text_len=514,
702
+ type_size=1,
703
+ pad_id=1,
704
+ text_dim=1024,
705
+ text_heads=16,
706
+ text_layers=24,
707
+ text_post_norm=True,
708
+ text_dropout=0.1,
709
+ attn_dropout=0.0,
710
+ proj_dropout=0.0,
711
+ embedding_dropout=0.0,
712
+ norm_eps=1e-5,
713
+ ):
714
+ super().__init__()
715
+ self.embed_dim = embed_dim
716
+ self.image_size = image_size
717
+ self.patch_size = patch_size
718
+ self.vision_dim = vision_dim
719
+ self.vision_mlp_ratio = vision_mlp_ratio
720
+ self.vision_heads = vision_heads
721
+ self.vision_layers = vision_layers
722
+ self.vision_pre_norm = vision_pre_norm
723
+ self.vision_post_norm = vision_post_norm
724
+ self.activation = activation
725
+ self.vocab_size = vocab_size
726
+ self.max_text_len = max_text_len
727
+ self.type_size = type_size
728
+ self.pad_id = pad_id
729
+ self.text_dim = text_dim
730
+ self.text_heads = text_heads
731
+ self.text_layers = text_layers
732
+ self.text_post_norm = text_post_norm
733
+ self.norm_eps = norm_eps
734
+
735
+ # models
736
+ self.visual = VisionTransformer(
737
+ image_size=image_size,
738
+ patch_size=patch_size,
739
+ dim=vision_dim,
740
+ mlp_ratio=vision_mlp_ratio,
741
+ out_dim=embed_dim,
742
+ num_heads=vision_heads,
743
+ num_layers=vision_layers,
744
+ pool_type=vision_pool,
745
+ pre_norm=vision_pre_norm,
746
+ post_norm=vision_post_norm,
747
+ activation=activation,
748
+ attn_dropout=attn_dropout,
749
+ proj_dropout=proj_dropout,
750
+ embedding_dropout=embedding_dropout,
751
+ norm_eps=norm_eps,
752
+ )
753
+ self.textual = None
754
+ self.log_scale = nn.Parameter(math.log(1 / 0.07) * torch.ones([]))
755
+
756
+ def forward(self, imgs, txt_ids):
757
+ """
758
+ imgs: [B, 3, H, W] of torch.float32.
759
+ - mean: [0.48145466, 0.4578275, 0.40821073]
760
+ - std: [0.26862954, 0.26130258, 0.27577711]
761
+ txt_ids: [B, L] of torch.long.
762
+ Encoded by data.CLIPTokenizer.
763
+ """
764
+ xi = self.visual(imgs)
765
+ xt = self.textual(txt_ids)
766
+ return xi, xt
767
+
768
+ def param_groups(self):
769
+ groups = [
770
+ {
771
+ "params": [
772
+ p
773
+ for n, p in self.named_parameters()
774
+ if "norm" in n or n.endswith("bias")
775
+ ],
776
+ "weight_decay": 0.0,
777
+ },
778
+ {
779
+ "params": [
780
+ p
781
+ for n, p in self.named_parameters()
782
+ if not ("norm" in n or n.endswith("bias"))
783
+ ]
784
+ },
785
+ ]
786
+ return groups
787
+
788
+
789
+ def _clip(
790
+ pretrained=False,
791
+ pretrained_name=None,
792
+ model_cls=CLIP,
793
+ return_transforms=False,
794
+ return_tokenizer=False,
795
+ tokenizer_padding="eos",
796
+ dtype=torch.float32,
797
+ device="cpu",
798
+ **kwargs,
799
+ ):
800
+ # init model
801
+ if pretrained and pretrained_name:
802
+ from sora import BUCKET, DOWNLOAD_TO_CACHE
803
+
804
+ # init a meta model
805
+ with torch.device("meta"):
806
+ model = model_cls(**kwargs)
807
+
808
+ # checkpoint path
809
+ checkpoint = f"models/clip/{pretrained_name}"
810
+ if dtype in (torch.float16, torch.bfloat16):
811
+ suffix = "-" + {torch.float16: "fp16", torch.bfloat16: "bf16"}[dtype]
812
+ if object_exists(BUCKET, f"{checkpoint}{suffix}.pth"):
813
+ checkpoint = f"{checkpoint}{suffix}"
814
+ checkpoint += ".pth"
815
+
816
+ # load
817
+ model.load_state_dict(
818
+ torch.load(DOWNLOAD_TO_CACHE(checkpoint), map_location=device),
819
+ assign=True,
820
+ strict=False,
821
+ )
822
+ else:
823
+ # init a model on device
824
+ with torch.device(device):
825
+ model = model_cls(**kwargs)
826
+
827
+ # set device
828
+ output = (model,)
829
+
830
+ # init transforms
831
+ if return_transforms:
832
+ # mean and std
833
+ if "siglip" in pretrained_name.lower():
834
+ mean, std = [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]
835
+ else:
836
+ mean = [0.48145466, 0.4578275, 0.40821073]
837
+ std = [0.26862954, 0.26130258, 0.27577711]
838
+
839
+ # transforms
840
+ transforms = T.Compose(
841
+ [
842
+ T.Resize(
843
+ (model.image_size, model.image_size),
844
+ interpolation=T.InterpolationMode.BICUBIC,
845
+ ),
846
+ T.ToTensor(),
847
+ T.Normalize(mean=mean, std=std),
848
+ ]
849
+ )
850
+ output += (transforms,)
851
+
852
+ # init tokenizer
853
+ if return_tokenizer:
854
+ from sora import data
855
+
856
+ if "siglip" in pretrained_name.lower():
857
+ tokenizer = data.HuggingfaceTokenizer(
858
+ name=f"timm/{pretrained_name}",
859
+ seq_len=model.text_len,
860
+ clean="canonicalize",
861
+ )
862
+ elif "xlm" in pretrained_name.lower():
863
+ tokenizer = data.HuggingfaceTokenizer(
864
+ name="xlm-roberta-large",
865
+ seq_len=model.max_text_len - 2,
866
+ clean="whitespace",
867
+ )
868
+ elif "mba" in pretrained_name.lower():
869
+ tokenizer = data.HuggingfaceTokenizer(
870
+ name="facebook/xlm-roberta-xl",
871
+ seq_len=model.max_text_len - 2,
872
+ clean="whitespace",
873
+ )
874
+ else:
875
+ tokenizer = data.CLIPTokenizer(
876
+ seq_len=model.text_len, padding=tokenizer_padding
877
+ )
878
+ output += (tokenizer,)
879
+ return output[0] if len(output) == 1 else output
880
+
881
+
882
+ def clip_xlm_roberta_vit_h_14(
883
+ pretrained=False,
884
+ pretrained_name="open-clip-xlm-roberta-large-vit-huge-14",
885
+ **kwargs,
886
+ ):
887
+ cfg = dict(
888
+ embed_dim=1024,
889
+ image_size=224,
890
+ patch_size=14,
891
+ vision_dim=1280,
892
+ vision_mlp_ratio=4,
893
+ vision_heads=16,
894
+ vision_layers=32,
895
+ vision_pool="token",
896
+ activation="gelu",
897
+ vocab_size=250002,
898
+ max_text_len=514,
899
+ type_size=1,
900
+ pad_id=1,
901
+ text_dim=1024,
902
+ text_heads=16,
903
+ text_layers=24,
904
+ text_post_norm=True,
905
+ text_dropout=0.1,
906
+ attn_dropout=0.0,
907
+ proj_dropout=0.0,
908
+ embedding_dropout=0.0,
909
+ )
910
+ cfg.update(**kwargs)
911
+ return _clip(pretrained, pretrained_name, XLMRobertaCLIP, **cfg)
912
+
913
+
914
+ class WanImageEncoder(torch.nn.Module):
915
+ def __init__(self):
916
+ super().__init__()
917
+ # init model
918
+ self.model, self.transforms = clip_xlm_roberta_vit_h_14(
919
+ pretrained=False,
920
+ return_transforms=True,
921
+ return_tokenizer=False,
922
+ dtype=torch.float32,
923
+ device="cpu",
924
+ )
925
+
926
+ def encode_image(self, videos):
927
+ # preprocess
928
+ size = (self.model.image_size,) * 2
929
+ videos = torch.cat(
930
+ [
931
+ F.interpolate(u, size=size, mode="bicubic", align_corners=False)
932
+ for u in videos
933
+ ]
934
+ )
935
+ videos = self.transforms.transforms[-1](videos.mul_(0.5).add_(0.5))
936
+
937
+ # forward
938
+ out = self.model.visual(videos, use_31_block=True)
939
+ return out
940
+
941
+ @staticmethod
942
+ def state_dict_converter():
943
+ return WanImageEncoderStateDictConverter()
944
+
945
+
946
+ class WanImageEncoderStateDictConverter:
947
+ def __init__(self):
948
+ pass
949
+
950
+ def from_diffusers(self, state_dict):
951
+ return state_dict
952
+
953
+ def from_civitai(self, state_dict):
954
+ state_dict_ = {}
955
+ for name, param in state_dict.items():
956
+ if name.startswith("textual."):
957
+ continue
958
+ name = "model." + name
959
+ state_dict_[name] = param
960
+ return state_dict_
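A minimal, standalone sketch of what `WanImageEncoderStateDictConverter.from_civitai` does to checkpoint keys (the toy dictionary below is made up for illustration):

```python
import torch

# Text-tower weights are dropped and the remaining keys are re-rooted under
# "model." so they match the WanImageEncoder wrapper defined above.
raw = {
    "visual.patch_embedding.weight": torch.zeros(1),
    "visual.transformer.0.attn.to_qkv.weight": torch.zeros(1),
    "textual.token_embedding.weight": torch.zeros(1),  # discarded
}
converted = {
    "model." + name: param
    for name, param in raw.items()
    if not name.startswith("textual.")
}
print(sorted(converted))
# ['model.visual.patch_embedding.weight', 'model.visual.transformer.0.attn.to_qkv.weight']
```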
FantasyTalking/diffsynth/models/wan_video_text_encoder.py ADDED
@@ -0,0 +1,289 @@
1
+ import math
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+
8
+ def fp16_clamp(x):
9
+ if x.dtype == torch.float16 and torch.isinf(x).any():
10
+ clamp = torch.finfo(x.dtype).max - 1000
11
+ x = torch.clamp(x, min=-clamp, max=clamp)
12
+ return x
13
+
14
+
15
+ class GELU(nn.Module):
16
+ def forward(self, x):
17
+ return (
18
+ 0.5
19
+ * x
20
+ * (
21
+ 1.0
22
+ + torch.tanh(
23
+ math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))
24
+ )
25
+ )
26
+ )
27
+
28
+
29
+ class T5LayerNorm(nn.Module):
30
+ def __init__(self, dim, eps=1e-6):
31
+ super(T5LayerNorm, self).__init__()
32
+ self.dim = dim
33
+ self.eps = eps
34
+ self.weight = nn.Parameter(torch.ones(dim))
35
+
36
+ def forward(self, x):
37
+ x = x * torch.rsqrt(x.float().pow(2).mean(dim=-1, keepdim=True) + self.eps)
38
+ if self.weight.dtype in [torch.float16, torch.bfloat16]:
39
+ x = x.type_as(self.weight)
40
+ return self.weight * x
41
+
42
+
43
+ class T5Attention(nn.Module):
44
+ def __init__(self, dim, dim_attn, num_heads, dropout=0.1):
45
+ assert dim_attn % num_heads == 0
46
+ super(T5Attention, self).__init__()
47
+ self.dim = dim
48
+ self.dim_attn = dim_attn
49
+ self.num_heads = num_heads
50
+ self.head_dim = dim_attn // num_heads
51
+
52
+ # layers
53
+ self.q = nn.Linear(dim, dim_attn, bias=False)
54
+ self.k = nn.Linear(dim, dim_attn, bias=False)
55
+ self.v = nn.Linear(dim, dim_attn, bias=False)
56
+ self.o = nn.Linear(dim_attn, dim, bias=False)
57
+ self.dropout = nn.Dropout(dropout)
58
+
59
+ def forward(self, x, context=None, mask=None, pos_bias=None):
60
+ """
61
+ x: [B, L1, C].
62
+ context: [B, L2, C] or None.
63
+ mask: [B, L2] or [B, L1, L2] or None.
64
+ """
65
+ # check inputs
66
+ context = x if context is None else context
67
+ b, n, c = x.size(0), self.num_heads, self.head_dim
68
+
69
+ # compute query, key, value
70
+ q = self.q(x).view(b, -1, n, c)
71
+ k = self.k(context).view(b, -1, n, c)
72
+ v = self.v(context).view(b, -1, n, c)
73
+
74
+ # attention bias
75
+ attn_bias = x.new_zeros(b, n, q.size(1), k.size(1))
76
+ if pos_bias is not None:
77
+ attn_bias += pos_bias
78
+ if mask is not None:
79
+ assert mask.ndim in [2, 3]
80
+ mask = mask.view(b, 1, 1, -1) if mask.ndim == 2 else mask.unsqueeze(1)
81
+ attn_bias.masked_fill_(mask == 0, torch.finfo(x.dtype).min)
82
+
83
+ # compute attention (T5 does not use scaling)
84
+ attn = torch.einsum("binc,bjnc->bnij", q, k) + attn_bias
85
+ attn = F.softmax(attn.float(), dim=-1).type_as(attn)
86
+ x = torch.einsum("bnij,bjnc->binc", attn, v)
87
+
88
+ # output
89
+ x = x.reshape(b, -1, n * c)
90
+ x = self.o(x)
91
+ x = self.dropout(x)
92
+ return x
93
+
94
+
95
+ class T5FeedForward(nn.Module):
96
+ def __init__(self, dim, dim_ffn, dropout=0.1):
97
+ super(T5FeedForward, self).__init__()
98
+ self.dim = dim
99
+ self.dim_ffn = dim_ffn
100
+
101
+ # layers
102
+ self.gate = nn.Sequential(nn.Linear(dim, dim_ffn, bias=False), GELU())
103
+ self.fc1 = nn.Linear(dim, dim_ffn, bias=False)
104
+ self.fc2 = nn.Linear(dim_ffn, dim, bias=False)
105
+ self.dropout = nn.Dropout(dropout)
106
+
107
+ def forward(self, x):
108
+ x = self.fc1(x) * self.gate(x)
109
+ x = self.dropout(x)
110
+ x = self.fc2(x)
111
+ x = self.dropout(x)
112
+ return x
113
+
114
+
115
+ class T5SelfAttention(nn.Module):
116
+ def __init__(
117
+ self,
118
+ dim,
119
+ dim_attn,
120
+ dim_ffn,
121
+ num_heads,
122
+ num_buckets,
123
+ shared_pos=True,
124
+ dropout=0.1,
125
+ ):
126
+ super(T5SelfAttention, self).__init__()
127
+ self.dim = dim
128
+ self.dim_attn = dim_attn
129
+ self.dim_ffn = dim_ffn
130
+ self.num_heads = num_heads
131
+ self.num_buckets = num_buckets
132
+ self.shared_pos = shared_pos
133
+
134
+ # layers
135
+ self.norm1 = T5LayerNorm(dim)
136
+ self.attn = T5Attention(dim, dim_attn, num_heads, dropout)
137
+ self.norm2 = T5LayerNorm(dim)
138
+ self.ffn = T5FeedForward(dim, dim_ffn, dropout)
139
+ self.pos_embedding = (
140
+ None
141
+ if shared_pos
142
+ else T5RelativeEmbedding(num_buckets, num_heads, bidirectional=True)
143
+ )
144
+
145
+ def forward(self, x, mask=None, pos_bias=None):
146
+ e = pos_bias if self.shared_pos else self.pos_embedding(x.size(1), x.size(1))
147
+ x = fp16_clamp(x + self.attn(self.norm1(x), mask=mask, pos_bias=e))
148
+ x = fp16_clamp(x + self.ffn(self.norm2(x)))
149
+ return x
150
+
151
+
152
+ class T5RelativeEmbedding(nn.Module):
153
+ def __init__(self, num_buckets, num_heads, bidirectional, max_dist=128):
154
+ super(T5RelativeEmbedding, self).__init__()
155
+ self.num_buckets = num_buckets
156
+ self.num_heads = num_heads
157
+ self.bidirectional = bidirectional
158
+ self.max_dist = max_dist
159
+
160
+ # layers
161
+ self.embedding = nn.Embedding(num_buckets, num_heads)
162
+
163
+ def forward(self, lq, lk):
164
+ device = self.embedding.weight.device
165
+ # rel_pos = torch.arange(lk).unsqueeze(0).to(device) - \
166
+ # torch.arange(lq).unsqueeze(1).to(device)
167
+ rel_pos = torch.arange(lk, device=device).unsqueeze(0) - torch.arange(
168
+ lq, device=device
169
+ ).unsqueeze(1)
170
+ rel_pos = self._relative_position_bucket(rel_pos)
171
+ rel_pos_embeds = self.embedding(rel_pos)
172
+ rel_pos_embeds = rel_pos_embeds.permute(2, 0, 1).unsqueeze(0) # [1, N, Lq, Lk]
173
+ return rel_pos_embeds.contiguous()
174
+
175
+ def _relative_position_bucket(self, rel_pos):
176
+ # preprocess
177
+ if self.bidirectional:
178
+ num_buckets = self.num_buckets // 2
179
+ rel_buckets = (rel_pos > 0).long() * num_buckets
180
+ rel_pos = torch.abs(rel_pos)
181
+ else:
182
+ num_buckets = self.num_buckets
183
+ rel_buckets = 0
184
+ rel_pos = -torch.min(rel_pos, torch.zeros_like(rel_pos))
185
+
186
+ # embeddings for small and large positions
187
+ max_exact = num_buckets // 2
188
+ rel_pos_large = (
189
+ max_exact
190
+ + (
191
+ torch.log(rel_pos.float() / max_exact)
192
+ / math.log(self.max_dist / max_exact)
193
+ * (num_buckets - max_exact)
194
+ ).long()
195
+ )
196
+ rel_pos_large = torch.min(
197
+ rel_pos_large, torch.full_like(rel_pos_large, num_buckets - 1)
198
+ )
199
+ rel_buckets += torch.where(rel_pos < max_exact, rel_pos, rel_pos_large)
200
+ return rel_buckets
201
+
202
+
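A standalone restatement of the bucketing rule above, for intuition (the function below is mine and mirrors `_relative_position_bucket` with the default `num_buckets=32`, `max_dist=128`): small offsets get their own bucket, larger offsets share logarithmically spaced buckets, and the sign of the offset selects one half of the bucket range.

```python
import math

def bucket(rel_pos: int, num_buckets: int = 32, max_dist: int = 128) -> int:
    half = num_buckets // 2                 # bidirectional: split buckets by sign
    out = half if rel_pos > 0 else 0
    rel = abs(rel_pos)
    max_exact = half // 2                   # exact buckets for small offsets
    if rel < max_exact:
        return out + rel
    large = max_exact + int(
        math.log(rel / max_exact) / math.log(max_dist / max_exact)
        * (half - max_exact)
    )
    return out + min(large, half - 1)       # clamp very distant offsets

print([bucket(p) for p in (-200, -20, -3, 0, 3, 20, 200)])
# [15, 10, 3, 0, 19, 26, 31]
```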
203
+ def init_weights(m):
204
+ if isinstance(m, T5LayerNorm):
205
+ nn.init.ones_(m.weight)
206
+ elif isinstance(m, T5FeedForward):
207
+ nn.init.normal_(m.gate[0].weight, std=m.dim**-0.5)
208
+ nn.init.normal_(m.fc1.weight, std=m.dim**-0.5)
209
+ nn.init.normal_(m.fc2.weight, std=m.dim_ffn**-0.5)
210
+ elif isinstance(m, T5Attention):
211
+ nn.init.normal_(m.q.weight, std=(m.dim * m.dim_attn) ** -0.5)
212
+ nn.init.normal_(m.k.weight, std=m.dim**-0.5)
213
+ nn.init.normal_(m.v.weight, std=m.dim**-0.5)
214
+ nn.init.normal_(m.o.weight, std=(m.num_heads * m.dim_attn) ** -0.5)
215
+ elif isinstance(m, T5RelativeEmbedding):
216
+ nn.init.normal_(
217
+ m.embedding.weight, std=(2 * m.num_buckets * m.num_heads) ** -0.5
218
+ )
219
+
220
+
221
+ class WanTextEncoder(torch.nn.Module):
222
+ def __init__(
223
+ self,
224
+ vocab=256384,
225
+ dim=4096,
226
+ dim_attn=4096,
227
+ dim_ffn=10240,
228
+ num_heads=64,
229
+ num_layers=24,
230
+ num_buckets=32,
231
+ shared_pos=False,
232
+ dropout=0.1,
233
+ ):
234
+ super(WanTextEncoder, self).__init__()
235
+ self.dim = dim
236
+ self.dim_attn = dim_attn
237
+ self.dim_ffn = dim_ffn
238
+ self.num_heads = num_heads
239
+ self.num_layers = num_layers
240
+ self.num_buckets = num_buckets
241
+ self.shared_pos = shared_pos
242
+
243
+ # layers
244
+ self.token_embedding = (
245
+ vocab if isinstance(vocab, nn.Embedding) else nn.Embedding(vocab, dim)
246
+ )
247
+ self.pos_embedding = (
248
+ T5RelativeEmbedding(num_buckets, num_heads, bidirectional=True)
249
+ if shared_pos
250
+ else None
251
+ )
252
+ self.dropout = nn.Dropout(dropout)
253
+ self.blocks = nn.ModuleList(
254
+ [
255
+ T5SelfAttention(
256
+ dim, dim_attn, dim_ffn, num_heads, num_buckets, shared_pos, dropout
257
+ )
258
+ for _ in range(num_layers)
259
+ ]
260
+ )
261
+ self.norm = T5LayerNorm(dim)
262
+
263
+ # initialize weights
264
+ self.apply(init_weights)
265
+
266
+ def forward(self, ids, mask=None):
267
+ x = self.token_embedding(ids)
268
+ x = self.dropout(x)
269
+ e = self.pos_embedding(x.size(1), x.size(1)) if self.shared_pos else None
270
+ for block in self.blocks:
271
+ x = block(x, mask, pos_bias=e)
272
+ x = self.norm(x)
273
+ x = self.dropout(x)
274
+ return x
275
+
276
+ @staticmethod
277
+ def state_dict_converter():
278
+ return WanTextEncoderStateDictConverter()
279
+
280
+
281
+ class WanTextEncoderStateDictConverter:
282
+ def __init__(self):
283
+ pass
284
+
285
+ def from_diffusers(self, state_dict):
286
+ return state_dict
287
+
288
+ def from_civitai(self, state_dict):
289
+ return state_dict
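A minimal smoke test of the encoder above, assuming this file is importable as `diffsynth.models.wan_video_text_encoder` (the tiny hyper-parameters are illustrative only; the real configuration is the class defaults):

```python
import torch
from diffsynth.models.wan_video_text_encoder import WanTextEncoder

# A deliberately small encoder so the example runs quickly on CPU.
encoder = WanTextEncoder(
    vocab=1000, dim=64, dim_attn=64, dim_ffn=128,
    num_heads=4, num_layers=2, num_buckets=8,
).eval()

ids = torch.randint(0, 1000, (1, 16))       # [B, L] token ids
mask = torch.ones(1, 16, dtype=torch.long)  # [B, L] attention mask
with torch.no_grad():
    emb = encoder(ids, mask)
print(emb.shape)  # torch.Size([1, 16, 64])
```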
FantasyTalking/diffsynth/models/wan_video_vae.py ADDED
@@ -0,0 +1,948 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from einops import rearrange, repeat
5
+ from tqdm import tqdm
6
+
7
+ CACHE_T = 2
8
+
9
+
10
+ def check_is_instance(model, module_class):
11
+ if isinstance(model, module_class):
12
+ return True
13
+ if hasattr(model, "module") and isinstance(model.module, module_class):
14
+ return True
15
+ return False
16
+
17
+
18
+ def block_causal_mask(x, block_size):
19
+ # params
20
+ b, n, s, _, device = *x.size(), x.device
21
+ assert s % block_size == 0
22
+ num_blocks = s // block_size
23
+
24
+ # build mask
25
+ mask = torch.zeros(b, n, s, s, dtype=torch.bool, device=device)
26
+ for i in range(num_blocks):
27
+ mask[:, :, i * block_size : (i + 1) * block_size, : (i + 1) * block_size] = 1
28
+ return mask
29
+
30
+
31
+ class CausalConv3d(nn.Conv3d):
32
+ """
33
+ Causal 3d convolusion.
34
+ """
35
+
36
+ def __init__(self, *args, **kwargs):
37
+ super().__init__(*args, **kwargs)
38
+ self._padding = (
39
+ self.padding[2],
40
+ self.padding[2],
41
+ self.padding[1],
42
+ self.padding[1],
43
+ 2 * self.padding[0],
44
+ 0,
45
+ )
46
+ self.padding = (0, 0, 0)
47
+
48
+ def forward(self, x, cache_x=None):
49
+ padding = list(self._padding)
50
+ if cache_x is not None and self._padding[4] > 0:
51
+ cache_x = cache_x.to(x.device)
52
+ x = torch.cat([cache_x, x], dim=2)
53
+ padding[4] -= cache_x.shape[2]
54
+ x = F.pad(x, padding)
55
+
56
+ return super().forward(x)
57
+
58
+
59
+ class RMS_norm(nn.Module):
60
+ def __init__(self, dim, channel_first=True, images=True, bias=False):
61
+ super().__init__()
62
+ broadcastable_dims = (1, 1, 1) if not images else (1, 1)
63
+ shape = (dim, *broadcastable_dims) if channel_first else (dim,)
64
+
65
+ self.channel_first = channel_first
66
+ self.scale = dim**0.5
67
+ self.gamma = nn.Parameter(torch.ones(shape))
68
+ self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.0
69
+
70
+ def forward(self, x):
71
+ return (
72
+ F.normalize(x, dim=(1 if self.channel_first else -1))
73
+ * self.scale
74
+ * self.gamma
75
+ + self.bias
76
+ )
77
+
78
+
79
+ class Upsample(nn.Upsample):
80
+ def forward(self, x):
81
+ """
82
+ Fix bfloat16 support for nearest neighbor interpolation.
83
+ """
84
+ return super().forward(x.float()).type_as(x)
85
+
86
+
87
+ class Resample(nn.Module):
88
+ def __init__(self, dim, mode):
89
+ assert mode in (
90
+ "none",
91
+ "upsample2d",
92
+ "upsample3d",
93
+ "downsample2d",
94
+ "downsample3d",
95
+ )
96
+ super().__init__()
97
+ self.dim = dim
98
+ self.mode = mode
99
+
100
+ # layers
101
+ if mode == "upsample2d":
102
+ self.resample = nn.Sequential(
103
+ Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
104
+ nn.Conv2d(dim, dim // 2, 3, padding=1),
105
+ )
106
+ elif mode == "upsample3d":
107
+ self.resample = nn.Sequential(
108
+ Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
109
+ nn.Conv2d(dim, dim // 2, 3, padding=1),
110
+ )
111
+ self.time_conv = CausalConv3d(dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
112
+
113
+ elif mode == "downsample2d":
114
+ self.resample = nn.Sequential(
115
+ nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2))
116
+ )
117
+ elif mode == "downsample3d":
118
+ self.resample = nn.Sequential(
119
+ nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2))
120
+ )
121
+ self.time_conv = CausalConv3d(
122
+ dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0)
123
+ )
124
+
125
+ else:
126
+ self.resample = nn.Identity()
127
+
128
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
129
+ b, c, t, h, w = x.size()
130
+ if self.mode == "upsample3d":
131
+ if feat_cache is not None:
132
+ idx = feat_idx[0]
133
+ if feat_cache[idx] is None:
134
+ feat_cache[idx] = "Rep"
135
+ feat_idx[0] += 1
136
+ else:
137
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
138
+ if (
139
+ cache_x.shape[2] < 2
140
+ and feat_cache[idx] is not None
141
+ and feat_cache[idx] != "Rep"
142
+ ):
143
+ # prepend the last frame of the previous chunk so the cache covers two frames
144
+ cache_x = torch.cat(
145
+ [
146
+ feat_cache[idx][:, :, -1, :, :]
147
+ .unsqueeze(2)
148
+ .to(cache_x.device),
149
+ cache_x,
150
+ ],
151
+ dim=2,
152
+ )
153
+ if (
154
+ cache_x.shape[2] < 2
155
+ and feat_cache[idx] is not None
156
+ and feat_cache[idx] == "Rep"
157
+ ):
158
+ cache_x = torch.cat(
159
+ [torch.zeros_like(cache_x).to(cache_x.device), cache_x],
160
+ dim=2,
161
+ )
162
+ if feat_cache[idx] == "Rep":
163
+ x = self.time_conv(x)
164
+ else:
165
+ x = self.time_conv(x, feat_cache[idx])
166
+ feat_cache[idx] = cache_x
167
+ feat_idx[0] += 1
168
+
169
+ x = x.reshape(b, 2, c, t, h, w)
170
+ x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]), 3)
171
+ x = x.reshape(b, c, t * 2, h, w)
172
+ t = x.shape[2]
173
+ x = rearrange(x, "b c t h w -> (b t) c h w")
174
+ x = self.resample(x)
175
+ x = rearrange(x, "(b t) c h w -> b c t h w", t=t)
176
+
177
+ if self.mode == "downsample3d":
178
+ if feat_cache is not None:
179
+ idx = feat_idx[0]
180
+ if feat_cache[idx] is None:
181
+ feat_cache[idx] = x.clone()
182
+ feat_idx[0] += 1
183
+ else:
184
+ cache_x = x[:, :, -1:, :, :].clone()
185
+ x = self.time_conv(
186
+ torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2)
187
+ )
188
+ feat_cache[idx] = cache_x
189
+ feat_idx[0] += 1
190
+ return x
191
+
192
+ def init_weight(self, conv):
193
+ conv_weight = conv.weight
194
+ nn.init.zeros_(conv_weight)
195
+ c1, c2, t, h, w = conv_weight.size()
196
+ one_matrix = torch.eye(c1, c2)
197
+ init_matrix = one_matrix
198
+ nn.init.zeros_(conv_weight)
199
+ conv_weight.data[:, :, 1, 0, 0] = init_matrix
200
+ conv.weight.data.copy_(conv_weight)
201
+ nn.init.zeros_(conv.bias.data)
202
+
203
+ def init_weight2(self, conv):
204
+ conv_weight = conv.weight.data
205
+ nn.init.zeros_(conv_weight)
206
+ c1, c2, t, h, w = conv_weight.size()
207
+ init_matrix = torch.eye(c1 // 2, c2)
208
+ conv_weight[: c1 // 2, :, -1, 0, 0] = init_matrix
209
+ conv_weight[c1 // 2 :, :, -1, 0, 0] = init_matrix
210
+ conv.weight.data.copy_(conv_weight)
211
+ nn.init.zeros_(conv.bias.data)
212
+
213
+
214
+ class ResidualBlock(nn.Module):
215
+ def __init__(self, in_dim, out_dim, dropout=0.0):
216
+ super().__init__()
217
+ self.in_dim = in_dim
218
+ self.out_dim = out_dim
219
+
220
+ # layers
221
+ self.residual = nn.Sequential(
222
+ RMS_norm(in_dim, images=False),
223
+ nn.SiLU(),
224
+ CausalConv3d(in_dim, out_dim, 3, padding=1),
225
+ RMS_norm(out_dim, images=False),
226
+ nn.SiLU(),
227
+ nn.Dropout(dropout),
228
+ CausalConv3d(out_dim, out_dim, 3, padding=1),
229
+ )
230
+ self.shortcut = (
231
+ CausalConv3d(in_dim, out_dim, 1) if in_dim != out_dim else nn.Identity()
232
+ )
233
+
234
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
235
+ h = self.shortcut(x)
236
+ for layer in self.residual:
237
+ if check_is_instance(layer, CausalConv3d) and feat_cache is not None:
238
+ idx = feat_idx[0]
239
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
240
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
241
+ # prepend the last frame of the previous chunk so the cache covers two frames
242
+ cache_x = torch.cat(
243
+ [
244
+ feat_cache[idx][:, :, -1, :, :]
245
+ .unsqueeze(2)
246
+ .to(cache_x.device),
247
+ cache_x,
248
+ ],
249
+ dim=2,
250
+ )
251
+ x = layer(x, feat_cache[idx])
252
+ feat_cache[idx] = cache_x
253
+ feat_idx[0] += 1
254
+ else:
255
+ x = layer(x)
256
+ return x + h
257
+
258
+
259
+ class AttentionBlock(nn.Module):
260
+ """
261
+ Causal self-attention with a single head.
262
+ """
263
+
264
+ def __init__(self, dim):
265
+ super().__init__()
266
+ self.dim = dim
267
+
268
+ # layers
269
+ self.norm = RMS_norm(dim)
270
+ self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
271
+ self.proj = nn.Conv2d(dim, dim, 1)
272
+
273
+ # zero out the last layer params
274
+ nn.init.zeros_(self.proj.weight)
275
+
276
+ def forward(self, x):
277
+ identity = x
278
+ b, c, t, h, w = x.size()
279
+ x = rearrange(x, "b c t h w -> (b t) c h w")
280
+ x = self.norm(x)
281
+ # compute query, key, value
282
+ q, k, v = (
283
+ self.to_qkv(x)
284
+ .reshape(b * t, 1, c * 3, -1)
285
+ .permute(0, 1, 3, 2)
286
+ .contiguous()
287
+ .chunk(3, dim=-1)
288
+ )
289
+
290
+ # apply attention
291
+ x = F.scaled_dot_product_attention(
292
+ q,
293
+ k,
294
+ v,
295
+ # attn_mask=block_causal_mask(q, block_size=h * w)
296
+ )
297
+ x = x.squeeze(1).permute(0, 2, 1).reshape(b * t, c, h, w)
298
+
299
+ # output
300
+ x = self.proj(x)
301
+ x = rearrange(x, "(b t) c h w-> b c t h w", t=t)
302
+ return x + identity
303
+
304
+
305
+ class Encoder3d(nn.Module):
306
+ def __init__(
307
+ self,
308
+ dim=128,
309
+ z_dim=4,
310
+ dim_mult=[1, 2, 4, 4],
311
+ num_res_blocks=2,
312
+ attn_scales=[],
313
+ temperal_downsample=[True, True, False],
314
+ dropout=0.0,
315
+ ):
316
+ super().__init__()
317
+ self.dim = dim
318
+ self.z_dim = z_dim
319
+ self.dim_mult = dim_mult
320
+ self.num_res_blocks = num_res_blocks
321
+ self.attn_scales = attn_scales
322
+ self.temperal_downsample = temperal_downsample
323
+
324
+ # dimensions
325
+ dims = [dim * u for u in [1] + dim_mult]
326
+ scale = 1.0
327
+
328
+ # init block
329
+ self.conv1 = CausalConv3d(3, dims[0], 3, padding=1)
330
+
331
+ # downsample blocks
332
+ downsamples = []
333
+ for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
334
+ # residual (+attention) blocks
335
+ for _ in range(num_res_blocks):
336
+ downsamples.append(ResidualBlock(in_dim, out_dim, dropout))
337
+ if scale in attn_scales:
338
+ downsamples.append(AttentionBlock(out_dim))
339
+ in_dim = out_dim
340
+
341
+ # downsample block
342
+ if i != len(dim_mult) - 1:
343
+ mode = "downsample3d" if temperal_downsample[i] else "downsample2d"
344
+ downsamples.append(Resample(out_dim, mode=mode))
345
+ scale /= 2.0
346
+ self.downsamples = nn.Sequential(*downsamples)
347
+
348
+ # middle blocks
349
+ self.middle = nn.Sequential(
350
+ ResidualBlock(out_dim, out_dim, dropout),
351
+ AttentionBlock(out_dim),
352
+ ResidualBlock(out_dim, out_dim, dropout),
353
+ )
354
+
355
+ # output blocks
356
+ self.head = nn.Sequential(
357
+ RMS_norm(out_dim, images=False),
358
+ nn.SiLU(),
359
+ CausalConv3d(out_dim, z_dim, 3, padding=1),
360
+ )
361
+
362
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
363
+ if feat_cache is not None:
364
+ idx = feat_idx[0]
365
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
366
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
367
+ # prepend the last frame of the previous chunk so the cache covers two frames
368
+ cache_x = torch.cat(
369
+ [
370
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device),
371
+ cache_x,
372
+ ],
373
+ dim=2,
374
+ )
375
+ x = self.conv1(x, feat_cache[idx])
376
+ feat_cache[idx] = cache_x
377
+ feat_idx[0] += 1
378
+ else:
379
+ x = self.conv1(x)
380
+
381
+ ## downsamples
382
+ for layer in self.downsamples:
383
+ if feat_cache is not None:
384
+ x = layer(x, feat_cache, feat_idx)
385
+ else:
386
+ x = layer(x)
387
+
388
+ ## middle
389
+ for layer in self.middle:
390
+ if check_is_instance(layer, ResidualBlock) and feat_cache is not None:
391
+ x = layer(x, feat_cache, feat_idx)
392
+ else:
393
+ x = layer(x)
394
+
395
+ ## head
396
+ for layer in self.head:
397
+ if check_is_instance(layer, CausalConv3d) and feat_cache is not None:
398
+ idx = feat_idx[0]
399
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
400
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
401
+ # prepend the last frame of the previous chunk so the cache covers two frames
402
+ cache_x = torch.cat(
403
+ [
404
+ feat_cache[idx][:, :, -1, :, :]
405
+ .unsqueeze(2)
406
+ .to(cache_x.device),
407
+ cache_x,
408
+ ],
409
+ dim=2,
410
+ )
411
+ x = layer(x, feat_cache[idx])
412
+ feat_cache[idx] = cache_x
413
+ feat_idx[0] += 1
414
+ else:
415
+ x = layer(x)
416
+ return x
417
+
418
+
419
+ class Decoder3d(nn.Module):
420
+ def __init__(
421
+ self,
422
+ dim=128,
423
+ z_dim=4,
424
+ dim_mult=[1, 2, 4, 4],
425
+ num_res_blocks=2,
426
+ attn_scales=[],
427
+ temperal_upsample=[False, True, True],
428
+ dropout=0.0,
429
+ ):
430
+ super().__init__()
431
+ self.dim = dim
432
+ self.z_dim = z_dim
433
+ self.dim_mult = dim_mult
434
+ self.num_res_blocks = num_res_blocks
435
+ self.attn_scales = attn_scales
436
+ self.temperal_upsample = temperal_upsample
437
+
438
+ # dimensions
439
+ dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
440
+ scale = 1.0 / 2 ** (len(dim_mult) - 2)
441
+
442
+ # init block
443
+ self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1)
444
+
445
+ # middle blocks
446
+ self.middle = nn.Sequential(
447
+ ResidualBlock(dims[0], dims[0], dropout),
448
+ AttentionBlock(dims[0]),
449
+ ResidualBlock(dims[0], dims[0], dropout),
450
+ )
451
+
452
+ # upsample blocks
453
+ upsamples = []
454
+ for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
455
+ # residual (+attention) blocks
456
+ if i == 1 or i == 2 or i == 3:
457
+ in_dim = in_dim // 2
458
+ for _ in range(num_res_blocks + 1):
459
+ upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
460
+ if scale in attn_scales:
461
+ upsamples.append(AttentionBlock(out_dim))
462
+ in_dim = out_dim
463
+
464
+ # upsample block
465
+ if i != len(dim_mult) - 1:
466
+ mode = "upsample3d" if temperal_upsample[i] else "upsample2d"
467
+ upsamples.append(Resample(out_dim, mode=mode))
468
+ scale *= 2.0
469
+ self.upsamples = nn.Sequential(*upsamples)
470
+
471
+ # output blocks
472
+ self.head = nn.Sequential(
473
+ RMS_norm(out_dim, images=False),
474
+ nn.SiLU(),
475
+ CausalConv3d(out_dim, 3, 3, padding=1),
476
+ )
477
+
478
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
479
+ ## conv1
480
+ if feat_cache is not None:
481
+ idx = feat_idx[0]
482
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
483
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
484
+ # prepend the last frame of the previous chunk so the cache covers two frames
485
+ cache_x = torch.cat(
486
+ [
487
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device),
488
+ cache_x,
489
+ ],
490
+ dim=2,
491
+ )
492
+ x = self.conv1(x, feat_cache[idx])
493
+ feat_cache[idx] = cache_x
494
+ feat_idx[0] += 1
495
+ else:
496
+ x = self.conv1(x)
497
+
498
+ ## middle
499
+ for layer in self.middle:
500
+ if check_is_instance(layer, ResidualBlock) and feat_cache is not None:
501
+ x = layer(x, feat_cache, feat_idx)
502
+ else:
503
+ x = layer(x)
504
+
505
+ ## upsamples
506
+ for layer in self.upsamples:
507
+ if feat_cache is not None:
508
+ x = layer(x, feat_cache, feat_idx)
509
+ else:
510
+ x = layer(x)
511
+
512
+ ## head
513
+ for layer in self.head:
514
+ if check_is_instance(layer, CausalConv3d) and feat_cache is not None:
515
+ idx = feat_idx[0]
516
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
517
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
518
+ # prepend the last frame of the previous chunk so the cache covers two frames
519
+ cache_x = torch.cat(
520
+ [
521
+ feat_cache[idx][:, :, -1, :, :]
522
+ .unsqueeze(2)
523
+ .to(cache_x.device),
524
+ cache_x,
525
+ ],
526
+ dim=2,
527
+ )
528
+ x = layer(x, feat_cache[idx])
529
+ feat_cache[idx] = cache_x
530
+ feat_idx[0] += 1
531
+ else:
532
+ x = layer(x)
533
+ return x
534
+
535
+
536
+ def count_conv3d(model):
537
+ count = 0
538
+ for m in model.modules():
539
+ if check_is_instance(m, CausalConv3d):
540
+ count += 1
541
+ return count
542
+
543
+
544
+ class VideoVAE_(nn.Module):
545
+ def __init__(
546
+ self,
547
+ dim=96,
548
+ z_dim=16,
549
+ dim_mult=[1, 2, 4, 4],
550
+ num_res_blocks=2,
551
+ attn_scales=[],
552
+ temperal_downsample=[False, True, True],
553
+ dropout=0.0,
554
+ ):
555
+ super().__init__()
556
+ self.dim = dim
557
+ self.z_dim = z_dim
558
+ self.dim_mult = dim_mult
559
+ self.num_res_blocks = num_res_blocks
560
+ self.attn_scales = attn_scales
561
+ self.temperal_downsample = temperal_downsample
562
+ self.temperal_upsample = temperal_downsample[::-1]
563
+
564
+ # modules
565
+ self.encoder = Encoder3d(
566
+ dim,
567
+ z_dim * 2,
568
+ dim_mult,
569
+ num_res_blocks,
570
+ attn_scales,
571
+ self.temperal_downsample,
572
+ dropout,
573
+ )
574
+ self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1)
575
+ self.conv2 = CausalConv3d(z_dim, z_dim, 1)
576
+ self.decoder = Decoder3d(
577
+ dim,
578
+ z_dim,
579
+ dim_mult,
580
+ num_res_blocks,
581
+ attn_scales,
582
+ self.temperal_upsample,
583
+ dropout,
584
+ )
585
+
586
+ def forward(self, x):
587
+ mu, log_var = self.encode(x)
588
+ z = self.reparameterize(mu, log_var)
589
+ x_recon = self.decode(z)
590
+ return x_recon, mu, log_var
591
+
592
+ def encode(self, x, scale): # x: B, C, T, H, W
593
+ self.clear_cache()
594
+ ## cache
595
+ t = x.shape[2]
596
+ iter_ = 1 + (t - 1) // 4
597
+
598
+ for i in range(iter_):
599
+ self._enc_conv_idx = [0]
600
+ if i == 0:
601
+ out = self.encoder(
602
+ x[:, :, :1, :, :],
603
+ feat_cache=self._enc_feat_map,
604
+ feat_idx=self._enc_conv_idx,
605
+ )
606
+ else:
607
+ out_ = self.encoder(
608
+ x[:, :, 1 + 4 * (i - 1) : 1 + 4 * i, :, :],
609
+ feat_cache=self._enc_feat_map,
610
+ feat_idx=self._enc_conv_idx,
611
+ )
612
+ out = torch.cat([out, out_], 2)
613
+ mu, log_var = self.conv1(out).chunk(2, dim=1)
614
+ if isinstance(scale[0], torch.Tensor):
615
+ scale = [s.to(dtype=mu.dtype, device=mu.device) for s in scale]
616
+ mu = (mu - scale[0].view(1, self.z_dim, 1, 1, 1)) * scale[1].view(
617
+ 1, self.z_dim, 1, 1, 1
618
+ )
619
+ else:
620
+ scale = scale.to(dtype=mu.dtype, device=mu.device)
621
+ mu = (mu - scale[0]) * scale[1]
622
+ return mu
623
+
624
+ def decode(self, z, scale):
625
+ self.clear_cache()
626
+ # z: [b,c,t,h,w]
627
+ if isinstance(scale[0], torch.Tensor):
628
+ scale = [s.to(dtype=z.dtype, device=z.device) for s in scale]
629
+ z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(
630
+ 1, self.z_dim, 1, 1, 1
631
+ )
632
+ else:
633
+ scale = scale.to(dtype=z.dtype, device=z.device)
634
+ z = z / scale[1] + scale[0]
635
+ iter_ = z.shape[2]
636
+ x = self.conv2(z)
637
+ for i in range(iter_):
638
+ self._conv_idx = [0]
639
+ if i == 0:
640
+ out = self.decoder(
641
+ x[:, :, i : i + 1, :, :],
642
+ feat_cache=self._feat_map,
643
+ feat_idx=self._conv_idx,
644
+ )
645
+ else:
646
+ out_ = self.decoder(
647
+ x[:, :, i : i + 1, :, :],
648
+ feat_cache=self._feat_map,
649
+ feat_idx=self._conv_idx,
650
+ )
651
+ out = torch.cat([out, out_], 2) # may add tensor offload
652
+ return out
653
+
654
+ def reparameterize(self, mu, log_var):
655
+ std = torch.exp(0.5 * log_var)
656
+ eps = torch.randn_like(std)
657
+ return eps * std + mu
658
+
659
+ def sample(self, imgs, deterministic=False):
660
+ mu, log_var = self.encode(imgs)
661
+ if deterministic:
662
+ return mu
663
+ std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0))
664
+ return mu + std * torch.randn_like(std)
665
+
666
+ def clear_cache(self):
667
+ self._conv_num = count_conv3d(self.decoder)
668
+ self._conv_idx = [0]
669
+ self._feat_map = [None] * self._conv_num
670
+ # cache encode
671
+ self._enc_conv_num = count_conv3d(self.encoder)
672
+ self._enc_conv_idx = [0]
673
+ self._enc_feat_map = [None] * self._enc_conv_num
674
+
675
+
676
+ class WanVideoVAE(nn.Module):
677
+ def __init__(self, z_dim=16):
678
+ super().__init__()
679
+
680
+ mean = [
681
+ -0.7571,
682
+ -0.7089,
683
+ -0.9113,
684
+ 0.1075,
685
+ -0.1745,
686
+ 0.9653,
687
+ -0.1517,
688
+ 1.5508,
689
+ 0.4134,
690
+ -0.0715,
691
+ 0.5517,
692
+ -0.3632,
693
+ -0.1922,
694
+ -0.9497,
695
+ 0.2503,
696
+ -0.2921,
697
+ ]
698
+ std = [
699
+ 2.8184,
700
+ 1.4541,
701
+ 2.3275,
702
+ 2.6558,
703
+ 1.2196,
704
+ 1.7708,
705
+ 2.6052,
706
+ 2.0743,
707
+ 3.2687,
708
+ 2.1526,
709
+ 2.8652,
710
+ 1.5579,
711
+ 1.6382,
712
+ 1.1253,
713
+ 2.8251,
714
+ 1.9160,
715
+ ]
716
+ self.mean = torch.tensor(mean)
717
+ self.std = torch.tensor(std)
718
+ self.scale = [self.mean, 1.0 / self.std]
719
+
720
+ # init model
721
+ self.model = VideoVAE_(z_dim=z_dim).eval().requires_grad_(False)
722
+ self.upsampling_factor = 8
723
+
724
+ def build_1d_mask(self, length, left_bound, right_bound, border_width):
725
+ x = torch.ones((length,))
726
+ if not left_bound:
727
+ x[:border_width] = (torch.arange(border_width) + 1) / border_width
728
+ if not right_bound:
729
+ x[-border_width:] = torch.flip(
730
+ (torch.arange(border_width) + 1) / border_width, dims=(0,)
731
+ )
732
+ return x
733
+
734
+ def build_mask(self, data, is_bound, border_width):
735
+ _, _, _, H, W = data.shape
736
+ h = self.build_1d_mask(H, is_bound[0], is_bound[1], border_width[0])
737
+ w = self.build_1d_mask(W, is_bound[2], is_bound[3], border_width[1])
738
+
739
+ h = repeat(h, "H -> H W", H=H, W=W)
740
+ w = repeat(w, "W -> H W", H=H, W=W)
741
+
742
+ mask = torch.stack([h, w]).min(dim=0).values
743
+ mask = rearrange(mask, "H W -> 1 1 1 H W")
744
+ return mask
745
+
746
+ def tiled_decode(self, hidden_states, device, tile_size, tile_stride):
747
+ _, _, T, H, W = hidden_states.shape
748
+ size_h, size_w = tile_size
749
+ stride_h, stride_w = tile_stride
750
+
751
+ # Split tasks
752
+ tasks = []
753
+ for h in range(0, H, stride_h):
754
+ if h - stride_h >= 0 and h - stride_h + size_h >= H:
755
+ continue
756
+ for w in range(0, W, stride_w):
757
+ if w - stride_w >= 0 and w - stride_w + size_w >= W:
758
+ continue
759
+ h_, w_ = h + size_h, w + size_w
760
+ tasks.append((h, h_, w, w_))
761
+
762
+ data_device = "cpu"
763
+ computation_device = device
764
+
765
+ out_T = T * 4 - 3
766
+ weight = torch.zeros(
767
+ (1, 1, out_T, H * self.upsampling_factor, W * self.upsampling_factor),
768
+ dtype=hidden_states.dtype,
769
+ device=data_device,
770
+ )
771
+ values = torch.zeros(
772
+ (1, 3, out_T, H * self.upsampling_factor, W * self.upsampling_factor),
773
+ dtype=hidden_states.dtype,
774
+ device=data_device,
775
+ )
776
+
777
+ for h, h_, w, w_ in tqdm(tasks, desc="VAE decoding"):
778
+ hidden_states_batch = hidden_states[:, :, :, h:h_, w:w_].to(
779
+ computation_device
780
+ )
781
+ hidden_states_batch = self.model.decode(hidden_states_batch, self.scale).to(
782
+ data_device
783
+ )
784
+
785
+ mask = self.build_mask(
786
+ hidden_states_batch,
787
+ is_bound=(h == 0, h_ >= H, w == 0, w_ >= W),
788
+ border_width=(
789
+ (size_h - stride_h) * self.upsampling_factor,
790
+ (size_w - stride_w) * self.upsampling_factor,
791
+ ),
792
+ ).to(dtype=hidden_states.dtype, device=data_device)
793
+
794
+ target_h = h * self.upsampling_factor
795
+ target_w = w * self.upsampling_factor
796
+ values[
797
+ :,
798
+ :,
799
+ :,
800
+ target_h : target_h + hidden_states_batch.shape[3],
801
+ target_w : target_w + hidden_states_batch.shape[4],
802
+ ] += (
803
+ hidden_states_batch * mask
804
+ )
805
+ weight[
806
+ :,
807
+ :,
808
+ :,
809
+ target_h : target_h + hidden_states_batch.shape[3],
810
+ target_w : target_w + hidden_states_batch.shape[4],
811
+ ] += mask
812
+ values = values / weight
813
+ values = values.float().clamp_(-1, 1)
814
+ return values
815
+
816
+ def tiled_encode(self, video, device, tile_size, tile_stride):
817
+ _, _, T, H, W = video.shape
818
+ size_h, size_w = tile_size
819
+ stride_h, stride_w = tile_stride
820
+
821
+ # Split tasks
822
+ tasks = []
823
+ for h in range(0, H, stride_h):
824
+ if h - stride_h >= 0 and h - stride_h + size_h >= H:
825
+ continue
826
+ for w in range(0, W, stride_w):
827
+ if w - stride_w >= 0 and w - stride_w + size_w >= W:
828
+ continue
829
+ h_, w_ = h + size_h, w + size_w
830
+ tasks.append((h, h_, w, w_))
831
+
832
+ data_device = "cpu"
833
+ computation_device = device
834
+
835
+ out_T = (T + 3) // 4
836
+ weight = torch.zeros(
837
+ (1, 1, out_T, H // self.upsampling_factor, W // self.upsampling_factor),
838
+ dtype=video.dtype,
839
+ device=data_device,
840
+ )
841
+ values = torch.zeros(
842
+ (1, 16, out_T, H // self.upsampling_factor, W // self.upsampling_factor),
843
+ dtype=video.dtype,
844
+ device=data_device,
845
+ )
846
+
847
+ for h, h_, w, w_ in tqdm(tasks, desc="VAE encoding"):
848
+ hidden_states_batch = video[:, :, :, h:h_, w:w_].to(computation_device)
849
+ hidden_states_batch = self.model.encode(hidden_states_batch, self.scale).to(
850
+ data_device
851
+ )
852
+
853
+ mask = self.build_mask(
854
+ hidden_states_batch,
855
+ is_bound=(h == 0, h_ >= H, w == 0, w_ >= W),
856
+ border_width=(
857
+ (size_h - stride_h) // self.upsampling_factor,
858
+ (size_w - stride_w) // self.upsampling_factor,
859
+ ),
860
+ ).to(dtype=video.dtype, device=data_device)
861
+
862
+ target_h = h // self.upsampling_factor
863
+ target_w = w // self.upsampling_factor
864
+ values[
865
+ :,
866
+ :,
867
+ :,
868
+ target_h : target_h + hidden_states_batch.shape[3],
869
+ target_w : target_w + hidden_states_batch.shape[4],
870
+ ] += (
871
+ hidden_states_batch * mask
872
+ )
873
+ weight[
874
+ :,
875
+ :,
876
+ :,
877
+ target_h : target_h + hidden_states_batch.shape[3],
878
+ target_w : target_w + hidden_states_batch.shape[4],
879
+ ] += mask
880
+ values = values / weight
881
+ values = values.float()
882
+ return values
883
+
884
+ def single_encode(self, video, device):
885
+ video = video.to(device)
886
+ x = self.model.encode(video, self.scale)
887
+ return x.float()
888
+
889
+ def single_decode(self, hidden_state, device):
890
+ hidden_state = hidden_state.to(device)
891
+ video = self.model.decode(hidden_state, self.scale)
892
+ return video.float().clamp_(-1, 1)
893
+
894
+ def encode(
895
+ self, videos, device, tiled=False, tile_size=(34, 34), tile_stride=(18, 16)
896
+ ):
897
+ videos = [video.to("cpu") for video in videos]
898
+ hidden_states = []
899
+ for video in videos:
900
+ video = video.unsqueeze(0)
901
+ if tiled:
902
+ tile_size = (tile_size[0] * 8, tile_size[1] * 8)
903
+ tile_stride = (tile_stride[0] * 8, tile_stride[1] * 8)
904
+ hidden_state = self.tiled_encode(video, device, tile_size, tile_stride)
905
+ else:
906
+ hidden_state = self.single_encode(video, device)
907
+ hidden_state = hidden_state.squeeze(0)
908
+ hidden_states.append(hidden_state)
909
+ hidden_states = torch.stack(hidden_states)
910
+ return hidden_states
911
+
912
+ def decode(
913
+ self,
914
+ hidden_states,
915
+ device,
916
+ tiled=False,
917
+ tile_size=(34, 34),
918
+ tile_stride=(18, 16),
919
+ ):
920
+ hidden_states = [hidden_state.to("cpu") for hidden_state in hidden_states]
921
+ videos = []
922
+ for hidden_state in hidden_states:
923
+ hidden_state = hidden_state.unsqueeze(0)
924
+ if tiled:
925
+ video = self.tiled_decode(hidden_state, device, tile_size, tile_stride)
926
+ else:
927
+ video = self.single_decode(hidden_state, device)
928
+ video = video.squeeze(0)
929
+ videos.append(video)
930
+ videos = torch.stack(videos)
931
+ return videos
932
+
933
+ @staticmethod
934
+ def state_dict_converter():
935
+ return WanVideoVAEStateDictConverter()
936
+
937
+
938
+ class WanVideoVAEStateDictConverter:
939
+ def __init__(self):
940
+ pass
941
+
942
+ def from_civitai(self, state_dict):
943
+ state_dict_ = {}
944
+ if "model_state" in state_dict:
945
+ state_dict = state_dict["model_state"]
946
+ for name in state_dict:
947
+ state_dict_["model." + name] = state_dict[name]
948
+ return state_dict_
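A standalone illustration of the feathered blending used by `tiled_encode`/`tiled_decode` above (the 1-D helper mirrors `build_1d_mask`; the signal and tile layout are made up): each tile is weighted by a ramp near interior borders, accumulated into `values`, and normalized by the accumulated `weight`, so overlapping tiles cross-fade instead of leaving seams.

```python
import torch

def ramp_mask(length, left_bound, right_bound, border_width):
    x = torch.ones(length)
    if not left_bound:
        x[:border_width] = (torch.arange(border_width) + 1) / border_width
    if not right_bound:
        x[-border_width:] = ((torch.arange(border_width) + 1) / border_width).flip(0)
    return x

full = torch.arange(12, dtype=torch.float32)   # the "true" 1-D signal
values = torch.zeros(12)
weight = torch.zeros(12)
for start, end in [(0, 8), (4, 12)]:           # two overlapping tiles
    mask = ramp_mask(end - start, left_bound=(start == 0),
                     right_bound=(end == 12), border_width=4)
    values[start:end] += full[start:end] * mask
    weight[start:end] += mask

print(torch.allclose(values / weight, full))   # True: no visible tile seam
```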
FantasyTalking/diffsynth/pipelines/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .wan_video import WanVideoPipeline
FantasyTalking/diffsynth/pipelines/base.py ADDED
@@ -0,0 +1,173 @@
1
+ import numpy as np
2
+ import torch
3
+ from PIL import Image
4
+ from torchvision.transforms import GaussianBlur
5
+
6
+
7
+ class BasePipeline(torch.nn.Module):
8
+ def __init__(
9
+ self,
10
+ device="cuda",
11
+ torch_dtype=torch.float16,
12
+ height_division_factor=64,
13
+ width_division_factor=64,
14
+ ):
15
+ super().__init__()
16
+ self.device = device
17
+ self.torch_dtype = torch_dtype
18
+ self.height_division_factor = height_division_factor
19
+ self.width_division_factor = width_division_factor
20
+ self.cpu_offload = False
21
+ self.model_names = []
22
+
23
+ def check_resize_height_width(self, height, width):
24
+ if height % self.height_division_factor != 0:
25
+ height = (
26
+ (height + self.height_division_factor - 1)
27
+ // self.height_division_factor
28
+ * self.height_division_factor
29
+ )
30
+ print(
31
+ f"The height cannot be evenly divided by {self.height_division_factor}. We round it up to {height}."
32
+ )
33
+ if width % self.width_division_factor != 0:
34
+ width = (
35
+ (width + self.width_division_factor - 1)
36
+ // self.width_division_factor
37
+ * self.width_division_factor
38
+ )
39
+ print(
40
+ f"The width cannot be evenly divided by {self.width_division_factor}. We round it up to {width}."
41
+ )
42
+ return height, width
43
+
44
+ def preprocess_image(self, image):
45
+ image = (
46
+ torch.Tensor(np.array(image, dtype=np.float32) * (2 / 255) - 1)
47
+ .permute(2, 0, 1)
48
+ .unsqueeze(0)
49
+ )
50
+ return image
51
+
52
+ def preprocess_images(self, images):
53
+ return [self.preprocess_image(image) for image in images]
54
+
55
+ def vae_output_to_image(self, vae_output):
56
+ image = vae_output[0].cpu().float().permute(1, 2, 0).numpy()
57
+ image = Image.fromarray(((image / 2 + 0.5).clip(0, 1) * 255).astype("uint8"))
58
+ return image
59
+
60
+ def vae_output_to_video(self, vae_output):
61
+ video = vae_output.cpu().permute(1, 2, 0).numpy()
62
+ video = [
63
+ Image.fromarray(((image / 2 + 0.5).clip(0, 1) * 255).astype("uint8"))
64
+ for image in video
65
+ ]
66
+ return video
67
+
68
+ def merge_latents(
69
+ self, value, latents, masks, scales, blur_kernel_size=33, blur_sigma=10.0
70
+ ):
71
+ if len(latents) > 0:
72
+ blur = GaussianBlur(kernel_size=blur_kernel_size, sigma=blur_sigma)
73
+ height, width = value.shape[-2:]
74
+ weight = torch.ones_like(value)
75
+ for latent, mask, scale in zip(latents, masks, scales):
76
+ mask = (
77
+ self.preprocess_image(mask.resize((width, height))).mean(
78
+ dim=1, keepdim=True
79
+ )
80
+ > 0
81
+ )
82
+ mask = mask.repeat(1, latent.shape[1], 1, 1).to(
83
+ dtype=latent.dtype, device=latent.device
84
+ )
85
+ mask = blur(mask)
86
+ value += latent * mask * scale
87
+ weight += mask * scale
88
+ value /= weight
89
+ return value
90
+
91
+ def control_noise_via_local_prompts(
92
+ self,
93
+ prompt_emb_global,
94
+ prompt_emb_locals,
95
+ masks,
96
+ mask_scales,
97
+ inference_callback,
98
+ special_kwargs=None,
99
+ special_local_kwargs_list=None,
100
+ ):
101
+ if special_kwargs is None:
102
+ noise_pred_global = inference_callback(prompt_emb_global)
103
+ else:
104
+ noise_pred_global = inference_callback(prompt_emb_global, special_kwargs)
105
+ if special_local_kwargs_list is None:
106
+ noise_pred_locals = [
107
+ inference_callback(prompt_emb_local)
108
+ for prompt_emb_local in prompt_emb_locals
109
+ ]
110
+ else:
111
+ noise_pred_locals = [
112
+ inference_callback(prompt_emb_local, special_kwargs)
113
+ for prompt_emb_local, special_kwargs in zip(
114
+ prompt_emb_locals, special_local_kwargs_list
115
+ )
116
+ ]
117
+ noise_pred = self.merge_latents(
118
+ noise_pred_global, noise_pred_locals, masks, mask_scales
119
+ )
120
+ return noise_pred
121
+
122
+ def extend_prompt(self, prompt, local_prompts, masks, mask_scales):
123
+ local_prompts = local_prompts or []
124
+ masks = masks or []
125
+ mask_scales = mask_scales or []
126
+ extended_prompt_dict = self.prompter.extend_prompt(prompt)
127
+ prompt = extended_prompt_dict.get("prompt", prompt)
128
+ local_prompts += extended_prompt_dict.get("prompts", [])
129
+ masks += extended_prompt_dict.get("masks", [])
130
+ mask_scales += [100.0] * len(extended_prompt_dict.get("masks", []))
131
+ return prompt, local_prompts, masks, mask_scales
132
+
133
+ def enable_cpu_offload(self):
134
+ self.cpu_offload = True
135
+
136
+ def load_models_to_device(self, loadmodel_names=[]):
137
+ # only load models to device if cpu_offload is enabled
138
+ if not self.cpu_offload:
139
+ return
140
+ # offload the unneeded models to cpu
141
+ for model_name in self.model_names:
142
+ if model_name not in loadmodel_names:
143
+ model = getattr(self, model_name)
144
+ if model is not None:
145
+ if (
146
+ hasattr(model, "vram_management_enabled")
147
+ and model.vram_management_enabled
148
+ ):
149
+ for module in model.modules():
150
+ if hasattr(module, "offload"):
151
+ module.offload()
152
+ else:
153
+ model.cpu()
154
+ # load the needed models to device
155
+ for model_name in loadmodel_names:
156
+ model = getattr(self, model_name)
157
+ if model is not None:
158
+ if (
159
+ hasattr(model, "vram_management_enabled")
160
+ and model.vram_management_enabled
161
+ ):
162
+ for module in model.modules():
163
+ if hasattr(module, "onload"):
164
+ module.onload()
165
+ else:
166
+ model.to(self.device)
167
+ # fresh the cuda cache
168
+ torch.cuda.empty_cache()
169
+
170
+ def generate_noise(self, shape, seed=None, device="cpu", dtype=torch.float16):
171
+ generator = None if seed is None else torch.Generator(device).manual_seed(seed)
172
+ noise = torch.randn(shape, generator=generator, device=device, dtype=dtype)
173
+ return noise
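A standalone restatement of the rounding performed by `check_resize_height_width` above: sizes are rounded up to the next multiple of the division factor (64 by default here, 16 for the Wan pipeline, which overrides `height_division_factor`/`width_division_factor`).

```python
def round_up(size: int, factor: int) -> int:
    # Round `size` up to the nearest multiple of `factor`.
    return (size + factor - 1) // factor * factor

print(round_up(480, 16), round_up(481, 16), round_up(720, 64))  # 480 496 768
```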
FantasyTalking/diffsynth/pipelines/wan_video.py ADDED
@@ -0,0 +1,389 @@
1
+ import os
2
+
3
+ import numpy as np
4
+ import torch
5
+ from einops import rearrange
6
+ from PIL import Image
7
+ from tqdm import tqdm
8
+
9
+ from ..models import ModelManager
10
+ from ..models.wan_video_dit import WanLayerNorm, WanModel, WanRMSNorm
11
+ from ..models.wan_video_image_encoder import WanImageEncoder
12
+ from ..models.wan_video_text_encoder import (T5LayerNorm, T5RelativeEmbedding,
13
+ WanTextEncoder)
14
+ from ..models.wan_video_vae import (CausalConv3d, RMS_norm, Upsample,
15
+ WanVideoVAE)
16
+ from ..prompters import WanPrompter
17
+ from ..schedulers.flow_match import FlowMatchScheduler
18
+ from ..vram_management import (AutoWrappedLinear, AutoWrappedModule,
19
+ enable_vram_management)
20
+ from .base import BasePipeline
21
+
22
+
23
+ class WanVideoPipeline(BasePipeline):
24
+ def __init__(self, device="cuda", torch_dtype=torch.float16, tokenizer_path=None):
25
+ super().__init__(device=device, torch_dtype=torch_dtype)
26
+ self.scheduler = FlowMatchScheduler(shift=5, sigma_min=0.0, extra_one_step=True)
27
+ self.prompter = WanPrompter(tokenizer_path=tokenizer_path)
28
+ self.text_encoder: WanTextEncoder = None
29
+ self.image_encoder: WanImageEncoder = None
30
+ self.dit: WanModel = None
31
+ self.vae: WanVideoVAE = None
32
+ self.model_names = ["text_encoder", "dit", "vae"]
33
+ self.height_division_factor = 16
34
+ self.width_division_factor = 16
35
+
36
+ def enable_vram_management(self, num_persistent_param_in_dit=None):
37
+ dtype = next(iter(self.text_encoder.parameters())).dtype
38
+ enable_vram_management(
39
+ self.text_encoder,
40
+ module_map={
41
+ torch.nn.Linear: AutoWrappedLinear,
42
+ torch.nn.Embedding: AutoWrappedModule,
43
+ T5RelativeEmbedding: AutoWrappedModule,
44
+ T5LayerNorm: AutoWrappedModule,
45
+ },
46
+ module_config=dict(
47
+ offload_dtype=dtype,
48
+ offload_device="cpu",
49
+ onload_dtype=dtype,
50
+ onload_device="cpu",
51
+ computation_dtype=self.torch_dtype,
52
+ computation_device=self.device,
53
+ ),
54
+ )
55
+ dtype = next(iter(self.dit.parameters())).dtype
56
+ enable_vram_management(
57
+ self.dit,
58
+ module_map={
59
+ torch.nn.Linear: AutoWrappedLinear,
60
+ torch.nn.Conv3d: AutoWrappedModule,
61
+ torch.nn.LayerNorm: AutoWrappedModule,
62
+ WanLayerNorm: AutoWrappedModule,
63
+ WanRMSNorm: AutoWrappedModule,
64
+ },
65
+ module_config=dict(
66
+ offload_dtype=dtype,
67
+ offload_device="cpu",
68
+ onload_dtype=dtype,
69
+ onload_device=self.device,
70
+ computation_dtype=self.torch_dtype,
71
+ computation_device=self.device,
72
+ ),
73
+ max_num_param=num_persistent_param_in_dit,
74
+ overflow_module_config=dict(
75
+ offload_dtype=dtype,
76
+ offload_device="cpu",
77
+ onload_dtype=dtype,
78
+ onload_device="cpu",
79
+ computation_dtype=self.torch_dtype,
80
+ computation_device=self.device,
81
+ ),
82
+ )
83
+ dtype = next(iter(self.vae.parameters())).dtype
84
+ enable_vram_management(
85
+ self.vae,
86
+ module_map={
87
+ torch.nn.Linear: AutoWrappedLinear,
88
+ torch.nn.Conv2d: AutoWrappedModule,
89
+ RMS_norm: AutoWrappedModule,
90
+ CausalConv3d: AutoWrappedModule,
91
+ Upsample: AutoWrappedModule,
92
+ torch.nn.SiLU: AutoWrappedModule,
93
+ torch.nn.Dropout: AutoWrappedModule,
94
+ },
95
+ module_config=dict(
96
+ offload_dtype=dtype,
97
+ offload_device="cpu",
98
+ onload_dtype=dtype,
99
+ onload_device=self.device,
100
+ computation_dtype=self.torch_dtype,
101
+ computation_device=self.device,
102
+ ),
103
+ )
104
+ if self.image_encoder is not None:
105
+ dtype = next(iter(self.image_encoder.parameters())).dtype
106
+ enable_vram_management(
107
+ self.image_encoder,
108
+ module_map={
109
+ torch.nn.Linear: AutoWrappedLinear,
110
+ torch.nn.Conv2d: AutoWrappedModule,
111
+ torch.nn.LayerNorm: AutoWrappedModule,
112
+ },
113
+ module_config=dict(
114
+ offload_dtype=dtype,
115
+ offload_device="cpu",
116
+ onload_dtype=dtype,
117
+ onload_device="cpu",
118
+ computation_dtype=self.torch_dtype,
119
+ computation_device=self.device,
120
+ ),
121
+ )
122
+ self.enable_cpu_offload()
123
+
124
+ def fetch_models(self, model_manager: ModelManager):
125
+ text_encoder_model_and_path = model_manager.fetch_model(
126
+ "wan_video_text_encoder", require_model_path=True
127
+ )
128
+ if text_encoder_model_and_path is not None:
129
+ self.text_encoder, tokenizer_path = text_encoder_model_and_path
130
+ self.prompter.fetch_models(self.text_encoder)
131
+ self.prompter.fetch_tokenizer(
132
+ os.path.join(os.path.dirname(tokenizer_path), "google/umt5-xxl")
133
+ )
134
+ self.dit = model_manager.fetch_model("wan_video_dit")
135
+ self.vae = model_manager.fetch_model("wan_video_vae")
136
+ self.image_encoder = model_manager.fetch_model("wan_video_image_encoder")
137
+
138
+ @staticmethod
139
+ def from_model_manager(model_manager: ModelManager, torch_dtype=None, device=None):
140
+ if device is None:
141
+ device = model_manager.device
142
+ if torch_dtype is None:
143
+ torch_dtype = model_manager.torch_dtype
144
+ pipe = WanVideoPipeline(device=device, torch_dtype=torch_dtype)
145
+ pipe.fetch_models(model_manager)
146
+ return pipe
147
+
148
+ def denoising_model(self):
149
+ return self.dit
150
+
151
+ def encode_prompt(self, prompt, positive=True):
152
+ prompt_emb = self.prompter.encode_prompt(prompt, positive=positive)
153
+ return {"context": prompt_emb}
154
+
155
+ def encode_image(self, image, num_frames, height, width):
156
+ with torch.amp.autocast(
157
+ dtype=torch.bfloat16, device_type=torch.device(self.device).type
158
+ ):
159
+ image = self.preprocess_image(image.resize((width, height))).to(self.device)
160
+ clip_context = self.image_encoder.encode_image([image])
161
+ msk = torch.ones(1, num_frames, height // 8, width // 8, device=self.device)
162
+ msk[:, 1:] = 0
163
+ msk = torch.concat(
164
+ [torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]],
165
+ dim=1,
166
+ )
167
+ msk = msk.view(1, msk.shape[1] // 4, 4, height // 8, width // 8)
168
+ msk = msk.transpose(1, 2)[0]
169
+ y = self.vae.encode(
170
+ [
171
+ torch.concat(
172
+ [
173
+ image.transpose(0, 1),
174
+ torch.zeros(3, num_frames - 1, height, width).to(
175
+ image.device
176
+ ),
177
+ ],
178
+ dim=1,
179
+ )
180
+ ],
181
+ device=self.device,
182
+ )[0]
183
+ y = torch.concat([msk, y])
184
+ return {"clip_fea": clip_context, "y": [y]}
185
+
186
+ def tensor2video(self, frames):
187
+ frames = rearrange(frames, "C T H W -> T H W C")
188
+ frames = (
189
+ ((frames.float() + 1) * 127.5).clip(0, 255).cpu().numpy().astype(np.uint8)
190
+ )
191
+ frames = [Image.fromarray(frame) for frame in frames]
192
+ return frames
193
+
194
+ def prepare_extra_input(self, latents=None):
195
+ return {"seq_len": latents.shape[2] * latents.shape[3] * latents.shape[4] // 4}
196
+
197
+ def encode_video(
198
+ self, input_video, tiled=True, tile_size=(34, 34), tile_stride=(18, 16)
199
+ ):
200
+ with torch.amp.autocast(
201
+ dtype=torch.bfloat16, device_type=torch.device(self.device).type
202
+ ):
203
+ latents = self.vae.encode(
204
+ input_video,
205
+ device=self.device,
206
+ tiled=tiled,
207
+ tile_size=tile_size,
208
+ tile_stride=tile_stride,
209
+ )
210
+ return latents
211
+
212
+ def decode_video(
213
+ self, latents, tiled=True, tile_size=(34, 34), tile_stride=(18, 16)
214
+ ):
215
+ with torch.amp.autocast(
216
+ dtype=torch.bfloat16, device_type=torch.device(self.device).type
217
+ ):
218
+ frames = self.vae.decode(
219
+ latents,
220
+ device=self.device,
221
+ tiled=tiled,
222
+ tile_size=tile_size,
223
+ tile_stride=tile_stride,
224
+ )
225
+ return frames
226
+
227
+ def set_ip(self, local_path):
228
+ pass
229
+
230
+ @torch.no_grad()
231
+ def __call__(
232
+ self,
233
+ prompt,
234
+ negative_prompt="",
235
+ input_image=None,
236
+ input_video=None,
237
+ denoising_strength=1.0,
238
+ seed=None,
239
+ rand_device="cpu",
240
+ height=480,
241
+ width=832,
242
+ num_frames=81,
243
+ cfg_scale=5.0,
244
+ audio_cfg_scale=None,
245
+ num_inference_steps=50,
246
+ sigma_shift=5.0,
247
+ tiled=True,
248
+ tile_size=(30, 52),
249
+ tile_stride=(15, 26),
250
+ progress_bar_cmd=tqdm,
251
+ progress_bar_st=None,
252
+ **kwargs,
253
+ ):
254
+ # Parameter check
255
+ height, width = self.check_resize_height_width(height, width)
256
+ if num_frames % 4 != 1:
257
+ num_frames = (num_frames + 2) // 4 * 4 + 1
258
+ print(
259
+ f"Only `num_frames % 4 != 1` is acceptable. We round it up to {num_frames}."
260
+ )
261
+
262
+ # Tiler parameters
263
+ tiler_kwargs = {
264
+ "tiled": tiled,
265
+ "tile_size": tile_size,
266
+ "tile_stride": tile_stride,
267
+ }
268
+
269
+ # Scheduler
270
+ self.scheduler.set_timesteps(
271
+ num_inference_steps, denoising_strength, shift=sigma_shift
272
+ )
273
+
274
+ # Initialize noise
275
+ noise = self.generate_noise(
276
+ (1, 16, (num_frames - 1) // 4 + 1, height // 8, width // 8),
277
+ seed=seed,
278
+ device=rand_device,
279
+ dtype=torch.float32,
280
+ ).to(self.device)
281
+ if input_video is not None:
282
+ self.load_models_to_device(["vae"])
283
+ input_video = self.preprocess_images(input_video)
284
+ input_video = torch.stack(input_video, dim=2)
285
+ latents = self.encode_video(input_video, **tiler_kwargs).to(
286
+ dtype=noise.dtype, device=noise.device
287
+ )
288
+ latents = self.scheduler.add_noise(
289
+ latents, noise, timestep=self.scheduler.timesteps[0]
290
+ )
291
+ else:
292
+ latents = noise
293
+
294
+ # Encode prompts
295
+ self.load_models_to_device(["text_encoder"])
296
+ prompt_emb_posi = self.encode_prompt(prompt, positive=True)
297
+ if cfg_scale != 1.0:
298
+ prompt_emb_nega = self.encode_prompt(negative_prompt, positive=False)
299
+
300
+ # Encode image
301
+ if input_image is not None and self.image_encoder is not None:
302
+ self.load_models_to_device(["image_encoder", "vae"])
303
+ image_emb = self.encode_image(input_image, num_frames, height, width)
304
+ else:
305
+ image_emb = {}
306
+
307
+ # Extra input
308
+ extra_input = self.prepare_extra_input(latents)
309
+
310
+ # Denoise
311
+ self.load_models_to_device(["dit"])
312
+ with torch.amp.autocast(
313
+ dtype=torch.bfloat16, device_type=torch.device(self.device).type
314
+ ):
315
+ for progress_id, timestep in enumerate(
316
+ progress_bar_cmd(self.scheduler.timesteps)
317
+ ):
318
+ timestep = timestep.unsqueeze(0).to(
319
+ dtype=torch.float32, device=self.device
320
+ )
321
+
322
+ # Inference
323
+ noise_pred_posi = self.dit(
324
+ latents,
325
+ timestep=timestep,
326
+ **prompt_emb_posi,
327
+ **image_emb,
328
+ **extra_input,
329
+ **kwargs,
330
+ ) # (zt,audio,prompt)
331
+ if audio_cfg_scale is not None:
332
+ audio_scale = kwargs["audio_scale"]
333
+ kwargs["audio_scale"] = 0.0
334
+ noise_pred_noaudio = self.dit(
335
+ latents,
336
+ timestep=timestep,
337
+ **prompt_emb_posi,
338
+ **image_emb,
339
+ **extra_input,
340
+ **kwargs,
341
+ ) # (zt,0,prompt)
342
+ # kwargs['ip_scale'] = ip_scale
343
+ if cfg_scale != 1.0: # prompt cfg
344
+ noise_pred_no_cond = self.dit(
345
+ latents,
346
+ timestep=timestep,
347
+ **prompt_emb_nega,
348
+ **image_emb,
349
+ **extra_input,
350
+ **kwargs,
351
+ ) # (zt,0,0)
352
+ noise_pred = (
353
+ noise_pred_no_cond
354
+ + cfg_scale * (noise_pred_noaudio - noise_pred_no_cond)
355
+ + audio_cfg_scale * (noise_pred_posi - noise_pred_noaudio)
356
+ )
357
+ else:
358
+ noise_pred = noise_pred_noaudio + audio_cfg_scale * (
359
+ noise_pred_posi - noise_pred_noaudio
360
+ )
361
+ kwargs["audio_scale"] = audio_scale
362
+ else:
363
+ if cfg_scale != 1.0:
364
+ noise_pred_nega = self.dit(
365
+ latents,
366
+ timestep=timestep,
367
+ **prompt_emb_nega,
368
+ **image_emb,
369
+ **extra_input,
370
+ **kwargs,
371
+ ) # (zt,audio,0)
372
+ noise_pred = noise_pred_nega + cfg_scale * (
373
+ noise_pred_posi - noise_pred_nega
374
+ )
375
+ else:
376
+ noise_pred = noise_pred_posi
377
+
378
+ # Scheduler
379
+ latents = self.scheduler.step(
380
+ noise_pred, self.scheduler.timesteps[progress_id], latents
381
+ )
382
+
383
+ # Decode
384
+ self.load_models_to_device(["vae"])
385
+ frames = self.decode_video(latents, **tiler_kwargs)
386
+ self.load_models_to_device([])
387
+ frames = self.tensor2video(frames[0])
388
+
389
+ return frames
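
A small sketch of the two-level classifier-free guidance combination applied in the denoising loop above when both cfg_scale and audio_cfg_scale are set; the scalar "predictions" below are toy values, not real model outputs:

import torch

def combine_guidance(pred_uncond, pred_prompt_only, pred_prompt_audio,
                     cfg_scale=5.0, audio_cfg_scale=5.0):
    # Prompt guidance moves from the unconditional prediction towards the
    # prompt-conditioned, audio-free prediction; audio guidance then moves
    # from the audio-free prediction towards the fully conditioned one.
    return (pred_uncond
            + cfg_scale * (pred_prompt_only - pred_uncond)
            + audio_cfg_scale * (pred_prompt_audio - pred_prompt_only))

print(combine_guidance(torch.tensor(0.0), torch.tensor(1.0), torch.tensor(1.2)))
# 0 + 5*(1 - 0) + 5*(1.2 - 1) ≈ 6.0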
FantasyTalking/diffsynth/prompters/__init__.py ADDED
@@ -0,0 +1 @@
+ from .wan_prompter import WanPrompter
FantasyTalking/diffsynth/prompters/base_prompter.py ADDED
@@ -0,0 +1,69 @@
1
+ import torch
2
+
3
+ from ..models.model_manager import ModelManager
4
+
5
+
6
+ def tokenize_long_prompt(tokenizer, prompt, max_length=None):
7
+ # Get model_max_length from the tokenizer
8
+ length = tokenizer.model_max_length if max_length is None else max_length
9
+
10
+ # To avoid the truncation warning, temporarily set tokenizer.model_max_length to a very large value.
11
+ tokenizer.model_max_length = 99999999
12
+
13
+ # Tokenize it!
14
+ input_ids = tokenizer(prompt, return_tensors="pt").input_ids
15
+
16
+ # Determine the real length.
17
+ max_length = (input_ids.shape[1] + length - 1) // length * length
18
+
19
+ # Restore tokenizer.model_max_length
20
+ tokenizer.model_max_length = length
21
+
22
+ # Tokenize it again with fixed length.
23
+ input_ids = tokenizer(
24
+ prompt,
25
+ return_tensors="pt",
26
+ padding="max_length",
27
+ max_length=max_length,
28
+ truncation=True,
29
+ ).input_ids
30
+
31
+ # Reshape input_ids to fit the text encoder.
32
+ num_sentence = input_ids.shape[1] // length
33
+ input_ids = input_ids.reshape((num_sentence, length))
34
+
35
+ return input_ids
36
+
37
+
38
+ class BasePrompter:
39
+ def __init__(self):
40
+ self.refiners = []
41
+ self.extenders = []
42
+
43
+ def load_prompt_refiners(self, model_manager: ModelManager, refiner_classes=[]):
44
+ for refiner_class in refiner_classes:
45
+ refiner = refiner_class.from_model_manager(model_manager)
46
+ self.refiners.append(refiner)
47
+
48
+ def load_prompt_extenders(self, model_manager: ModelManager, extender_classes=[]):
49
+ for extender_class in extender_classes:
50
+ extender = extender_class.from_model_manager(model_manager)
51
+ self.extenders.append(extender)
52
+
53
+ @torch.no_grad()
54
+ def process_prompt(self, prompt, positive=True):
55
+ if isinstance(prompt, list):
56
+ prompt = [
57
+ self.process_prompt(prompt_, positive=positive) for prompt_ in prompt
58
+ ]
59
+ else:
60
+ for refiner in self.refiners:
61
+ prompt = refiner(prompt, positive=positive)
62
+ return prompt
63
+
64
+ @torch.no_grad()
65
+ def extend_prompt(self, prompt: str, positive=True):
66
+ extended_prompt = dict(prompt=prompt)
67
+ for extender in self.extenders:
68
+ extended_prompt = extender(extended_prompt)
69
+ return extended_prompt
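
A quick worked example of the length handling in tokenize_long_prompt above: the real token count is rounded up to a multiple of the tokenizer's model_max_length, and the padded ids are then reshaped into (num_sentence, length) chunks. The numbers below are hypothetical:

length = 77             # hypothetical tokenizer.model_max_length
real_token_count = 180  # hypothetical token count of a long prompt

max_length = (real_token_count + length - 1) // length * length
num_sentence = max_length // length
print(max_length, num_sentence)  # 231 3 -> three 77-token chunks fed to the text encoder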
FantasyTalking/diffsynth/prompters/wan_prompter.py ADDED
@@ -0,0 +1,114 @@
1
+ import html
2
+ import os
3
+ import string
4
+
5
+ import ftfy
6
+ import regex as re
7
+ import torch
8
+ from transformers import AutoTokenizer
9
+
10
+ from ..models.wan_video_text_encoder import WanTextEncoder
11
+ from .base_prompter import BasePrompter
12
+
13
+
14
+ def basic_clean(text):
15
+ text = ftfy.fix_text(text)
16
+ text = html.unescape(html.unescape(text))
17
+ return text.strip()
18
+
19
+
20
+ def whitespace_clean(text):
21
+ text = re.sub(r"\s+", " ", text)
22
+ text = text.strip()
23
+ return text
24
+
25
+
26
+ def canonicalize(text, keep_punctuation_exact_string=None):
27
+ text = text.replace("_", " ")
28
+ if keep_punctuation_exact_string:
29
+ text = keep_punctuation_exact_string.join(
30
+ part.translate(str.maketrans("", "", string.punctuation))
31
+ for part in text.split(keep_punctuation_exact_string)
32
+ )
33
+ else:
34
+ text = text.translate(str.maketrans("", "", string.punctuation))
35
+ text = text.lower()
36
+ text = re.sub(r"\s+", " ", text)
37
+ return text.strip()
38
+
39
+
40
+ class HuggingfaceTokenizer:
41
+ def __init__(self, name, seq_len=None, clean=None, **kwargs):
42
+ assert clean in (None, "whitespace", "lower", "canonicalize")
43
+ self.name = name
44
+ self.seq_len = seq_len
45
+ self.clean = clean
46
+
47
+ # init tokenizer
48
+ self.tokenizer = AutoTokenizer.from_pretrained(name, **kwargs)
49
+ self.vocab_size = self.tokenizer.vocab_size
50
+
51
+ def __call__(self, sequence, **kwargs):
52
+ return_mask = kwargs.pop("return_mask", False)
53
+
54
+ # arguments
55
+ _kwargs = {"return_tensors": "pt"}
56
+ if self.seq_len is not None:
57
+ _kwargs.update(
58
+ {
59
+ "padding": "max_length",
60
+ "truncation": True,
61
+ "max_length": self.seq_len,
62
+ }
63
+ )
64
+ _kwargs.update(**kwargs)
65
+
66
+ # tokenization
67
+ if isinstance(sequence, str):
68
+ sequence = [sequence]
69
+ if self.clean:
70
+ sequence = [self._clean(u) for u in sequence]
71
+ ids = self.tokenizer(sequence, **_kwargs)
72
+
73
+ # output
74
+ if return_mask:
75
+ return ids.input_ids, ids.attention_mask
76
+ else:
77
+ return ids.input_ids
78
+
79
+ def _clean(self, text):
80
+ if self.clean == "whitespace":
81
+ text = whitespace_clean(basic_clean(text))
82
+ elif self.clean == "lower":
83
+ text = whitespace_clean(basic_clean(text)).lower()
84
+ elif self.clean == "canonicalize":
85
+ text = canonicalize(basic_clean(text))
86
+ return text
87
+
88
+
89
+ class WanPrompter(BasePrompter):
90
+ def __init__(self, tokenizer_path=None, text_len=512):
91
+ super().__init__()
92
+ self.text_len = text_len
93
+ self.text_encoder = None
94
+ self.fetch_tokenizer(tokenizer_path)
95
+
96
+ def fetch_tokenizer(self, tokenizer_path=None):
97
+ if tokenizer_path is not None:
98
+ self.tokenizer = HuggingfaceTokenizer(
99
+ name=tokenizer_path, seq_len=self.text_len, clean="whitespace"
100
+ )
101
+
102
+ def fetch_models(self, text_encoder: WanTextEncoder = None):
103
+ self.text_encoder = text_encoder
104
+
105
+ def encode_prompt(self, prompt, positive=True, device="cuda"):
106
+ prompt = self.process_prompt(prompt, positive=positive)
107
+
108
+ ids, mask = self.tokenizer(prompt, return_mask=True, add_special_tokens=True)
109
+ ids = ids.to(device)
110
+ mask = mask.to(device)
111
+ seq_lens = mask.gt(0).sum(dim=1).long()
112
+ prompt_emb = self.text_encoder(ids, mask)
113
+ prompt_emb = [u[:v] for u, v in zip(prompt_emb, seq_lens)]
114
+ return prompt_emb
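
A self-contained sketch of the prompt cleaning that HuggingfaceTokenizer applies above (the ftfy/html fixing done by basic_clean is omitted here for brevity):

import re
import string

def whitespace_clean(text):
    return re.sub(r"\s+", " ", text).strip()

def canonicalize(text):
    text = text.replace("_", " ")
    text = text.translate(str.maketrans("", "", string.punctuation))
    return re.sub(r"\s+", " ", text.lower()).strip()

print(whitespace_clean("A  woman \n is talking."))  # 'A woman is talking.'
print(canonicalize("A_Woman,  is TALKING!"))        # 'a woman is talking'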
FantasyTalking/diffsynth/schedulers/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .continuous_ode import ContinuousODEScheduler
+ from .ddim import EnhancedDDIMScheduler
+ from .flow_match import FlowMatchScheduler
FantasyTalking/diffsynth/schedulers/continuous_ode.py ADDED
@@ -0,0 +1,61 @@
1
+ import torch
2
+
3
+
4
+ class ContinuousODEScheduler:
5
+ def __init__(
6
+ self, num_inference_steps=100, sigma_max=700.0, sigma_min=0.002, rho=7.0
7
+ ):
8
+ self.sigma_max = sigma_max
9
+ self.sigma_min = sigma_min
10
+ self.rho = rho
11
+ self.set_timesteps(num_inference_steps)
12
+
13
+ def set_timesteps(self, num_inference_steps=100, denoising_strength=1.0, **kwargs):
14
+ ramp = torch.linspace(1 - denoising_strength, 1, num_inference_steps)
15
+ min_inv_rho = torch.pow(torch.tensor((self.sigma_min,)), (1 / self.rho))
16
+ max_inv_rho = torch.pow(torch.tensor((self.sigma_max,)), (1 / self.rho))
17
+ self.sigmas = torch.pow(
18
+ max_inv_rho + ramp * (min_inv_rho - max_inv_rho), self.rho
19
+ )
20
+ self.timesteps = torch.log(self.sigmas) * 0.25
21
+
22
+ def step(self, model_output, timestep, sample, to_final=False):
23
+ timestep_id = torch.argmin((self.timesteps - timestep).abs())
24
+ sigma = self.sigmas[timestep_id]
25
+ sample *= (sigma * sigma + 1).sqrt()
26
+ estimated_sample = (
27
+ -sigma / (sigma * sigma + 1).sqrt() * model_output
28
+ + 1 / (sigma * sigma + 1) * sample
29
+ )
30
+ if to_final or timestep_id + 1 >= len(self.timesteps):
31
+ prev_sample = estimated_sample
32
+ else:
33
+ sigma_ = self.sigmas[timestep_id + 1]
34
+ derivative = 1 / sigma * (sample - estimated_sample)
35
+ prev_sample = sample + derivative * (sigma_ - sigma)
36
+ prev_sample /= (sigma_ * sigma_ + 1).sqrt()
37
+ return prev_sample
38
+
39
+ def return_to_timestep(self, timestep, sample, sample_stablized):
40
+ # This scheduler doesn't support this function.
41
+ pass
42
+
43
+ def add_noise(self, original_samples, noise, timestep):
44
+ timestep_id = torch.argmin((self.timesteps - timestep).abs())
45
+ sigma = self.sigmas[timestep_id]
46
+ sample = (original_samples + noise * sigma) / (sigma * sigma + 1).sqrt()
47
+ return sample
48
+
49
+ def training_target(self, sample, noise, timestep):
50
+ timestep_id = torch.argmin((self.timesteps - timestep).abs())
51
+ sigma = self.sigmas[timestep_id]
52
+ target = (
53
+ -(sigma * sigma + 1).sqrt() / sigma + 1 / (sigma * sigma + 1).sqrt() / sigma
54
+ ) * sample + 1 / (sigma * sigma + 1).sqrt() * noise
55
+ return target
56
+
57
+ def training_weight(self, timestep):
58
+ timestep_id = torch.argmin((self.timesteps - timestep).abs())
59
+ sigma = self.sigmas[timestep_id]
60
+ weight = (1 + sigma * sigma).sqrt() / sigma
61
+ return weight
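
A short standalone sketch of the rho-interpolated sigma schedule built by set_timesteps above (Karras-style spacing), with denoising_strength=1.0 so the ramp runs from 0 to 1:

import torch

sigma_max, sigma_min, rho, steps = 700.0, 0.002, 7.0, 5
ramp = torch.linspace(0, 1, steps)
min_inv_rho = sigma_min ** (1 / rho)
max_inv_rho = sigma_max ** (1 / rho)
sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
timesteps = torch.log(sigmas) * 0.25
print(sigmas)     # decreases from 700.0 to 0.002, with steps concentrated at small sigma
print(timesteps)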
FantasyTalking/diffsynth/schedulers/ddim.py ADDED
@@ -0,0 +1,138 @@
1
+ import math
2
+
3
+ import torch
4
+
5
+
6
+ class EnhancedDDIMScheduler:
7
+ def __init__(
8
+ self,
9
+ num_train_timesteps=1000,
10
+ beta_start=0.00085,
11
+ beta_end=0.012,
12
+ beta_schedule="scaled_linear",
13
+ prediction_type="epsilon",
14
+ rescale_zero_terminal_snr=False,
15
+ ):
16
+ self.num_train_timesteps = num_train_timesteps
17
+ if beta_schedule == "scaled_linear":
18
+ betas = torch.square(
19
+ torch.linspace(
20
+ math.sqrt(beta_start),
21
+ math.sqrt(beta_end),
22
+ num_train_timesteps,
23
+ dtype=torch.float32,
24
+ )
25
+ )
26
+ elif beta_schedule == "linear":
27
+ betas = torch.linspace(
28
+ beta_start, beta_end, num_train_timesteps, dtype=torch.float32
29
+ )
30
+ else:
31
+ raise NotImplementedError(f"{beta_schedule} is not implemented")
32
+ self.alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)
33
+ if rescale_zero_terminal_snr:
34
+ self.alphas_cumprod = self.rescale_zero_terminal_snr(self.alphas_cumprod)
35
+ self.alphas_cumprod = self.alphas_cumprod.tolist()
36
+ self.set_timesteps(10)
37
+ self.prediction_type = prediction_type
38
+
39
+ def rescale_zero_terminal_snr(self, alphas_cumprod):
40
+ alphas_bar_sqrt = alphas_cumprod.sqrt()
41
+
42
+ # Store old values.
43
+ alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
44
+ alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
45
+
46
+ # Shift so the last timestep is zero.
47
+ alphas_bar_sqrt -= alphas_bar_sqrt_T
48
+
49
+ # Scale so the first timestep is back to the old value.
50
+ alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
51
+
52
+ # Convert alphas_bar_sqrt to betas
53
+ alphas_bar = alphas_bar_sqrt.square() # Revert sqrt
54
+
55
+ return alphas_bar
56
+
57
+ def set_timesteps(self, num_inference_steps, denoising_strength=1.0, **kwargs):
58
+ # The timesteps are aligned to 999...0, which is different from other implementations,
59
+ # but I think this implementation is more reasonable in theory.
60
+ max_timestep = max(round(self.num_train_timesteps * denoising_strength) - 1, 0)
61
+ num_inference_steps = min(num_inference_steps, max_timestep + 1)
62
+ if num_inference_steps == 1:
63
+ self.timesteps = torch.Tensor([max_timestep])
64
+ else:
65
+ step_length = max_timestep / (num_inference_steps - 1)
66
+ self.timesteps = torch.Tensor(
67
+ [
68
+ round(max_timestep - i * step_length)
69
+ for i in range(num_inference_steps)
70
+ ]
71
+ )
72
+
73
+ def denoise(self, model_output, sample, alpha_prod_t, alpha_prod_t_prev):
74
+ if self.prediction_type == "epsilon":
75
+ weight_e = math.sqrt(1 - alpha_prod_t_prev) - math.sqrt(
76
+ alpha_prod_t_prev * (1 - alpha_prod_t) / alpha_prod_t
77
+ )
78
+ weight_x = math.sqrt(alpha_prod_t_prev / alpha_prod_t)
79
+ prev_sample = sample * weight_x + model_output * weight_e
80
+ elif self.prediction_type == "v_prediction":
81
+ weight_e = -math.sqrt(alpha_prod_t_prev * (1 - alpha_prod_t)) + math.sqrt(
82
+ alpha_prod_t * (1 - alpha_prod_t_prev)
83
+ )
84
+ weight_x = math.sqrt(alpha_prod_t * alpha_prod_t_prev) + math.sqrt(
85
+ (1 - alpha_prod_t) * (1 - alpha_prod_t_prev)
86
+ )
87
+ prev_sample = sample * weight_x + model_output * weight_e
88
+ else:
89
+ raise NotImplementedError(f"{self.prediction_type} is not implemented")
90
+ return prev_sample
91
+
92
+ def step(self, model_output, timestep, sample, to_final=False):
93
+ alpha_prod_t = self.alphas_cumprod[int(timestep.flatten().tolist()[0])]
94
+ if isinstance(timestep, torch.Tensor):
95
+ timestep = timestep.cpu()
96
+ timestep_id = torch.argmin((self.timesteps - timestep).abs())
97
+ if to_final or timestep_id + 1 >= len(self.timesteps):
98
+ alpha_prod_t_prev = 1.0
99
+ else:
100
+ timestep_prev = int(self.timesteps[timestep_id + 1])
101
+ alpha_prod_t_prev = self.alphas_cumprod[timestep_prev]
102
+
103
+ return self.denoise(model_output, sample, alpha_prod_t, alpha_prod_t_prev)
104
+
105
+ def return_to_timestep(self, timestep, sample, sample_stablized):
106
+ alpha_prod_t = self.alphas_cumprod[int(timestep.flatten().tolist()[0])]
107
+ noise_pred = (sample - math.sqrt(alpha_prod_t) * sample_stablized) / math.sqrt(
108
+ 1 - alpha_prod_t
109
+ )
110
+ return noise_pred
111
+
112
+ def add_noise(self, original_samples, noise, timestep):
113
+ sqrt_alpha_prod = math.sqrt(
114
+ self.alphas_cumprod[int(timestep.flatten().tolist()[0])]
115
+ )
116
+ sqrt_one_minus_alpha_prod = math.sqrt(
117
+ 1 - self.alphas_cumprod[int(timestep.flatten().tolist()[0])]
118
+ )
119
+ noisy_samples = (
120
+ sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
121
+ )
122
+ return noisy_samples
123
+
124
+ def training_target(self, sample, noise, timestep):
125
+ if self.prediction_type == "epsilon":
126
+ return noise
127
+ else:
128
+ sqrt_alpha_prod = math.sqrt(
129
+ self.alphas_cumprod[int(timestep.flatten().tolist()[0])]
130
+ )
131
+ sqrt_one_minus_alpha_prod = math.sqrt(
132
+ 1 - self.alphas_cumprod[int(timestep.flatten().tolist()[0])]
133
+ )
134
+ target = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
135
+ return target
136
+
137
+ def training_weight(self, timestep):
138
+ return 1.0
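
A minimal sketch of the forward process implemented by add_noise above, using the same scaled_linear beta schedule; x0 and noise here are toy tensors:

import math
import torch

betas = torch.square(torch.linspace(math.sqrt(0.00085), math.sqrt(0.012), 1000))
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)

x0, noise, t = torch.randn(4), torch.randn(4), 500
alpha_bar = alphas_cumprod[t].item()
x_t = math.sqrt(alpha_bar) * x0 + math.sqrt(1 - alpha_bar) * noise
print(alpha_bar, x_t)  # x_t = sqrt(alpha_bar_t) * x0 + sqrt(1 - alpha_bar_t) * noise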
FantasyTalking/diffsynth/schedulers/flow_match.py ADDED
@@ -0,0 +1,97 @@
1
+ import torch
2
+
3
+
4
+ class FlowMatchScheduler:
5
+ def __init__(
6
+ self,
7
+ num_inference_steps=100,
8
+ num_train_timesteps=1000,
9
+ shift=3.0,
10
+ sigma_max=1.0,
11
+ sigma_min=0.003 / 1.002,
12
+ inverse_timesteps=False,
13
+ extra_one_step=False,
14
+ reverse_sigmas=False,
15
+ ):
16
+ self.num_train_timesteps = num_train_timesteps
17
+ self.shift = shift
18
+ self.sigma_max = sigma_max
19
+ self.sigma_min = sigma_min
20
+ self.inverse_timesteps = inverse_timesteps
21
+ self.extra_one_step = extra_one_step
22
+ self.reverse_sigmas = reverse_sigmas
23
+ self.set_timesteps(num_inference_steps)
24
+
25
+ def set_timesteps(
26
+ self,
27
+ num_inference_steps=100,
28
+ denoising_strength=1.0,
29
+ training=False,
30
+ shift=None,
31
+ ):
32
+ if shift is not None:
33
+ self.shift = shift
34
+ sigma_start = (
35
+ self.sigma_min + (self.sigma_max - self.sigma_min) * denoising_strength
36
+ )
37
+ if self.extra_one_step:
38
+ self.sigmas = torch.linspace(
39
+ sigma_start, self.sigma_min, num_inference_steps + 1
40
+ )[:-1]
41
+ else:
42
+ self.sigmas = torch.linspace(
43
+ sigma_start, self.sigma_min, num_inference_steps
44
+ )
45
+ if self.inverse_timesteps:
46
+ self.sigmas = torch.flip(self.sigmas, dims=[0])
47
+ self.sigmas = self.shift * self.sigmas / (1 + (self.shift - 1) * self.sigmas)
48
+ if self.reverse_sigmas:
49
+ self.sigmas = 1 - self.sigmas
50
+ self.timesteps = self.sigmas * self.num_train_timesteps
51
+ if training:
52
+ x = self.timesteps
53
+ y = torch.exp(
54
+ -2 * ((x - num_inference_steps / 2) / num_inference_steps) ** 2
55
+ )
56
+ y_shifted = y - y.min()
57
+ bsmntw_weighing = y_shifted * (num_inference_steps / y_shifted.sum())
58
+ self.linear_timesteps_weights = bsmntw_weighing
59
+
60
+ def step(self, model_output, timestep, sample, to_final=False):
61
+ if isinstance(timestep, torch.Tensor):
62
+ timestep = timestep.cpu()
63
+ timestep_id = torch.argmin((self.timesteps - timestep).abs())
64
+ sigma = self.sigmas[timestep_id]
65
+ if to_final or timestep_id + 1 >= len(self.timesteps):
66
+ sigma_ = 1 if (self.inverse_timesteps or self.reverse_sigmas) else 0
67
+ else:
68
+ sigma_ = self.sigmas[timestep_id + 1]
69
+ prev_sample = sample + model_output * (sigma_ - sigma)
70
+ return prev_sample
71
+
72
+ def return_to_timestep(self, timestep, sample, sample_stablized):
73
+ if isinstance(timestep, torch.Tensor):
74
+ timestep = timestep.cpu()
75
+ timestep_id = torch.argmin((self.timesteps - timestep).abs())
76
+ sigma = self.sigmas[timestep_id]
77
+ model_output = (sample - sample_stablized) / sigma
78
+ return model_output
79
+
80
+ def add_noise(self, original_samples, noise, timestep):
81
+ if isinstance(timestep, torch.Tensor):
82
+ timestep = timestep.cpu()
83
+ timestep_id = torch.argmin((self.timesteps - timestep).abs())
84
+ sigma = self.sigmas[timestep_id]
85
+ sample = (1 - sigma) * original_samples + sigma * noise
86
+ return sample
87
+
88
+ def training_target(self, sample, noise, timestep):
89
+ target = noise - sample
90
+ return target
91
+
92
+ def training_weight(self, timestep):
93
+ timestep_id = torch.argmin(
94
+ (self.timesteps - timestep.to(self.timesteps.device)).abs()
95
+ )
96
+ weights = self.linear_timesteps_weights[timestep_id]
97
+ return weights
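
A standalone sketch of the shifted sigma schedule that set_timesteps above produces with the pipeline's settings (shift=5, sigma_min=0, extra_one_step=True); a denoising step is then just an Euler update, sample + model_output * (sigma_next - sigma):

import torch

num_inference_steps, shift, num_train_timesteps = 30, 5.0, 1000
sigmas = torch.linspace(1.0, 0.0, num_inference_steps + 1)[:-1]  # extra_one_step drops the final 0
sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)             # the shift biases steps towards high noise
timesteps = sigmas * num_train_timesteps
print(timesteps[:3], timesteps[-1])                              # starts at 1000.0; the last of the 30 steps is about 147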
FantasyTalking/diffsynth/vram_management/__init__.py ADDED
@@ -0,0 +1 @@
+ from .layers import *
FantasyTalking/diffsynth/vram_management/layers.py ADDED
@@ -0,0 +1,177 @@
1
+ import copy
2
+
3
+ import torch
4
+
5
+ from ..models.utils import init_weights_on_device
6
+
7
+
8
+ def cast_to(weight, dtype, device):
9
+ r = torch.empty_like(weight, dtype=dtype, device=device)
10
+ r.copy_(weight)
11
+ return r
12
+
13
+
14
+ class AutoWrappedModule(torch.nn.Module):
15
+ def __init__(
16
+ self,
17
+ module: torch.nn.Module,
18
+ offload_dtype,
19
+ offload_device,
20
+ onload_dtype,
21
+ onload_device,
22
+ computation_dtype,
23
+ computation_device,
24
+ ):
25
+ super().__init__()
26
+ self.module = module.to(dtype=offload_dtype, device=offload_device)
27
+ self.offload_dtype = offload_dtype
28
+ self.offload_device = offload_device
29
+ self.onload_dtype = onload_dtype
30
+ self.onload_device = onload_device
31
+ self.computation_dtype = computation_dtype
32
+ self.computation_device = computation_device
33
+ self.state = 0
34
+
35
+ def offload(self):
36
+ if self.state == 1 and (
37
+ self.offload_dtype != self.onload_dtype
38
+ or self.offload_device != self.onload_device
39
+ ):
40
+ self.module.to(dtype=self.offload_dtype, device=self.offload_device)
41
+ self.state = 0
42
+
43
+ def onload(self):
44
+ if self.state == 0 and (
45
+ self.offload_dtype != self.onload_dtype
46
+ or self.offload_device != self.onload_device
47
+ ):
48
+ self.module.to(dtype=self.onload_dtype, device=self.onload_device)
49
+ self.state = 1
50
+
51
+ def forward(self, *args, **kwargs):
52
+ if (
53
+ self.onload_dtype == self.computation_dtype
54
+ and self.onload_device == self.computation_device
55
+ ):
56
+ module = self.module
57
+ else:
58
+ module = copy.deepcopy(self.module).to(
59
+ dtype=self.computation_dtype, device=self.computation_device
60
+ )
61
+ return module(*args, **kwargs)
62
+
63
+
64
+ class AutoWrappedLinear(torch.nn.Linear):
65
+ def __init__(
66
+ self,
67
+ module: torch.nn.Linear,
68
+ offload_dtype,
69
+ offload_device,
70
+ onload_dtype,
71
+ onload_device,
72
+ computation_dtype,
73
+ computation_device,
74
+ ):
75
+ with init_weights_on_device(device=torch.device("meta")):
76
+ super().__init__(
77
+ in_features=module.in_features,
78
+ out_features=module.out_features,
79
+ bias=module.bias is not None,
80
+ dtype=offload_dtype,
81
+ device=offload_device,
82
+ )
83
+ self.weight = module.weight
84
+ self.bias = module.bias
85
+ self.offload_dtype = offload_dtype
86
+ self.offload_device = offload_device
87
+ self.onload_dtype = onload_dtype
88
+ self.onload_device = onload_device
89
+ self.computation_dtype = computation_dtype
90
+ self.computation_device = computation_device
91
+ self.state = 0
92
+
93
+ def offload(self):
94
+ if self.state == 1 and (
95
+ self.offload_dtype != self.onload_dtype
96
+ or self.offload_device != self.onload_device
97
+ ):
98
+ self.to(dtype=self.offload_dtype, device=self.offload_device)
99
+ self.state = 0
100
+
101
+ def onload(self):
102
+ if self.state == 0 and (
103
+ self.offload_dtype != self.onload_dtype
104
+ or self.offload_device != self.onload_device
105
+ ):
106
+ self.to(dtype=self.onload_dtype, device=self.onload_device)
107
+ self.state = 1
108
+
109
+ def forward(self, x, *args, **kwargs):
110
+ if (
111
+ self.onload_dtype == self.computation_dtype
112
+ and self.onload_device == self.computation_device
113
+ ):
114
+ weight, bias = self.weight, self.bias
115
+ else:
116
+ weight = cast_to(
117
+ self.weight, self.computation_dtype, self.computation_device
118
+ )
119
+ bias = (
120
+ None
121
+ if self.bias is None
122
+ else cast_to(self.bias, self.computation_dtype, self.computation_device)
123
+ )
124
+ return torch.nn.functional.linear(x, weight, bias)
125
+
126
+
127
+ def enable_vram_management_recursively(
128
+ model: torch.nn.Module,
129
+ module_map: dict,
130
+ module_config: dict,
131
+ max_num_param=None,
132
+ overflow_module_config: dict = None,
133
+ total_num_param=0,
134
+ ):
135
+ for name, module in model.named_children():
136
+ for source_module, target_module in module_map.items():
137
+ if isinstance(module, source_module):
138
+ num_param = sum(p.numel() for p in module.parameters())
139
+ if (
140
+ max_num_param is not None
141
+ and total_num_param + num_param > max_num_param
142
+ ):
143
+ module_config_ = overflow_module_config
144
+ else:
145
+ module_config_ = module_config
146
+ module_ = target_module(module, **module_config_)
147
+ setattr(model, name, module_)
148
+ total_num_param += num_param
149
+ break
150
+ else:
151
+ total_num_param = enable_vram_management_recursively(
152
+ module,
153
+ module_map,
154
+ module_config,
155
+ max_num_param,
156
+ overflow_module_config,
157
+ total_num_param,
158
+ )
159
+ return total_num_param
160
+
161
+
162
+ def enable_vram_management(
163
+ model: torch.nn.Module,
164
+ module_map: dict,
165
+ module_config: dict,
166
+ max_num_param=None,
167
+ overflow_module_config: dict = None,
168
+ ):
169
+ enable_vram_management_recursively(
170
+ model,
171
+ module_map,
172
+ module_config,
173
+ max_num_param,
174
+ overflow_module_config,
175
+ total_num_param=0,
176
+ )
177
+ model.vram_management_enabled = True
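
A rough usage sketch of enable_vram_management on a toy model, assuming the repository root is on PYTHONPATH and its requirements are installed (the same assumption infer.py makes); a CPU-only config is used so no GPU is needed:

import torch
from diffsynth.vram_management import AutoWrappedLinear, AutoWrappedModule, enable_vram_management

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.LayerNorm(8))
cpu_config = dict(
    offload_dtype=torch.float32, offload_device="cpu",
    onload_dtype=torch.float32, onload_device="cpu",
    computation_dtype=torch.float32, computation_device="cpu",
)
enable_vram_management(
    model,
    module_map={torch.nn.Linear: AutoWrappedLinear, torch.nn.LayerNorm: AutoWrappedModule},
    module_config=cpu_config,
)
print(type(model[0]).__name__, model.vram_management_enabled)  # AutoWrappedLinear True
print(model(torch.randn(2, 8)).shape)                          # torch.Size([2, 8])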
FantasyTalking/infer.py ADDED
@@ -0,0 +1,236 @@
1
+ # Copyright Alibaba Inc. All Rights Reserved.
2
+
3
+ import argparse
4
+ import os
5
+ import subprocess
6
+ from datetime import datetime
7
+ from pathlib import Path
8
+
9
+ import cv2
10
+ import librosa
11
+ import torch
12
+ from PIL import Image
13
+ from transformers import Wav2Vec2Model, Wav2Vec2Processor
14
+
15
+ from diffsynth import ModelManager, WanVideoPipeline
16
+ from model import FantasyTalkingAudioConditionModel
17
+ from utils import get_audio_features, resize_image_by_longest_edge, save_video
18
+
19
+
20
+ def parse_args():
21
+ parser = argparse.ArgumentParser(description="FantasyTalking inference script.")
22
+
23
+ parser.add_argument(
24
+ "--wan_model_dir",
25
+ type=str,
26
+ default="./models/Wan2.1-I2V-14B-720P",
27
+ required=False,
28
+ help="The dir of the Wan I2V 14B model.",
29
+ )
30
+ parser.add_argument(
31
+ "--fantasytalking_model_path",
32
+ type=str,
33
+ default="./models/fantasytalking_model.ckpt",
34
+ required=False,
35
+ help="The .ckpt path of fantasytalking model.",
36
+ )
37
+ parser.add_argument(
38
+ "--wav2vec_model_dir",
39
+ type=str,
40
+ default="./models/wav2vec2-base-960h",
41
+ required=False,
42
+ help="The dir of wav2vec model.",
43
+ )
44
+
45
+ parser.add_argument(
46
+ "--image_path",
47
+ type=str,
48
+ default="./assets/images/woman.png",
49
+ required=False,
50
+ help="The path of the image.",
51
+ )
52
+
53
+ parser.add_argument(
54
+ "--audio_path",
55
+ type=str,
56
+ default="./assets/audios/woman.wav",
57
+ required=False,
58
+ help="The path of the audio.",
59
+ )
60
+ parser.add_argument(
61
+ "--prompt",
62
+ type=str,
63
+ default="A woman is talking.",
64
+ required=False,
65
+ help="prompt.",
66
+ )
67
+ parser.add_argument(
68
+ "--output_dir",
69
+ type=str,
70
+ default="./output",
71
+ help="Dir to save the model.",
72
+ )
73
+ parser.add_argument(
74
+ "--image_size",
75
+ type=int,
76
+ default=512,
77
+ help="The image will be resized proportionally to this size.",
78
+ )
79
+ parser.add_argument(
80
+ "--audio_scale",
81
+ type=float,
82
+ default=1.0,
83
+ help="Audio condition injection weight",
84
+ )
85
+ parser.add_argument(
86
+ "--prompt_cfg_scale",
87
+ type=float,
88
+ default=5.0,
89
+ required=False,
90
+ help="Prompt cfg scale",
91
+ )
92
+ parser.add_argument(
93
+ "--audio_cfg_scale",
94
+ type=float,
95
+ default=5.0,
96
+ required=False,
97
+ help="Audio cfg scale",
98
+ )
99
+ parser.add_argument(
100
+ "--max_num_frames",
101
+ type=int,
102
+ default=81,
103
+ required=False,
104
+ help="The maximum frames for generating videos, the audio part exceeding max_num_frames/fps will be truncated.",
105
+ )
106
+ parser.add_argument(
107
+ "--fps",
108
+ type=int,
109
+ default=23,
110
+ required=False,
111
+ )
112
+ parser.add_argument(
113
+ "--num_persistent_param_in_dit",
114
+ type=int,
115
+ default=None,
116
+ required=False,
117
+ help="Maximum parameter quantity retained in video memory, small number to reduce VRAM required",
118
+ )
119
+ parser.add_argument(
120
+ "--seed",
121
+ type=int,
122
+ default=1111,
123
+ required=False,
124
+ )
125
+ args = parser.parse_args()
126
+ return args
127
+
128
+
129
+ def load_models(args):
130
+ # Load Wan I2V models
131
+ model_manager = ModelManager(device="cpu")
132
+ model_manager.load_models(
133
+ [
134
+ [
135
+ f"{args.wan_model_dir}/diffusion_pytorch_model-00001-of-00007.safetensors",
136
+ f"{args.wan_model_dir}/diffusion_pytorch_model-00002-of-00007.safetensors",
137
+ f"{args.wan_model_dir}/diffusion_pytorch_model-00003-of-00007.safetensors",
138
+ f"{args.wan_model_dir}/diffusion_pytorch_model-00004-of-00007.safetensors",
139
+ f"{args.wan_model_dir}/diffusion_pytorch_model-00005-of-00007.safetensors",
140
+ f"{args.wan_model_dir}/diffusion_pytorch_model-00006-of-00007.safetensors",
141
+ f"{args.wan_model_dir}/diffusion_pytorch_model-00007-of-00007.safetensors",
142
+ ],
143
+ f"{args.wan_model_dir}/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth",
144
+ f"{args.wan_model_dir}/models_t5_umt5-xxl-enc-bf16.pth",
145
+ f"{args.wan_model_dir}/Wan2.1_VAE.pth",
146
+ ],
147
+ # torch_dtype=torch.float8_e4m3fn, # You can set `torch_dtype=torch.bfloat16` to disable FP8 quantization.
148
+ torch_dtype=torch.bfloat16, # bfloat16 disables FP8 quantization; switch to the commented float8_e4m3fn dtype above to reduce VRAM.
149
+ )
150
+ pipe = WanVideoPipeline.from_model_manager(
151
+ model_manager, torch_dtype=torch.bfloat16, device="cuda"
152
+ )
153
+
154
+ # Load FantasyTalking weights
155
+ fantasytalking = FantasyTalkingAudioConditionModel(pipe.dit, 768, 2048).to("cuda")
156
+ fantasytalking.load_audio_processor(args.fantasytalking_model_path, pipe.dit)
157
+
158
+ # You can set `num_persistent_param_in_dit` to a small number to reduce VRAM required.
159
+ pipe.enable_vram_management(
160
+ num_persistent_param_in_dit=args.num_persistent_param_in_dit
161
+ )
162
+
163
+ # Load wav2vec models
164
+ wav2vec_processor = Wav2Vec2Processor.from_pretrained(args.wav2vec_model_dir)
165
+ wav2vec = Wav2Vec2Model.from_pretrained(args.wav2vec_model_dir).to("cuda")
166
+
167
+ return pipe, fantasytalking, wav2vec_processor, wav2vec
168
+
169
+
170
+ def main(args, pipe, fantasytalking, wav2vec_processor, wav2vec):
171
+ os.makedirs(args.output_dir, exist_ok=True)
172
+
173
+ duration = librosa.get_duration(path=args.audio_path) # the "filename" keyword was removed in librosa >= 0.10
174
+ num_frames = min(int(args.fps * duration // 4) * 4 + 5, args.max_num_frames)
175
+
176
+ audio_wav2vec_fea = get_audio_features(
177
+ wav2vec, wav2vec_processor, args.audio_path, args.fps, num_frames
178
+ )
179
+ image = resize_image_by_longest_edge(args.image_path, args.image_size)
180
+ width, height = image.size
181
+
182
+ audio_proj_fea = fantasytalking.get_proj_fea(audio_wav2vec_fea)
183
+ pos_idx_ranges = fantasytalking.split_audio_sequence(
184
+ audio_proj_fea.size(1), num_frames=num_frames
185
+ )
186
+ audio_proj_split, audio_context_lens = fantasytalking.split_tensor_with_padding(
187
+ audio_proj_fea, pos_idx_ranges, expand_length=4
188
+ ) # [b,21,9+8,768]
189
+
190
+ # Image-to-video
191
+ video_audio = pipe(
192
+ prompt=args.prompt,
193
+ negative_prompt="人物静止不动,静止,色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走",
194
+ input_image=image,
195
+ width=width,
196
+ height=height,
197
+ num_frames=num_frames,
198
+ num_inference_steps=30,
199
+ seed=args.seed,
200
+ tiled=True,
201
+ audio_scale=args.audio_scale,
202
+ cfg_scale=args.prompt_cfg_scale,
203
+ audio_cfg_scale=args.audio_cfg_scale,
204
+ audio_proj=audio_proj_split,
205
+ audio_context_lens=audio_context_lens,
206
+ latents_num_frames=(num_frames - 1) // 4 + 1,
207
+ )
208
+ current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
209
+ save_path_tmp = f"{args.output_dir}/tmp_{Path(args.image_path).stem}_{Path(args.audio_path).stem}_{current_time}.mp4"
210
+ save_video(video_audio, save_path_tmp, fps=args.fps, quality=5)
211
+
212
+ save_path = f"{args.output_dir}/{Path(args.image_path).stem}_{Path(args.audio_path).stem}_{current_time}.mp4"
213
+ final_command = [
214
+ "ffmpeg",
215
+ "-y",
216
+ "-i",
217
+ save_path_tmp,
218
+ "-i",
219
+ args.audio_path,
220
+ "-c:v",
221
+ "libx264",
222
+ "-c:a",
223
+ "aac",
224
+ "-shortest",
225
+ save_path,
226
+ ]
227
+ subprocess.run(final_command, check=True)
228
+ os.remove(save_path_tmp)
229
+ return save_path
230
+
231
+
232
+ if __name__ == "__main__":
233
+ args = parse_args()
234
+ pipe, fantasytalking, wav2vec_processor, wav2vec = load_models(args)
235
+
236
+ main(args, pipe, fantasytalking, wav2vec_processor, wav2vec)
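
A worked example of the frame-count arithmetic in main() above: the frame count is clamped to max_num_frames and always satisfies num_frames % 4 == 1, which is what WanVideoPipeline expects:

fps, max_num_frames = 23, 81
for duration in (2.0, 5.0):                      # audio length in seconds
    num_frames = min(int(fps * duration // 4) * 4 + 5, max_num_frames)
    print(duration, num_frames, num_frames % 4)  # 2.0 -> 49 (49 % 4 == 1), 5.0 -> 81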
FantasyTalking/infer.sh ADDED
@@ -0,0 +1,11 @@
+ python infer.py \
+ --image_path ./assets/images/woman.png \
+ --audio_path ./assets/audios/woman.wav \
+ --prompt "A woman is talking." \
+ --max_num_frames 81 \
+ --image_size 512 \
+ --audio_scale 1.0 \
+ --prompt_cfg_scale 5.0 \
+ --audio_cfg_scale 5.0 \
+ --fps 23 \
+ --seed 1111
FantasyTalking/infer_24G.sh ADDED
@@ -0,0 +1,12 @@
+ CUDA_VISIBLE_DEVICES=2 python infer.py \
+ --image_path ./assets/images/woman.png \
+ --audio_path ./assets/audios/woman.wav \
+ --prompt "A woman is talking." \
+ --max_num_frames 81 \
+ --image_size 512 \
+ --audio_scale 1.0 \
+ --prompt_cfg_scale 5.0 \
+ --audio_cfg_scale 5.0 \
+ --fps 23 \
+ --num_persistent_param_in_dit 7000000000 \
+ --seed 1111
FantasyTalking/model.py ADDED
@@ -0,0 +1,228 @@
1
+ import os
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ from safetensors import safe_open
7
+
8
+ from diffsynth.models.wan_video_dit import WanModel, flash_attention, attention
9
+
10
+
11
+ class AudioProjModel(nn.Module):
12
+ def __init__(self, audio_in_dim=1024, cross_attention_dim=1024):
13
+ super().__init__()
14
+ self.cross_attention_dim = cross_attention_dim
15
+ self.proj = torch.nn.Linear(audio_in_dim, cross_attention_dim, bias=False)
16
+ self.norm = torch.nn.LayerNorm(cross_attention_dim)
17
+
18
+ def forward(self, audio_embeds):
19
+ context_tokens = self.proj(audio_embeds)
20
+ context_tokens = self.norm(context_tokens)
21
+ return context_tokens # [B,L,C]
22
+
23
+
24
+ class WanCrossAttentionProcessor(nn.Module):
25
+ def __init__(self, context_dim, hidden_dim):
26
+ super().__init__()
27
+
28
+ self.context_dim = context_dim
29
+ self.hidden_dim = hidden_dim
30
+
31
+ self.k_proj = nn.Linear(context_dim, hidden_dim, bias=False)
32
+ self.v_proj = nn.Linear(context_dim, hidden_dim, bias=False)
33
+
34
+ nn.init.zeros_(self.k_proj.weight)
35
+ nn.init.zeros_(self.v_proj.weight)
36
+
37
+ def __call__(
38
+ self,
39
+ attn: nn.Module,
40
+ x: torch.Tensor,
41
+ context: torch.Tensor,
42
+ context_lens: torch.Tensor,
43
+ audio_proj: torch.Tensor,
44
+ audio_context_lens: torch.Tensor,
45
+ latents_num_frames: int = 21,
46
+ audio_scale: float = 1.0,
47
+ ) -> torch.Tensor:
48
+ """
49
+ x: [B, L1, C].
50
+ context: [B, L2, C].
51
+ context_lens: [B].
52
+ audio_proj: [B, 21, L3, C]
53
+ audio_context_lens: [B*21].
54
+ """
55
+ context_img = context[:, :257]
56
+ context = context[:, 257:]
57
+ b, n, d = x.size(0), attn.num_heads, attn.head_dim
58
+
59
+ # compute query, key, value
60
+ q = attn.norm_q(attn.q(x)).view(b, -1, n, d)
61
+ k = attn.norm_k(attn.k(context)).view(b, -1, n, d)
62
+ v = attn.v(context).view(b, -1, n, d)
63
+ k_img = attn.norm_k_img(attn.k_img(context_img)).view(b, -1, n, d)
64
+ v_img = attn.v_img(context_img).view(b, -1, n, d)
65
+ img_x = flash_attention(q, k_img, v_img, k_lens=None)
66
+ # compute attention
67
+ x = flash_attention(q, k, v, k_lens=context_lens)
68
+ x = x.flatten(2)
69
+ img_x = img_x.flatten(2)
70
+
71
+ if len(audio_proj.shape) == 4:
72
+ audio_q = q.view(b * latents_num_frames, -1, n, d) # [b, 21, l1, n, d]
73
+ ip_key = self.k_proj(audio_proj).view(b * latents_num_frames, -1, n, d)
74
+ ip_value = self.v_proj(audio_proj).view(b * latents_num_frames, -1, n, d)
75
+ audio_x = attention(
76
+ audio_q, ip_key, ip_value, k_lens=audio_context_lens
77
+ )
78
+ audio_x = audio_x.view(b, q.size(1), n, d)
79
+ audio_x = audio_x.flatten(2)
80
+ elif len(audio_proj.shape) == 3:
81
+ ip_key = self.k_proj(audio_proj).view(b, -1, n, d)
82
+ ip_value = self.v_proj(audio_proj).view(b, -1, n, d)
83
+ audio_x = attention(q, ip_key, ip_value, k_lens=audio_context_lens)
84
+ audio_x = audio_x.flatten(2)
85
+ # output
86
+ x = x + img_x + audio_x * audio_scale
87
+ x = attn.o(x)
88
+ return x
89
+
90
+
91
+ class FantasyTalkingAudioConditionModel(nn.Module):
92
+ def __init__(self, wan_dit: WanModel, audio_in_dim: int, audio_proj_dim: int):
93
+ super().__init__()
94
+
95
+ self.audio_in_dim = audio_in_dim
96
+ self.audio_proj_dim = audio_proj_dim
97
+
98
+ # audio proj model
99
+ self.proj_model = self.init_proj(self.audio_proj_dim)
100
+ self.set_audio_processor(wan_dit)
101
+
102
+ def init_proj(self, cross_attention_dim=5120):
103
+ proj_model = AudioProjModel(
104
+ audio_in_dim=self.audio_in_dim, cross_attention_dim=cross_attention_dim
105
+ )
106
+ return proj_model
107
+
108
+ def set_audio_processor(self, wan_dit):
109
+ attn_procs = {}
110
+ for name in wan_dit.attn_processors.keys():
111
+ attn_procs[name] = WanCrossAttentionProcessor(
112
+ context_dim=self.audio_proj_dim, hidden_dim=wan_dit.dim
113
+ )
114
+ wan_dit.set_attn_processor(attn_procs)
115
+
116
+ def load_audio_processor(self, ip_ckpt: str, wan_dit):
117
+ if os.path.splitext(ip_ckpt)[-1] == ".safetensors":
118
+ state_dict = {"proj_model": {}, "audio_processor": {}}
119
+ with safe_open(ip_ckpt, framework="pt", device="cpu") as f:
120
+ for key in f.keys():
121
+ if key.startswith("proj_model."):
122
+ state_dict["proj_model"][
123
+ key.replace("proj_model.", "")
124
+ ] = f.get_tensor(key)
125
+ elif key.startswith("audio_processor."):
126
+ state_dict["audio_processor"][
127
+ key.replace("audio_processor.", "")
128
+ ] = f.get_tensor(key)
129
+ else:
130
+ state_dict = torch.load(ip_ckpt, map_location="cpu")
131
+ self.proj_model.load_state_dict(state_dict["proj_model"])
132
+ wan_dit.load_state_dict(state_dict["audio_processor"], strict=False)
133
+
134
+ def get_proj_fea(self, audio_fea=None):
135
+ return self.proj_model(audio_fea) if audio_fea is not None else None
136
+
137
+ def split_audio_sequence(self, audio_proj_length, num_frames=81):
138
+ """
139
+ Map the audio feature sequence to corresponding latent frame slices.
140
+
141
+ Args:
142
+ audio_proj_length (int): The total length of the audio feature sequence
143
+ (e.g., 173 in audio_proj[1, 173, 768]).
144
+ num_frames (int): The number of video frames in the training data (default: 81).
145
+
146
+ Returns:
147
+ list: A list of [start_idx, end_idx] pairs. Each pair represents the index range
148
+ (within the audio feature sequence) corresponding to a latent frame.
149
+ """
150
+ # Average number of tokens per original video frame
151
+ tokens_per_frame = audio_proj_length / num_frames
152
+
153
+ # Each latent frame covers 4 video frames, and we want the center
154
+ tokens_per_latent_frame = tokens_per_frame * 4
155
+ half_tokens = int(tokens_per_latent_frame / 2)
156
+
157
+ pos_indices = []
158
+ for i in range(int((num_frames - 1) / 4) + 1):
159
+ if i == 0:
160
+ pos_indices.append(0)
161
+ else:
162
+ start_token = tokens_per_frame * ((i - 1) * 4 + 1)
163
+ end_token = tokens_per_frame * (i * 4 + 1)
164
+ center_token = int((start_token + end_token) / 2) - 1
165
+ pos_indices.append(center_token)
166
+
167
+ # Build index ranges centered around each position
168
+ pos_idx_ranges = [[idx - half_tokens, idx + half_tokens] for idx in pos_indices]
169
+
170
+ # Adjust the first range to avoid negative start index
171
+ pos_idx_ranges[0] = [
172
+ -(half_tokens * 2 - pos_idx_ranges[1][0]),
173
+ pos_idx_ranges[1][0],
174
+ ]
175
+
176
+ return pos_idx_ranges
177
+
178
+ def split_tensor_with_padding(self, input_tensor, pos_idx_ranges, expand_length=0):
179
+ """
180
+ Split the input tensor into subsequences based on index ranges, and apply right-side zero-padding
181
+ if the range exceeds the input boundaries.
182
+
183
+ Args:
184
+ input_tensor (Tensor): Input audio tensor of shape [1, L, 768].
185
+ pos_idx_ranges (list): A list of index ranges, e.g. [[-7, 1], [1, 9], ..., [165, 173]].
186
+ expand_length (int): Number of tokens to expand on both sides of each subsequence.
187
+
188
+ Returns:
189
+ sub_sequences (Tensor): A tensor of shape [1, F, L, 768], where L is the length after padding.
190
+ Each element is a padded subsequence.
191
+ k_lens (Tensor): A tensor of shape [F], representing the actual (unpadded) length of each subsequence.
192
+ Useful for ignoring padding tokens in attention masks.
193
+ """
194
+ pos_idx_ranges = [
195
+ [idx[0] - expand_length, idx[1] + expand_length] for idx in pos_idx_ranges
196
+ ]
197
+ sub_sequences = []
198
+ seq_len = input_tensor.size(1) # 173
199
+ max_valid_idx = seq_len - 1 # 172
200
+ k_lens_list = []
201
+ for start, end in pos_idx_ranges:
202
+ # Calculate the padding needed on each side
203
+ pad_front = max(-start, 0)
204
+ pad_back = max(end - max_valid_idx, 0)
205
+
206
+ # Calculate the start and end indices of the valid part
207
+ valid_start = max(start, 0)
208
+ valid_end = min(end, max_valid_idx)
209
+
210
+ # Extract the valid part
211
+ if valid_start <= valid_end:
212
+ valid_part = input_tensor[:, valid_start : valid_end + 1, :]
213
+ else:
214
+ valid_part = input_tensor.new_zeros((1, 0, input_tensor.size(2)))
215
+
216
+ # Pad along the sequence dimension (dim 1)
217
+ padded_subseq = F.pad(
218
+ valid_part,
219
+ (0, 0, 0, pad_back + pad_front, 0, 0),
220
+ mode="constant",
221
+ value=0,
222
+ )
223
+ k_lens_list.append(padded_subseq.size(-2) - pad_back - pad_front)
224
+
225
+ sub_sequences.append(padded_subseq)
226
+ return torch.stack(sub_sequences, dim=1), torch.tensor(
227
+ k_lens_list, dtype=torch.long
228
+ )
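
A standalone re-derivation of the index arithmetic in split_audio_sequence above, using the docstring's example of a 173-token audio sequence and 81 video frames; it mirrors the method rather than calling it, so no model weights are needed:

audio_proj_length, num_frames = 173, 81
tokens_per_frame = audio_proj_length / num_frames      # ~2.14 audio tokens per video frame
half_tokens = int(tokens_per_frame * 4 / 2)            # each latent frame covers 4 video frames

pos_indices = [0]
for i in range(1, (num_frames - 1) // 4 + 1):
    start = tokens_per_frame * ((i - 1) * 4 + 1)
    end = tokens_per_frame * (i * 4 + 1)
    pos_indices.append(int((start + end) / 2) - 1)

ranges = [[idx - half_tokens, idx + half_tokens] for idx in pos_indices]
ranges[0] = [-(half_tokens * 2 - ranges[1][0]), ranges[1][0]]
print(len(ranges), ranges[:2])  # 21 ranges; the first is [-7, 1] and its negative part gets zero-padded later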
FantasyTalking/requirements.txt ADDED
@@ -0,0 +1,14 @@
+ torch>=2.0.0
+ torchvision
+ cupy-cuda12x
+ transformers==4.46.2
+ controlnet-aux==0.0.7
+ imageio
+ imageio[ffmpeg]
+ safetensors
+ einops
+ sentencepiece
+ protobuf
+ modelscope
+ ftfy
+ librosa
FantasyTalking/utils.py ADDED
@@ -0,0 +1,52 @@
+ # Copyright Alibaba Inc. All Rights Reserved.
+
+ import imageio
+ import librosa
+ import numpy as np
+ import torch
+ from PIL import Image
+ from tqdm import tqdm
+
+
+ def resize_image_by_longest_edge(image_path, target_size):
+     image = Image.open(image_path).convert("RGB")
+     width, height = image.size
+     scale = target_size / max(width, height)
+     new_size = (int(width * scale), int(height * scale))
+     return image.resize(new_size, Image.LANCZOS)
+
+
+ def save_video(frames, save_path, fps, quality=9, ffmpeg_params=None):
+     writer = imageio.get_writer(
+         save_path, fps=fps, quality=quality, ffmpeg_params=ffmpeg_params
+     )
+     for frame in tqdm(frames, desc="Saving video"):
+         frame = np.array(frame)
+         writer.append_data(frame)
+     writer.close()
+
+
+ def get_audio_features(wav2vec, audio_processor, audio_path, fps, num_frames):
+     sr = 16000
+     audio_input, sample_rate = librosa.load(audio_path, sr=sr)  # resample the audio to 16 kHz
+
+     start_time = 0
+     # end_time = (0 + (num_frames - 1) * 1) / fps
+     end_time = num_frames / fps
+
+     start_sample = int(start_time * sr)
+     end_sample = int(end_time * sr)
+
+     try:
+         audio_segment = audio_input[start_sample:end_sample]
+     except Exception:  # fall back to the full clip if slicing fails
+         audio_segment = audio_input
+
+     input_values = audio_processor(
+         audio_segment, sampling_rate=sample_rate, return_tensors="pt"
+     ).input_values.to("cuda")
+
+     with torch.no_grad():
+         fea = wav2vec(input_values).last_hidden_state
+
+     return fea