LeoXing1996 committed
Commit a001281
0 Parent(s):

init repo for fg

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. .gitignore +44 -0
  2. README.md +10 -0
  3. __assets__/image_animation/magnitude/1.mp4 +0 -0
  4. __assets__/image_animation/magnitude/2.mp4 +0 -0
  5. __assets__/image_animation/magnitude/3.mp4 +0 -0
  6. __assets__/image_animation/magnitude/genshin/3.mp4 +0 -0
  7. __assets__/image_animation/rcnz/1.mp4 +0 -0
  8. __assets__/image_animation/rcnz/2.mp4 +0 -0
  9. __assets__/image_animation/rcnz/3.mp4 +0 -0
  10. __assets__/image_animation/real/1.mp4 +0 -0
  11. __assets__/image_animation/real/2.mp4 +0 -0
  12. __assets__/image_animation/real/3.mp4 +0 -0
  13. __assets__/image_animation/style_transfer/anya/2.mp4 +0 -0
  14. __assets__/image_animation/yanhong/yanhong.mp4 +0 -0
  15. __assets__/image_animation/yanhong/yanhong.png +0 -0
  16. __assets__/image_animation/yiming/yiming.jpeg +0 -0
  17. __assets__/image_animation/yiming/yiming.mp4 +0 -0
  18. animatediff/data/dataset.py +128 -0
  19. animatediff/data/dataset_web.py +205 -0
  20. animatediff/data/video_transformer.py +407 -0
  21. animatediff/models/__init__.py +0 -0
  22. animatediff/models/attention.py +559 -0
  23. animatediff/models/motion_module.py +555 -0
  24. animatediff/models/resnet.py +197 -0
  25. animatediff/models/unet.py +572 -0
  26. animatediff/models/unet_blocks.py +733 -0
  27. animatediff/pipelines/__init__.py +5 -0
  28. animatediff/pipelines/i2v_pipeline.py +775 -0
  29. animatediff/pipelines/pipeline_animation.py +446 -0
  30. animatediff/pipelines/validation_pipeline.py +504 -0
  31. animatediff/utils/convert_from_ckpt.py +964 -0
  32. animatediff/utils/convert_lora_safetensor_to_diffusers.py +208 -0
  33. animatediff/utils/util.py +255 -0
  34. app-counterfeit-only.py +441 -0
  35. app-huggingface.py +525 -0
  36. app.py +567 -0
  37. benchmark.py +47 -0
  38. configs/indomain/base.yaml +14 -0
  39. configs/indomain/real.yaml +45 -0
  40. configs/inference/inference.yaml +26 -0
  41. configs/prompts/1-ToonYou.yaml +22 -0
  42. configs/prompts/1.yaml +20 -0
  43. configs/prompts/2-Lyriel.yaml +22 -0
  44. configs/prompts/3-RcnzCartoon.yaml +22 -0
  45. configs/prompts/4-MajicMix.yaml +22 -0
  46. configs/prompts/5-RealisticVision.yaml +22 -0
  47. configs/prompts/6-Tusun.yaml +20 -0
  48. configs/prompts/7-FilmVelvia.yaml +23 -0
  49. configs/prompts/8-GhibliBackground.yaml +20 -0
  50. configs/training/image_finetune.yaml +48 -0
.gitignore ADDED
@@ -0,0 +1,44 @@
+ *.pkl
+ *.pt
+ *.mov
+ *.pth
+ *.json
+ *.mov
+ *.npz
+ *.npy
+ *.boj
+ *.onnx
+ *.tar
+ *.bin
+ cache*
+ batch*
+ *.jpg
+ *.png
+ *.mp4
+ *.gif
+ *.ckpt
+ *.safetensors
+ *.zip
+ *.csv
+
+ **/__pycache__/
+ samples/
+ wandb/
+ outputs/
+
+ !pia.png
+ .DS_Store
+ !__assets__/image_animation/magnitude/1.mp4
+ !__assets__/image_animation/magnitude/2.mp4
+ !__assets__/image_animation/magnitude/3.mp4
+ !__assets__/image_animation/style_transfer/anya/2.mp4
+ !__assets__/image_animation/magnitude/genshin/3.mp4
+ !__assets__/image_animation/rcnz/1.mp4
+ !__assets__/image_animation/rcnz/2.mp4
+ !__assets__/image_animation/rcnz/3.mp4
+ !__assets__/image_animation/real/1.mp4
+ !__assets__/image_animation/real/2.mp4
+ !__assets__/image_animation/real/3.mp4
+ !__assets__/image_animation/yiming/yiming.mp4
+ !__assets__/image_animation/yanhong/yanhong.mp4
+ !__assets__/image_animation/yanhong/yanhong.png
README.md ADDED
@@ -0,0 +1,10 @@
+ ---
+ title: Demo Space
+ emoji: 🤗
+ colorFrom: yellow
+ colorTo: orange
+ sdk: gradio
+ sdk_version: 4.7.1
+ app_file: app-huggingface.py
+ pinned: false
+ ---
__assets__/image_animation/magnitude/1.mp4 ADDED
Binary file (217 kB).
__assets__/image_animation/magnitude/2.mp4 ADDED
Binary file (201 kB).
__assets__/image_animation/magnitude/3.mp4 ADDED
Binary file (230 kB).
__assets__/image_animation/magnitude/genshin/3.mp4 ADDED
Binary file (247 kB).
__assets__/image_animation/rcnz/1.mp4 ADDED
Binary file (187 kB).
__assets__/image_animation/rcnz/2.mp4 ADDED
Binary file (241 kB).
__assets__/image_animation/rcnz/3.mp4 ADDED
Binary file (182 kB).
__assets__/image_animation/real/1.mp4 ADDED
Binary file (194 kB).
__assets__/image_animation/real/2.mp4 ADDED
Binary file (172 kB).
__assets__/image_animation/real/3.mp4 ADDED
Binary file (355 kB).
__assets__/image_animation/style_transfer/anya/2.mp4 ADDED
Binary file (106 kB).
__assets__/image_animation/yanhong/yanhong.mp4 ADDED
Binary file (221 kB).
__assets__/image_animation/yanhong/yanhong.png ADDED
__assets__/image_animation/yiming/yiming.jpeg ADDED
__assets__/image_animation/yiming/yiming.mp4 ADDED
Binary file (97.7 kB).
animatediff/data/dataset.py ADDED
@@ -0,0 +1,128 @@
+ import os, io, csv, math, random
+ import numpy as np
+ from einops import rearrange
+ from decord import VideoReader
+
+ import torch
+ import torchvision.transforms as transforms
+ from torch.utils.data.dataset import Dataset
+ from animatediff.utils.util import zero_rank_print, detect_edges
+ import cv2
+
+ def get_score(video_data,
+               cond_frame_idx,
+               weight=[1.0, 1.0, 1.0, 1.0],
+               use_edge=True):
+     """Compute per-frame HSV (+ edge) difference scores against the condition frame.
+
+     Similar to get_score / detect_edges under utils/util.py.
+     video_data is an np.ndarray of shape (f, h, w, c).
+     """
+     h, w = video_data.shape[1], video_data.shape[2]
+
+     cond_frame = video_data[cond_frame_idx]
+     cond_hsv_list = list(
+         cv2.split(
+             cv2.cvtColor(cond_frame.astype(np.float32), cv2.COLOR_RGB2HSV)))
+
+     if use_edge:
+         cond_frame_lum = cond_hsv_list[-1]
+         cond_frame_edge = detect_edges(cond_frame_lum.astype(np.uint8))
+         cond_hsv_list.append(cond_frame_edge)
+
+     score_sum = []
+
+     for frame_idx in range(video_data.shape[0]):
+         frame = video_data[frame_idx]
+         hsv_list = list(
+             cv2.split(cv2.cvtColor(frame.astype(np.float32),
+                                    cv2.COLOR_RGB2HSV)))
+
+         if use_edge:
+             frame_img_lum = hsv_list[-1]
+             frame_img_edge = detect_edges(lum=frame_img_lum.astype(np.uint8))
+             hsv_list.append(frame_img_edge)
+
+         hsv_diff = [
+             np.abs(hsv_list[c] - cond_hsv_list[c]) for c in range(len(weight))
+         ]
+         hsv_mse = [np.sum(hsv_diff[c]) * weight[c] for c in range(len(weight))]
+         score_sum.append(sum(hsv_mse) / (h * w) / (sum(weight)))
+
+     return score_sum
+
+ class WebVid10M(Dataset):
+     def __init__(
+         self,
+         csv_path, video_folder,
+         sample_size=256, sample_stride=4, sample_n_frames=16,
+         is_image=False,
+     ):
+         zero_rank_print(f"loading annotations from {csv_path} ...")
+         with open(csv_path, 'r') as csvfile:
+             self.dataset = list(csv.DictReader(csvfile))
+         self.length = len(self.dataset)
+         zero_rank_print(f"data scale: {self.length}")
+
+         self.video_folder = video_folder
+         self.sample_stride = sample_stride
+         self.sample_n_frames = sample_n_frames
+         self.is_image = is_image
+
+         sample_size = tuple(sample_size) if not isinstance(sample_size, int) else (sample_size, sample_size)
+         self.pixel_transforms = transforms.Compose([
+             transforms.RandomHorizontalFlip(),
+             transforms.Resize(sample_size[0]),
+             transforms.CenterCrop(sample_size),
+             transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
+         ])
+
+     def get_batch(self, idx):
+         video_dict = self.dataset[idx]
+         videoid, name, page_dir = video_dict['videoid'], video_dict['name'], video_dict['page_dir']
+
+         video_dir = os.path.join(self.video_folder, f"{videoid}.mp4")
+         video_reader = VideoReader(video_dir)
+         video_length = len(video_reader)
+         total_frames = len(video_reader)
+         clip_length = min(video_length, (self.sample_n_frames - 1) * self.sample_stride + 1)
+         start_idx = random.randint(0, video_length - clip_length)
+         # NOTE: batch_index is computed but not used below; only a single random frame is decoded.
+         batch_index = np.linspace(start_idx, start_idx + clip_length - 1, self.sample_n_frames, dtype=int)
+
+         frame_indice = [random.randint(0, total_frames - 1)]
+         pixel_values_np = video_reader.get_batch(frame_indice).asnumpy()
+         cond_frames = random.randint(0, self.sample_n_frames - 1)
+
+         # f h w c -> f c h w
+         pixel_values = torch.from_numpy(pixel_values_np).permute(0, 3, 1, 2).contiguous()
+         pixel_values = pixel_values / 255.
+         del video_reader
+
+         if self.is_image:
+             pixel_values = pixel_values[0]
+
+         return pixel_values, name, cond_frames, videoid
+
+     def __len__(self):
+         return self.length
+
+     def __getitem__(self, idx):
+         while True:
+             try:
+                 video, name, cond_frames, videoid = self.get_batch(idx)
+                 break
+             except Exception as e:
+                 # zero_rank_print(e)
+                 idx = random.randint(0, self.length - 1)
+
+         video = self.pixel_transforms(video)
+         video_ = video.clone().permute(0, 2, 3, 1).numpy() / 2 + 0.5
+         video_ = video_ * 255
+         # video_ = video_.astype(np.uint8)
+         score = get_score(video_, cond_frame_idx=cond_frames)
+         del video_
+         sample = dict(pixel_values=video, text=name, score=score, cond_frames=cond_frames, vid=videoid)
+         return sample
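Not part of the commit: for readers skimming the diff, the snippet below is a minimal, self-contained sketch of the per-frame difference score that get_score above computes, with the optional edge channel disabled (detect_edges lives in animatediff/utils/util.py and is not shown in this view). Frame layout and normalization follow the code above; the random clip and the helper name hsv_score are only illustrative stand-ins.

# Sketch of the HSV-difference scoring used by get_score above, edge channel disabled.
import numpy as np
import cv2

def hsv_score(video, cond_idx, weight=(1.0, 1.0, 1.0)):
    # video: (f, h, w, c) RGB frames in [0, 255]; returns one score per frame.
    f, h, w, _ = video.shape
    cond_hsv = cv2.split(cv2.cvtColor(video[cond_idx].astype(np.float32), cv2.COLOR_RGB2HSV))
    scores = []
    for i in range(f):
        hsv = cv2.split(cv2.cvtColor(video[i].astype(np.float32), cv2.COLOR_RGB2HSV))
        diff = [np.abs(hsv[c] - cond_hsv[c]).sum() * weight[c] for c in range(len(weight))]
        scores.append(sum(diff) / (h * w) / sum(weight))
    return scores

clip = (np.random.rand(16, 64, 64, 3) * 255).astype(np.float32)  # fake 16-frame clip
print(hsv_score(clip, cond_idx=0))  # the condition frame scores ~0 against itself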
animatediff/data/dataset_web.py ADDED
@@ -0,0 +1,205 @@
+ import decord
+ import cv2
+
+ import os, io, csv, torch, math, random
+ from typing import Optional
+ from einops import rearrange
+ import numpy as np
+ from decord import VideoReader
+ from petrel_client.client import Client
+ from torch.utils.data.dataset import Dataset
+ import torchvision.transforms as transforms
+ from torch.utils.data.distributed import DistributedSampler
+
+ import animatediff.data.video_transformer as video_transforms
+ from animatediff.utils.util import zero_rank_print, detect_edges, prepare_mask_coef_by_score
+
+
+ def get_score(video_data,
+               cond_frame_idx,
+               weight=[1.0, 1.0, 1.0, 1.0],
+               use_edge=True):
+     """Compute per-frame HSV (+ edge) difference scores against the condition frame.
+
+     Similar to get_score / detect_edges under utils/util.py.
+     video_data is an np.ndarray of shape (f, h, w, c).
+     """
+     h, w = video_data.shape[1], video_data.shape[2]
+
+     cond_frame = video_data[cond_frame_idx]
+     cond_hsv_list = list(
+         cv2.split(
+             cv2.cvtColor(cond_frame.astype(np.float32), cv2.COLOR_RGB2HSV)))
+
+     if use_edge:
+         cond_frame_lum = cond_hsv_list[-1]
+         cond_frame_edge = detect_edges(cond_frame_lum.astype(np.uint8))
+         cond_hsv_list.append(cond_frame_edge)
+
+     score_sum = []
+
+     for frame_idx in range(video_data.shape[0]):
+         frame = video_data[frame_idx]
+         hsv_list = list(
+             cv2.split(cv2.cvtColor(frame.astype(np.float32),
+                                    cv2.COLOR_RGB2HSV)))
+
+         if use_edge:
+             frame_img_lum = hsv_list[-1]
+             frame_img_edge = detect_edges(lum=frame_img_lum.astype(np.uint8))
+             hsv_list.append(frame_img_edge)
+
+         hsv_diff = [
+             np.abs(hsv_list[c] - cond_hsv_list[c]) for c in range(len(weight))
+         ]
+         hsv_mse = [np.sum(hsv_diff[c]) * weight[c] for c in range(len(weight))]
+         score_sum.append(sum(hsv_mse) / (h * w) / (sum(weight)))
+
+     return score_sum
+
+
+ class WebVid10M(Dataset):
+     def __init__(
+         self,
+         csv_path,
+         sample_n_frames, sample_stride,
+         sample_size=[320, 512],
+         conf_path="~/petreloss.conf",
+         static_video=False,
+         is_image=False,
+     ):
+         zero_rank_print("initializing ceph client ...")
+         self._client = Client(conf_path=conf_path, enable_mc=True)
+         self.sample_n_frames = sample_n_frames
+         self.sample_stride = sample_stride
+         self.temporal_sampler = video_transforms.TemporalRandomCrop(sample_n_frames * sample_stride)
+         self.static_video = static_video
+         self.is_image = is_image
+
+         zero_rank_print(f"(~1 mins) loading annotations from {csv_path} ...")
+         with open(csv_path, 'r') as csvfile:
+             self.dataset = list(csv.DictReader(csvfile))
+         self.length = len(self.dataset)
+         zero_rank_print(f"data scale: {self.length}")
+
+         sample_size = tuple(sample_size) if not isinstance(sample_size, int) else (sample_size, sample_size)
+         self.pixel_transforms = transforms.Compose([
+             transforms.RandomHorizontalFlip(),
+             transforms.Resize(sample_size[0]),
+             transforms.CenterCrop(sample_size),
+             transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
+         ])
+
+     def get_batch(self, idx):
+         video_dict = self.dataset[idx]
+         videoid, name, page_dir = video_dict['videoid'], video_dict['name'], video_dict['page_dir']
+         ceph_dir = f"webvideo:s3://WebVid10M/{page_dir}/{videoid}.mp4"
+
+         video_bytes = self._client.Get(ceph_dir)
+         video_bytes = io.BytesIO(video_bytes)
+
+         # ensure not reading zero byte
+         assert video_bytes.getbuffer().nbytes != 0
+
+         video_reader = VideoReader(video_bytes)
+         total_frames = len(video_reader)
+
+         if not self.is_image:
+             if self.static_video:
+                 frame_indice = random.randint(0, total_frames - 1)
+                 frame_indice = np.linspace(frame_indice, frame_indice, self.sample_n_frames, dtype=int)
+             else:
+                 start_frame_ind, end_frame_ind = self.temporal_sampler(total_frames)
+                 assert end_frame_ind - start_frame_ind >= self.sample_n_frames
+                 frame_indice = np.linspace(start_frame_ind, end_frame_ind - 1, self.sample_n_frames, dtype=int)
+         else:
+             frame_indice = [random.randint(0, total_frames - 1)]
+
+         pixel_values_np = video_reader.get_batch(frame_indice).asnumpy()
+
+         cond_frames = random.randint(0, self.sample_n_frames - 1)
+
+         # f h w c -> f c h w
+         pixel_values = torch.from_numpy(pixel_values_np).permute(0, 3, 1, 2).contiguous()
+         pixel_values = pixel_values / 255.
+         del video_reader
+
+         if self.is_image:
+             pixel_values = pixel_values[0]
+
+         return pixel_values, name, cond_frames, videoid
+
+     def __len__(self):
+         return self.length
+
+     def __getitem__(self, idx):
+         while True:
+             try:
+                 video, name, cond_frames, videoid = self.get_batch(idx)
+                 break
+             except Exception as e:
+                 # zero_rank_print(e)
+                 idx = random.randint(0, self.length - 1)
+
+         video = self.pixel_transforms(video)
+         video_ = video.clone().permute(0, 2, 3, 1).numpy() / 2 + 0.5
+         video_ = video_ * 255
+         # video_ = video_.astype(np.uint8)
+         score = get_score(video_, cond_frame_idx=cond_frames)
+         del video_
+         sample = dict(pixel_values=video, text=name, score=score, cond_frames=cond_frames, vid=videoid)
+         return sample
+
+
+ if __name__ == "__main__":
+     dataset = WebVid10M(
+         csv_path="results_10M_train.csv",
+         sample_size=(320, 512),
+         sample_n_frames=16,
+         sample_stride=4,
+         static_video=False,
+         is_image=False,
+     )
+
+     distributed_sampler = DistributedSampler(
+         dataset,
+         num_replicas=1,
+         rank=0,
+         shuffle=True,
+         seed=5,
+     )
+     batch_size = 1
+     dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, num_workers=0, sampler=distributed_sampler)
+
+     STATISTIC = [[0., 0.],
+                  [0.3535855, 24.23687346],
+                  [0.91609545, 30.65091947],
+                  [1.41165152, 34.40093286],
+                  [1.56943881, 36.99639585],
+                  [1.73182842, 39.42044163],
+                  [1.82733002, 40.94703526],
+                  [1.88060527, 42.66233244],
+                  [1.96208071, 43.73070788],
+                  [2.02723091, 44.25965378],
+                  [2.10820894, 45.66120213],
+                  [2.21115041, 46.29561324],
+                  [2.23412351, 47.08810863],
+                  [2.29430165, 47.9515062],
+                  [2.32986362, 48.69085638],
+                  [2.37310751, 49.19931439]]
+
+     for idx, batch in enumerate(dataloader):
+         pixel_values, texts, vid = batch['pixel_values'], batch['text'], batch['vid']
+         pixel_values = (pixel_values.clone()) / 2. + 0.5
+         pixel_values *= 255
+         # condition-frame index for every sample in the batch (here simply frame 0)
+         cond_frames = [0] * batch_size
+         score = prepare_mask_coef_by_score(pixel_values, cond_frames, statistic=STATISTIC)
+         print(f'num: {idx}, diff: {score}')
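Not part of the commit: the frame sampling in WebVid10M.get_batch above combines TemporalRandomCrop with np.linspace. Below is a standalone sketch of that index math under the same window/stride convention; the helper name sample_frame_indices is hypothetical.

# Sketch of the evenly spaced frame-index sampling used above.
import random
import numpy as np

def sample_frame_indices(total_frames, sample_n_frames=16, sample_stride=4):
    window = sample_n_frames * sample_stride          # temporal crop length
    rand_end = max(0, total_frames - window - 1)
    start = random.randint(0, rand_end)               # random window start
    end = min(start + window, total_frames)
    # sample_n_frames evenly spaced indices inside the window
    return np.linspace(start, end - 1, sample_n_frames, dtype=int)

print(sample_frame_indices(total_frames=300))  # e.g. 16 indices spanning a 64-frame window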
animatediff/data/video_transformer.py ADDED
@@ -0,0 +1,407 @@
1
+ import torch
2
+ import random
3
+ import numbers
4
+ from torchvision.transforms import RandomCrop, RandomResizedCrop
5
+
6
+ def _is_tensor_video_clip(clip):
7
+ if not torch.is_tensor(clip):
8
+ raise TypeError("clip should be Tensor. Got %s" % type(clip))
9
+
10
+ if not clip.ndimension() == 4:
11
+ raise ValueError("clip should be 4D. Got %dD" % clip.dim())
12
+
13
+ return True
14
+
15
+
16
+ def crop(clip, i, j, h, w):
17
+ """
18
+ Args:
19
+ clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
20
+ """
21
+ if len(clip.size()) != 4:
22
+ raise ValueError("clip should be a 4D tensor")
23
+ return clip[..., i : i + h, j : j + w]
24
+
25
+
26
+ def resize(clip, target_size, interpolation_mode):
27
+ if len(target_size) != 2:
28
+ raise ValueError(f"target size should be tuple (height, width), instead got {target_size}")
29
+ return torch.nn.functional.interpolate(clip, size=target_size, mode=interpolation_mode, align_corners=False)
30
+
31
+ def resize_scale(clip, target_size, interpolation_mode):
32
+ if len(target_size) != 2:
33
+ raise ValueError(f"target size should be tuple (height, width), instead got {target_size}")
34
+ _, _, H, W = clip.shape
35
+ scale_ = target_size[0] / min(H, W)
36
+ return torch.nn.functional.interpolate(clip, scale_factor=scale_, mode=interpolation_mode, align_corners=False)
37
+
38
+
39
+ def resized_crop(clip, i, j, h, w, size, interpolation_mode="bilinear"):
40
+ """
41
+ Do spatial cropping and resizing to the video clip
42
+ Args:
43
+ clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
44
+ i (int): i in (i,j) i.e coordinates of the upper left corner.
45
+ j (int): j in (i,j) i.e coordinates of the upper left corner.
46
+ h (int): Height of the cropped region.
47
+ w (int): Width of the cropped region.
48
+ size (tuple(int, int)): height and width of resized clip
49
+ Returns:
50
+ clip (torch.tensor): Resized and cropped clip. Size is (T, C, H, W)
51
+ """
52
+ if not _is_tensor_video_clip(clip):
53
+ raise ValueError("clip should be a 4D torch.tensor")
54
+ clip = crop(clip, i, j, h, w)
55
+ clip = resize(clip, size, interpolation_mode)
56
+ return clip
57
+
58
+
59
+ def center_crop(clip, crop_size):
60
+ if not _is_tensor_video_clip(clip):
61
+ raise ValueError("clip should be a 4D torch.tensor")
62
+ h, w = clip.size(-2), clip.size(-1)
63
+ th, tw = crop_size
64
+ if h < th or w < tw:
65
+ raise ValueError("height and width must be no smaller than crop_size")
66
+
67
+ i = int(round((h - th) / 2.0))
68
+ j = int(round((w - tw) / 2.0))
69
+ return crop(clip, i, j, th, tw)
70
+
71
+ def random_shift_crop(clip):
72
+ '''
73
+ Slide along the long edge, with the short edge as crop size
74
+ '''
75
+ if not _is_tensor_video_clip(clip):
76
+ raise ValueError("clip should be a 4D torch.tensor")
77
+ h, w = clip.size(-2), clip.size(-1)
78
+
79
+ if h <= w:
80
+ long_edge = w
81
+ short_edge = h
82
+ else:
83
+ long_edge = h
84
+ short_edge = w
85
+
86
+ th, tw = short_edge, short_edge
87
+
88
+ i = torch.randint(0, h - th + 1, size=(1,)).item()
89
+ j = torch.randint(0, w - tw + 1, size=(1,)).item()
90
+ return crop(clip, i, j, th, tw)
91
+
92
+
93
+ def to_tensor(clip):
94
+ """
95
+ Convert tensor data type from uint8 to float, divide value by 255.0 and
96
+ permute the dimensions of clip tensor
97
+ Args:
98
+ clip (torch.tensor, dtype=torch.uint8): Size is (T, C, H, W)
99
+ Return:
100
+ clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W)
101
+ """
102
+ _is_tensor_video_clip(clip)
103
+ if not clip.dtype == torch.uint8:
104
+ raise TypeError("clip tensor should have data type uint8. Got %s" % str(clip.dtype))
105
+ # return clip.float().permute(3, 0, 1, 2) / 255.0
106
+ return clip.float() / 255.0
107
+
108
+
109
+ def normalize(clip, mean, std, inplace=False):
110
+ """
111
+ Args:
112
+ clip (torch.tensor): Video clip to be normalized. Size is (T, C, H, W)
113
+ mean (tuple): pixel RGB mean. Size is (3)
114
+ std (tuple): pixel standard deviation. Size is (3)
115
+ Returns:
116
+ normalized clip (torch.tensor): Size is (T, C, H, W)
117
+ """
118
+ if not _is_tensor_video_clip(clip):
119
+ raise ValueError("clip should be a 4D torch.tensor")
120
+ if not inplace:
121
+ clip = clip.clone()
122
+ mean = torch.as_tensor(mean, dtype=clip.dtype, device=clip.device)
123
+ # print(mean)  # leftover debug print, disabled
124
+ std = torch.as_tensor(std, dtype=clip.dtype, device=clip.device)
125
+ clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None])
126
+ return clip
127
+
128
+
129
+ def hflip(clip):
130
+ """
131
+ Args:
132
+ clip (torch.tensor): Video clip to be normalized. Size is (T, C, H, W)
133
+ Returns:
134
+ flipped clip (torch.tensor): Size is (T, C, H, W)
135
+ """
136
+ if not _is_tensor_video_clip(clip):
137
+ raise ValueError("clip should be a 4D torch.tensor")
138
+ return clip.flip(-1)
139
+
140
+
141
+ class RandomCropVideo:
142
+ def __init__(self, size):
143
+ if isinstance(size, numbers.Number):
144
+ self.size = (int(size), int(size))
145
+ else:
146
+ self.size = size
147
+
148
+ def __call__(self, clip):
149
+ """
150
+ Args:
151
+ clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
152
+ Returns:
153
+ torch.tensor: randomly cropped video clip.
154
+ size is (T, C, OH, OW)
155
+ """
156
+ i, j, h, w = self.get_params(clip)
157
+ return crop(clip, i, j, h, w)
158
+
159
+ def get_params(self, clip):
160
+ h, w = clip.shape[-2:]
161
+ th, tw = self.size
162
+
163
+ if h < th or w < tw:
164
+ raise ValueError(f"Required crop size {(th, tw)} is larger than input image size {(h, w)}")
165
+
166
+ if w == tw and h == th:
167
+ return 0, 0, h, w
168
+
169
+ i = torch.randint(0, h - th + 1, size=(1,)).item()
170
+ j = torch.randint(0, w - tw + 1, size=(1,)).item()
171
+
172
+ return i, j, th, tw
173
+
174
+ def __repr__(self) -> str:
175
+ return f"{self.__class__.__name__}(size={self.size})"
176
+
177
+
178
+ class UCFCenterCropVideo:
179
+ def __init__(
180
+ self,
181
+ size,
182
+ interpolation_mode="bilinear",
183
+ ):
184
+ if isinstance(size, tuple):
185
+ if len(size) != 2:
186
+ raise ValueError(f"size should be tuple (height, width), instead got {size}")
187
+ self.size = size
188
+ else:
189
+ self.size = (size, size)
190
+
191
+ self.interpolation_mode = interpolation_mode
192
+
193
+
194
+ def __call__(self, clip):
195
+ """
196
+ Args:
197
+ clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
198
+ Returns:
199
+ torch.tensor: scale resized / center cropped video clip.
200
+ size is (T, C, crop_size, crop_size)
201
+ """
202
+ clip_resize = resize_scale(clip=clip, target_size=self.size, interpolation_mode=self.interpolation_mode)
203
+ clip_center_crop = center_crop(clip_resize, self.size)
204
+ return clip_center_crop
205
+
206
+ def __repr__(self) -> str:
207
+ return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode})"
208
+
209
+ class KineticsRandomCropResizeVideo:
210
+ '''
211
+ Slide along the long edge, with the short edge as crop size, and resize to the desired size.
212
+ '''
213
+ def __init__(
214
+ self,
215
+ size,
216
+ interpolation_mode="bilinear",
217
+ ):
218
+ if isinstance(size, tuple):
219
+ if len(size) != 2:
220
+ raise ValueError(f"size should be tuple (height, width), instead got {size}")
221
+ self.size = size
222
+ else:
223
+ self.size = (size, size)
224
+
225
+ self.interpolation_mode = interpolation_mode
226
+
227
+ def __call__(self, clip):
228
+ clip_random_crop = random_shift_crop(clip)
229
+ clip_resize = resize(clip_random_crop, self.size, self.interpolation_mode)
230
+ return clip_resize
231
+
232
+
233
+ class CenterCropVideo:
234
+ def __init__(
235
+ self,
236
+ size,
237
+ interpolation_mode="bilinear",
238
+ ):
239
+ if isinstance(size, tuple):
240
+ if len(size) != 2:
241
+ raise ValueError(f"size should be tuple (height, width), instead got {size}")
242
+ self.size = size
243
+ else:
244
+ self.size = (size, size)
245
+
246
+ self.interpolation_mode = interpolation_mode
247
+
248
+
249
+ def __call__(self, clip):
250
+ """
251
+ Args:
252
+ clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
253
+ Returns:
254
+ torch.tensor: center cropped video clip.
255
+ size is (T, C, crop_size, crop_size)
256
+ """
257
+ clip_center_crop = center_crop(clip, self.size)
258
+ return clip_center_crop
259
+
260
+ def __repr__(self) -> str:
261
+ return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode})"
262
+
263
+
264
+ class NormalizeVideo:
265
+ """
266
+ Normalize the video clip by mean subtraction and division by standard deviation
267
+ Args:
268
+ mean (3-tuple): pixel RGB mean
269
+ std (3-tuple): pixel RGB standard deviation
270
+ inplace (boolean): whether do in-place normalization
271
+ """
272
+
273
+ def __init__(self, mean, std, inplace=False):
274
+ self.mean = mean
275
+ self.std = std
276
+ self.inplace = inplace
277
+
278
+ def __call__(self, clip):
279
+ """
280
+ Args:
281
+ clip (torch.tensor): video clip must be normalized. Size is (C, T, H, W)
282
+ """
283
+ return normalize(clip, self.mean, self.std, self.inplace)
284
+
285
+ def __repr__(self) -> str:
286
+ return f"{self.__class__.__name__}(mean={self.mean}, std={self.std}, inplace={self.inplace})"
287
+
288
+
289
+ class ToTensorVideo:
290
+ """
291
+ Convert tensor data type from uint8 to float, divide value by 255.0 and
292
+ permute the dimensions of clip tensor
293
+ """
294
+
295
+ def __init__(self):
296
+ pass
297
+
298
+ def __call__(self, clip):
299
+ """
300
+ Args:
301
+ clip (torch.tensor, dtype=torch.uint8): Size is (T, C, H, W)
302
+ Return:
303
+ clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W)
304
+ """
305
+ return to_tensor(clip)
306
+
307
+ def __repr__(self) -> str:
308
+ return self.__class__.__name__
309
+
310
+
311
+ class RandomHorizontalFlipVideo:
312
+ """
313
+ Flip the video clip along the horizontal direction with a given probability
314
+ Args:
315
+ p (float): probability of the clip being flipped. Default value is 0.5
316
+ """
317
+
318
+ def __init__(self, p=0.5):
319
+ self.p = p
320
+
321
+ def __call__(self, clip):
322
+ """
323
+ Args:
324
+ clip (torch.tensor): Size is (T, C, H, W)
325
+ Return:
326
+ clip (torch.tensor): Size is (T, C, H, W)
327
+ """
328
+ if random.random() < self.p:
329
+ clip = hflip(clip)
330
+ return clip
331
+
332
+ def __repr__(self) -> str:
333
+ return f"{self.__class__.__name__}(p={self.p})"
334
+
335
+ # ------------------------------------------------------------
336
+ # --------------------- Sampling ---------------------------
337
+ # ------------------------------------------------------------
338
+ class TemporalRandomCrop(object):
339
+ """Temporally crop the given frame indices at a random location.
340
+
341
+ Args:
342
+ size (int): Desired length of frames will be seen in the model.
343
+ """
344
+
345
+ def __init__(self, size):
346
+ self.size = size
347
+
348
+ def __call__(self, total_frames):
349
+ rand_end = max(0, total_frames - self.size - 1)
350
+ begin_index = random.randint(0, rand_end)
351
+ end_index = min(begin_index + self.size, total_frames)
352
+ return begin_index, end_index
353
+
354
+
355
+ if __name__ == '__main__':
356
+ from torchvision import transforms
357
+ import torchvision.io as io
358
+ import numpy as np
359
+ from torchvision.utils import save_image
360
+ import os
361
+
362
+ vframes, aframes, info = io.read_video(
363
+ filename='./v_Archery_g01_c03.avi',
364
+ pts_unit='sec',
365
+ output_format='TCHW'
366
+ )
367
+
368
+ trans = transforms.Compose([
369
+ ToTensorVideo(),
370
+ RandomHorizontalFlipVideo(),
371
+ UCFCenterCropVideo(512),
372
+ # NormalizeVideo(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
373
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
374
+ ])
375
+
376
+ target_video_len = 32
377
+ frame_interval = 1
378
+ total_frames = len(vframes)
379
+ print(total_frames)
380
+
381
+ temporal_sample = TemporalRandomCrop(target_video_len * frame_interval)
382
+
383
+
384
+ # Sampling video frames
385
+ start_frame_ind, end_frame_ind = temporal_sample(total_frames)
386
+ # print(start_frame_ind)
387
+ # print(end_frame_ind)
388
+ assert end_frame_ind - start_frame_ind >= target_video_len
389
+ frame_indice = np.linspace(start_frame_ind, end_frame_ind - 1, target_video_len, dtype=int)
390
+ print(frame_indice)
391
+
392
+ select_vframes = vframes[frame_indice]
393
+ print(select_vframes.shape)
394
+ print(select_vframes.dtype)
395
+
396
+ select_vframes_trans = trans(select_vframes)
397
+ print(select_vframes_trans.shape)
398
+ print(select_vframes_trans.dtype)
399
+
400
+ select_vframes_trans_int = ((select_vframes_trans * 0.5 + 0.5) * 255).to(dtype=torch.uint8)
401
+ print(select_vframes_trans_int.dtype)
402
+ print(select_vframes_trans_int.permute(0, 2, 3, 1).shape)
403
+
404
+ io.write_video('./test.avi', select_vframes_trans_int.permute(0, 2, 3, 1), fps=8)
405
+
406
+ for i in range(target_video_len):
407
+ save_image(select_vframes_trans[i], os.path.join('./test000', '%04d.png' % i), normalize=True, value_range=(-1, 1))
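Not part of the commit: a usage sketch for the clip transforms defined above, assuming this repository is importable as the animatediff package laid out in this commit. It mirrors the composition in the file's own __main__ block, but runs on a random uint8 clip instead of a decoded video.

# Compose the video transforms above on a fake (T, C, H, W) uint8 clip.
import torch
from torchvision import transforms
from animatediff.data.video_transformer import (
    ToTensorVideo, RandomHorizontalFlipVideo, UCFCenterCropVideo)

clip = torch.randint(0, 256, (16, 3, 320, 512), dtype=torch.uint8)  # 16 frames

pipeline = transforms.Compose([
    ToTensorVideo(),                   # uint8 [0, 255] -> float [0, 1]
    RandomHorizontalFlipVideo(p=0.5),
    UCFCenterCropVideo(256),           # scale shorter side, then center-crop to 256x256
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
])
out = pipeline(clip)
print(out.shape, out.dtype)  # torch.Size([16, 3, 256, 256]) torch.float32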
animatediff/models/__init__.py ADDED
File without changes
animatediff/models/attention.py ADDED
@@ -0,0 +1,559 @@
1
+ # Adapted from https://github.com/guoyww/AnimateDiff
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Optional
5
+
6
+ import torch
7
+ import torch.nn.functional as F
8
+ from torch import nn
9
+
10
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
11
+ from diffusers.models import ModelMixin
12
+ from diffusers.models.attention import Attention
13
+ from diffusers.utils import BaseOutput
14
+ from diffusers.utils.import_utils import is_xformers_available
15
+ from diffusers.models.attention import FeedForward, AdaLayerNorm
16
+
17
+ from einops import rearrange, repeat
18
+ import pdb
19
+
20
+ @dataclass
21
+ class Transformer3DModelOutput(BaseOutput):
22
+ sample: torch.FloatTensor
23
+
24
+
25
+ if is_xformers_available():
26
+ import xformers
27
+ import xformers.ops
28
+ else:
29
+ xformers = None
30
+
31
+
32
+ class Transformer3DModel(ModelMixin, ConfigMixin):
33
+ @register_to_config
34
+ def __init__(
35
+ self,
36
+ num_attention_heads: int = 16,
37
+ attention_head_dim: int = 88,
38
+ in_channels: Optional[int] = None,
39
+ num_layers: int = 1,
40
+ dropout: float = 0.0,
41
+ norm_num_groups: int = 32,
42
+ cross_attention_dim: Optional[int] = None,
43
+ attention_bias: bool = False,
44
+ activation_fn: str = "geglu",
45
+ num_embeds_ada_norm: Optional[int] = None,
46
+ use_linear_projection: bool = False,
47
+ only_cross_attention: bool = False,
48
+ upcast_attention: bool = False,
49
+
50
+ unet_use_cross_frame_attention=None,
51
+ unet_use_temporal_attention=None,
52
+ ):
53
+ super().__init__()
54
+ self.use_linear_projection = use_linear_projection
55
+ self.num_attention_heads = num_attention_heads
56
+ self.attention_head_dim = attention_head_dim
57
+ inner_dim = num_attention_heads * attention_head_dim
58
+
59
+ # Define input layers
60
+ self.in_channels = in_channels
61
+
62
+ self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
63
+ if use_linear_projection:
64
+ self.proj_in = nn.Linear(in_channels, inner_dim)
65
+ else:
66
+ self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
67
+
68
+ # Define transformers blocks
69
+ self.transformer_blocks = nn.ModuleList(
70
+ [
71
+ BasicTransformerBlock(
72
+ inner_dim,
73
+ num_attention_heads,
74
+ attention_head_dim,
75
+ dropout=dropout,
76
+ cross_attention_dim=cross_attention_dim,
77
+ activation_fn=activation_fn,
78
+ num_embeds_ada_norm=num_embeds_ada_norm,
79
+ attention_bias=attention_bias,
80
+ only_cross_attention=only_cross_attention,
81
+ upcast_attention=upcast_attention,
82
+
83
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
84
+ unet_use_temporal_attention=unet_use_temporal_attention,
85
+ )
86
+ for d in range(num_layers)
87
+ ]
88
+ )
89
+
90
+ # 4. Define output layers
91
+ if use_linear_projection:
92
+ self.proj_out = nn.Linear(in_channels, inner_dim)
93
+ else:
94
+ self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
95
+
96
+ def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, return_dict: bool = True):
97
+ # Input
98
+ assert hidden_states.dim() == 5, f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}."
99
+ video_length = hidden_states.shape[2]
100
+ hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
101
+ encoder_hidden_states = repeat(encoder_hidden_states, 'b n c -> (b f) n c', f=video_length)
102
+
103
+ batch, channel, height, weight = hidden_states.shape
104
+ residual = hidden_states
105
+
106
+ hidden_states = self.norm(hidden_states)
107
+ if not self.use_linear_projection:
108
+ hidden_states = self.proj_in(hidden_states)
109
+ inner_dim = hidden_states.shape[1]
110
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
111
+ else:
112
+ inner_dim = hidden_states.shape[1]
113
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
114
+ hidden_states = self.proj_in(hidden_states)
115
+
116
+ # Blocks
117
+ for block in self.transformer_blocks:
118
+ hidden_states = block(
119
+ hidden_states,
120
+ encoder_hidden_states=encoder_hidden_states,
121
+ timestep=timestep,
122
+ video_length=video_length
123
+ )
124
+
125
+ # Output
126
+ if not self.use_linear_projection:
127
+ hidden_states = (
128
+ hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous()
129
+ )
130
+ hidden_states = self.proj_out(hidden_states)
131
+ else:
132
+ hidden_states = self.proj_out(hidden_states)
133
+ hidden_states = (
134
+ hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous()
135
+ )
136
+
137
+ output = hidden_states + residual
138
+
139
+ output = rearrange(output, "(b f) c h w -> b c f h w", f=video_length)
140
+ if not return_dict:
141
+ return (output,)
142
+
143
+ return Transformer3DModelOutput(sample=output)
144
+
145
+
146
+ class BasicTransformerBlock(nn.Module):
147
+ def __init__(
148
+ self,
149
+ dim: int,
150
+ num_attention_heads: int,
151
+ attention_head_dim: int,
152
+ dropout=0.0,
153
+ cross_attention_dim: Optional[int] = None,
154
+ activation_fn: str = "geglu",
155
+ num_embeds_ada_norm: Optional[int] = None,
156
+ attention_bias: bool = False,
157
+ only_cross_attention: bool = False,
158
+ upcast_attention: bool = False,
159
+
160
+ unet_use_cross_frame_attention = None,
161
+ unet_use_temporal_attention = None,
162
+ ):
163
+ super().__init__()
164
+ self.only_cross_attention = only_cross_attention
165
+ self.use_ada_layer_norm = num_embeds_ada_norm is not None
166
+ self.unet_use_cross_frame_attention = unet_use_cross_frame_attention
167
+ self.unet_use_temporal_attention = unet_use_temporal_attention
168
+
169
+ # SC-Attn
170
+ assert unet_use_cross_frame_attention is not None
171
+ if unet_use_cross_frame_attention:
172
+ self.attn1 = SparseCausalAttention(
173
+ query_dim=dim,
174
+ heads=num_attention_heads,
175
+ dim_head=attention_head_dim,
176
+ dropout=dropout,
177
+ bias=attention_bias,
178
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
179
+ upcast_attention=upcast_attention,
180
+ )
181
+ else:
182
+ self.attn1 = Attention(
183
+ query_dim=dim,
184
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
185
+ heads=num_attention_heads,
186
+ dim_head=attention_head_dim,
187
+ dropout=dropout,
188
+ bias=attention_bias,
189
+ upcast_attention=upcast_attention,
190
+ )
191
+ self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)
192
+
193
+ # Cross-Attn
194
+ if cross_attention_dim is not None:
195
+ self.attn2 = Attention(
196
+ query_dim=dim,
197
+ cross_attention_dim=cross_attention_dim,
198
+ heads=num_attention_heads,
199
+ dim_head=attention_head_dim,
200
+ dropout=dropout,
201
+ bias=attention_bias,
202
+ upcast_attention=upcast_attention,
203
+ )
204
+ else:
205
+ self.attn2 = None
206
+
207
+ if cross_attention_dim is not None:
208
+ self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)
209
+ else:
210
+ self.norm2 = None
211
+
212
+ # Feed-forward
213
+ self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)
214
+ self.norm3 = nn.LayerNorm(dim)
215
+
216
+ # Temp-Attn
217
+ assert unet_use_temporal_attention is not None
218
+ if unet_use_temporal_attention:
219
+ self.attn_temp = Attention(
220
+ query_dim=dim,
221
+ heads=num_attention_heads,
222
+ dim_head=attention_head_dim,
223
+ dropout=dropout,
224
+ bias=attention_bias,
225
+ upcast_attention=upcast_attention,
226
+ )
227
+ nn.init.zeros_(self.attn_temp.to_out[0].weight.data)
228
+ self.norm_temp = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)
229
+
230
+ def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, attention_mask=None, video_length=None):
231
+ # SparseCausal-Attention
232
+ norm_hidden_states = (
233
+ self.norm1(hidden_states, timestep) if self.use_ada_layer_norm else self.norm1(hidden_states)
234
+ )
235
+
236
+ # if self.only_cross_attention:
237
+ # hidden_states = (
238
+ # self.attn1(norm_hidden_states, encoder_hidden_states, attention_mask=attention_mask) + hidden_states
239
+ # )
240
+ # else:
241
+ # hidden_states = self.attn1(norm_hidden_states, attention_mask=attention_mask, video_length=video_length) + hidden_states
242
+
243
+ # pdb.set_trace()
244
+ if self.unet_use_cross_frame_attention:
245
+ hidden_states = self.attn1(norm_hidden_states, attention_mask=attention_mask, video_length=video_length) + hidden_states
246
+ else:
247
+ hidden_states = self.attn1(norm_hidden_states, attention_mask=attention_mask) + hidden_states
248
+
249
+ if self.attn2 is not None:
250
+ # Cross-Attention
251
+ norm_hidden_states = (
252
+ self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
253
+ )
254
+ hidden_states = (
255
+ self.attn2(
256
+ norm_hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask
257
+ )
258
+ + hidden_states
259
+ )
260
+
261
+ # Feed-forward
262
+ hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states
263
+
264
+ # Temporal-Attention
265
+ if self.unet_use_temporal_attention:
266
+ d = hidden_states.shape[1]
267
+ hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=video_length)
268
+ norm_hidden_states = (
269
+ self.norm_temp(hidden_states, timestep) if self.use_ada_layer_norm else self.norm_temp(hidden_states)
270
+ )
271
+ hidden_states = self.attn_temp(norm_hidden_states) + hidden_states
272
+ hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=d)
273
+
274
+ return hidden_states
275
+
276
+ class CrossAttention(nn.Module):
277
+ r"""
278
+ A cross attention layer.
279
+
280
+ Parameters:
281
+ query_dim (`int`): The number of channels in the query.
282
+ cross_attention_dim (`int`, *optional*):
283
+ The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`.
284
+ heads (`int`, *optional*, defaults to 8): The number of heads to use for multi-head attention.
285
+ dim_head (`int`, *optional*, defaults to 64): The number of channels in each head.
286
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
287
+ bias (`bool`, *optional*, defaults to False):
288
+ Set to `True` for the query, key, and value linear layers to contain a bias parameter.
289
+ """
290
+
291
+ def __init__(
292
+ self,
293
+ query_dim: int,
294
+ cross_attention_dim: Optional[int] = None,
295
+ heads: int = 8,
296
+ dim_head: int = 64,
297
+ dropout: float = 0.0,
298
+ bias=False,
299
+ upcast_attention: bool = False,
300
+ upcast_softmax: bool = False,
301
+ added_kv_proj_dim: Optional[int] = None,
302
+ norm_num_groups: Optional[int] = None,
303
+ ):
304
+ super().__init__()
305
+ inner_dim = dim_head * heads
306
+ cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
307
+ self.upcast_attention = upcast_attention
308
+ self.upcast_softmax = upcast_softmax
309
+
310
+ self.scale = dim_head**-0.5
311
+
312
+ self.heads = heads
313
+ # for slice_size > 0 the attention score computation
314
+ # is split across the batch axis to save memory
315
+ # You can set slice_size with `set_attention_slice`
316
+ self.sliceable_head_dim = heads
317
+ self._slice_size = None
318
+ self._use_memory_efficient_attention_xformers = False
319
+ self.added_kv_proj_dim = added_kv_proj_dim
320
+
321
+ if norm_num_groups is not None:
322
+ self.group_norm = nn.GroupNorm(num_channels=inner_dim, num_groups=norm_num_groups, eps=1e-5, affine=True)
323
+ else:
324
+ self.group_norm = None
325
+
326
+ self.to_q = nn.Linear(query_dim, inner_dim, bias=bias)
327
+ self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
328
+ self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
329
+
330
+ if self.added_kv_proj_dim is not None:
331
+ self.add_k_proj = nn.Linear(added_kv_proj_dim, cross_attention_dim)
332
+ self.add_v_proj = nn.Linear(added_kv_proj_dim, cross_attention_dim)
333
+
334
+ self.to_out = nn.ModuleList([])
335
+ self.to_out.append(nn.Linear(inner_dim, query_dim))
336
+ self.to_out.append(nn.Dropout(dropout))
337
+
338
+ def reshape_heads_to_batch_dim(self, tensor):
339
+ batch_size, seq_len, dim = tensor.shape
340
+ head_size = self.heads
341
+ tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
342
+ tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size * head_size, seq_len, dim // head_size)
343
+ return tensor
344
+
345
+ def reshape_batch_dim_to_heads(self, tensor):
346
+ batch_size, seq_len, dim = tensor.shape
347
+ head_size = self.heads
348
+ tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
349
+ tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size)
350
+ return tensor
351
+
352
+ def set_attention_slice(self, slice_size):
353
+ if slice_size is not None and slice_size > self.sliceable_head_dim:
354
+ raise ValueError(f"slice_size {slice_size} has to be smaller or equal to {self.sliceable_head_dim}.")
355
+
356
+ self._slice_size = slice_size
357
+
358
+ def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None):
359
+ batch_size, sequence_length, _ = hidden_states.shape
360
+
361
+ encoder_hidden_states = encoder_hidden_states
362
+
363
+ if self.group_norm is not None:
364
+ hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
365
+
366
+ query = self.to_q(hidden_states)
367
+ dim = query.shape[-1]
368
+ query = self.reshape_heads_to_batch_dim(query)
369
+
370
+ if self.added_kv_proj_dim is not None:
371
+ key = self.to_k(hidden_states)
372
+ value = self.to_v(hidden_states)
373
+ encoder_hidden_states_key_proj = self.add_k_proj(encoder_hidden_states)
374
+ encoder_hidden_states_value_proj = self.add_v_proj(encoder_hidden_states)
375
+
376
+ key = self.reshape_heads_to_batch_dim(key)
377
+ value = self.reshape_heads_to_batch_dim(value)
378
+ encoder_hidden_states_key_proj = self.reshape_heads_to_batch_dim(encoder_hidden_states_key_proj)
379
+ encoder_hidden_states_value_proj = self.reshape_heads_to_batch_dim(encoder_hidden_states_value_proj)
380
+
381
+ key = torch.concat([encoder_hidden_states_key_proj, key], dim=1)
382
+ value = torch.concat([encoder_hidden_states_value_proj, value], dim=1)
383
+ else:
384
+ encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
385
+ key = self.to_k(encoder_hidden_states)
386
+ value = self.to_v(encoder_hidden_states)
387
+
388
+ key = self.reshape_heads_to_batch_dim(key)
389
+ value = self.reshape_heads_to_batch_dim(value)
390
+
391
+ if attention_mask is not None:
392
+ if attention_mask.shape[-1] != query.shape[1]:
393
+ target_length = query.shape[1]
394
+ attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
395
+ attention_mask = attention_mask.repeat_interleave(self.heads, dim=0)
396
+
397
+ # attention, what we cannot get enough of
398
+ if self._use_memory_efficient_attention_xformers:
399
+ hidden_states = self._memory_efficient_attention_xformers(query, key, value, attention_mask)
400
+ # Some versions of xformers return output in fp32, cast it back to the dtype of the input
401
+ hidden_states = hidden_states.to(query.dtype)
402
+ else:
403
+ if self._slice_size is None or query.shape[0] // self._slice_size == 1:
404
+ hidden_states = self._attention(query, key, value, attention_mask)
405
+ else:
406
+ hidden_states = self._sliced_attention(query, key, value, sequence_length, dim, attention_mask)
407
+
408
+ # linear proj
409
+ hidden_states = self.to_out[0](hidden_states)
410
+
411
+ # dropout
412
+ hidden_states = self.to_out[1](hidden_states)
413
+ return hidden_states
414
+
415
+ def _attention(self, query, key, value, attention_mask=None):
416
+ if self.upcast_attention:
417
+ query = query.float()
418
+ key = key.float()
419
+
420
+ attention_scores = torch.baddbmm(
421
+ torch.empty(query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device),
422
+ query,
423
+ key.transpose(-1, -2),
424
+ beta=0,
425
+ alpha=self.scale,
426
+ )
427
+
428
+ if attention_mask is not None:
429
+ attention_scores = attention_scores + attention_mask
430
+
431
+ if self.upcast_softmax:
432
+ attention_scores = attention_scores.float()
433
+
434
+ attention_probs = attention_scores.softmax(dim=-1)
435
+
436
+ # cast back to the original dtype
437
+ attention_probs = attention_probs.to(value.dtype)
438
+
439
+ # compute attention output
440
+ hidden_states = torch.bmm(attention_probs, value)
441
+
442
+ # reshape hidden_states
443
+ hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
444
+ return hidden_states
445
+
446
+ def _sliced_attention(self, query, key, value, sequence_length, dim, attention_mask):
447
+ batch_size_attention = query.shape[0]
448
+ hidden_states = torch.zeros(
449
+ (batch_size_attention, sequence_length, dim // self.heads), device=query.device, dtype=query.dtype
450
+ )
451
+ slice_size = self._slice_size if self._slice_size is not None else hidden_states.shape[0]
452
+ for i in range(hidden_states.shape[0] // slice_size):
453
+ start_idx = i * slice_size
454
+ end_idx = (i + 1) * slice_size
455
+
456
+ query_slice = query[start_idx:end_idx]
457
+ key_slice = key[start_idx:end_idx]
458
+
459
+ if self.upcast_attention:
460
+ query_slice = query_slice.float()
461
+ key_slice = key_slice.float()
462
+
463
+ attn_slice = torch.baddbmm(
464
+ torch.empty(slice_size, query.shape[1], key.shape[1], dtype=query_slice.dtype, device=query.device),
465
+ query_slice,
466
+ key_slice.transpose(-1, -2),
467
+ beta=0,
468
+ alpha=self.scale,
469
+ )
470
+
471
+ if attention_mask is not None:
472
+ attn_slice = attn_slice + attention_mask[start_idx:end_idx]
473
+
474
+ if self.upcast_softmax:
475
+ attn_slice = attn_slice.float()
476
+
477
+ attn_slice = attn_slice.softmax(dim=-1)
478
+
479
+ # cast back to the original dtype
480
+ attn_slice = attn_slice.to(value.dtype)
481
+ attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx])
482
+
483
+ hidden_states[start_idx:end_idx] = attn_slice
484
+
485
+ # reshape hidden_states
486
+ hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
487
+ return hidden_states
488
+
489
+ def _memory_efficient_attention_xformers(self, query, key, value, attention_mask):
490
+ # TODO attention_mask
491
+ query = query.contiguous()
492
+ key = key.contiguous()
493
+ value = value.contiguous()
494
+ hidden_states = xformers.ops.memory_efficient_attention(query, key, value, attn_bias=attention_mask)
495
+ hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
496
+ return hidden_states
497
+
498
+
499
+
500
+ class SparseCausalAttention(CrossAttention):
501
+ def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, video_length=None):
502
+ batch_size, sequence_length, _ = hidden_states.shape
503
+
504
+ encoder_hidden_states = encoder_hidden_states
505
+
506
+ if self.group_norm is not None:
507
+ hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
508
+
509
+ query = self.to_q(hidden_states)
510
+ dim = query.shape[-1]
511
+ query = self.reshape_heads_to_batch_dim(query)
512
+
513
+ if self.added_kv_proj_dim is not None:
514
+ raise NotImplementedError
515
+
516
+ encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
517
+ key = self.to_k(encoder_hidden_states)
518
+ value = self.to_v(encoder_hidden_states)
519
+
520
+ former_frame_index = torch.arange(video_length) - 1
521
+ former_frame_index[0] = 0
522
+
523
+ key = rearrange(key, "(b f) d c -> b f d c", f=video_length)
524
+ #key = torch.cat([key[:, [0] * video_length], key[:, [0] * video_length]], dim=2)
525
+ key = key[:, [0] * video_length]
526
+ key = rearrange(key, "b f d c -> (b f) d c")
527
+
528
+ value = rearrange(value, "(b f) d c -> b f d c", f=video_length)
529
+ #value = torch.cat([value[:, [0] * video_length], value[:, [0] * video_length]], dim=2)
530
+ #value = value[:, former_frame_index]
531
+ value = rearrange(value, "b f d c -> (b f) d c")
532
+
533
+ key = self.reshape_heads_to_batch_dim(key)
534
+ value = self.reshape_heads_to_batch_dim(value)
535
+
536
+ if attention_mask is not None:
537
+ if attention_mask.shape[-1] != query.shape[1]:
538
+ target_length = query.shape[1]
539
+ attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
540
+ attention_mask = attention_mask.repeat_interleave(self.heads, dim=0)
541
+
542
+ # attention, what we cannot get enough of
543
+ if self._use_memory_efficient_attention_xformers:
544
+ hidden_states = self._memory_efficient_attention_xformers(query, key, value, attention_mask)
545
+ # Some versions of xformers return output in fp32, cast it back to the dtype of the input
546
+ hidden_states = hidden_states.to(query.dtype)
547
+ else:
548
+ if self._slice_size is None or query.shape[0] // self._slice_size == 1:
549
+ hidden_states = self._attention(query, key, value, attention_mask)
550
+ else:
551
+ hidden_states = self._sliced_attention(query, key, value, sequence_length, dim, attention_mask)
552
+
553
+ # linear proj
554
+ hidden_states = self.to_out[0](hidden_states)
555
+
556
+ # dropout
557
+ hidden_states = self.to_out[1](hidden_states)
558
+ return hidden_states
559
+
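Not part of the commit: a shape-check sketch for Transformer3DModel above, assuming the repository and its pinned diffusers version (which must still expose Attention, AdaLayerNorm and FeedForward under diffusers.models.attention) are importable as in this commit. The two unet_use_* flags must be explicit booleans because BasicTransformerBlock asserts they are not None; all sizes below are illustrative.

# Run a 5-D (b, c, f, h, w) latent through the spatial transformer defined above.
import torch
from animatediff.models.attention import Transformer3DModel

model = Transformer3DModel(
    num_attention_heads=8,
    attention_head_dim=40,            # inner_dim = 8 * 40 = 320
    in_channels=320,
    num_layers=1,
    cross_attention_dim=768,          # e.g. CLIP text-embedding width
    unet_use_cross_frame_attention=False,
    unet_use_temporal_attention=False,
)

hidden_states = torch.randn(1, 320, 8, 32, 32)     # (b, c, f, h, w)
encoder_hidden_states = torch.randn(1, 77, 768)    # (b, tokens, cross_attention_dim)
out = model(hidden_states, encoder_hidden_states=encoder_hidden_states)
print(out.sample.shape)  # torch.Size([1, 320, 8, 32, 32])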
animatediff/models/motion_module.py ADDED
@@ -0,0 +1,555 @@
1
+ # Adapted from https://github.com/guoyww/AnimateDiff
2
+ from dataclasses import dataclass
3
+ from typing import List, Optional, Tuple, Union
4
+
5
+ import torch
6
+ import numpy as np
7
+ import torch.nn.functional as F
8
+ from torch import nn
9
+ import torchvision
10
+
11
+ from diffusers.utils import BaseOutput
12
+ from diffusers.utils.import_utils import is_xformers_available
13
+ from diffusers.models.attention import FeedForward
14
+
15
+ from einops import rearrange, repeat
16
+ import math
17
+
18
+
19
+ def zero_module(module):
20
+ # Zero out the parameters of a module and return it.
21
+ for p in module.parameters():
22
+ p.detach().zero_()
23
+ return module
24
+
25
+
26
+ @dataclass
27
+ class TemporalTransformer3DModelOutput(BaseOutput):
28
+ sample: torch.FloatTensor
29
+
30
+
31
+ if is_xformers_available():
32
+ import xformers
33
+ import xformers.ops
34
+ else:
35
+ xformers = None
36
+
37
+
38
+ def get_motion_module(
39
+ in_channels,
40
+ motion_module_type: str,
41
+ motion_module_kwargs: dict
42
+ ):
43
+ if motion_module_type == "Vanilla":
44
+ return VanillaTemporalModule(in_channels=in_channels, **motion_module_kwargs,)
45
+ else:
46
+ raise ValueError
47
+
48
+
49
+ class VanillaTemporalModule(nn.Module):
50
+ def __init__(
51
+ self,
52
+ in_channels,
53
+ num_attention_heads = 8,
54
+ num_transformer_block = 2,
55
+ attention_block_types = ("Temporal_Self", "Temporal_Self"),
56
+ cross_frame_attention_mode = None,
57
+ temporal_position_encoding = False,
58
+ temporal_position_encoding_max_len = 32,
59
+ temporal_attention_dim_div = 1,
60
+ zero_initialize = True,
61
+ ):
62
+ super().__init__()
63
+
64
+ self.temporal_transformer = TemporalTransformer3DModel(
65
+ in_channels=in_channels,
66
+ num_attention_heads=num_attention_heads,
67
+ attention_head_dim=in_channels // num_attention_heads // temporal_attention_dim_div,
68
+ num_layers=num_transformer_block,
69
+ attention_block_types=attention_block_types,
70
+ cross_frame_attention_mode=cross_frame_attention_mode,
71
+ temporal_position_encoding=temporal_position_encoding,
72
+ temporal_position_encoding_max_len=temporal_position_encoding_max_len,
73
+ )
74
+
75
+ if zero_initialize:
76
+ self.temporal_transformer.proj_out = zero_module(self.temporal_transformer.proj_out)
77
+
78
+ def forward(self, input_tensor, temb, encoder_hidden_states, attention_mask=None, anchor_frame_idx=None):
79
+ hidden_states = input_tensor
80
+ hidden_states = self.temporal_transformer(hidden_states, encoder_hidden_states, attention_mask)
81
+
82
+ output = hidden_states
83
+ return output
84
+
85
+
86
+ class TemporalTransformer3DModel(nn.Module):
87
+ def __init__(
88
+ self,
89
+ in_channels,
90
+ num_attention_heads,
91
+ attention_head_dim,
92
+
93
+ num_layers,
94
+ attention_block_types = ( "Temporal_Self", "Temporal_Self", ),
95
+ dropout = 0.0,
96
+ norm_num_groups = 32,
97
+ cross_attention_dim = 1280,
98
+ activation_fn = "geglu",
99
+ attention_bias = False,
100
+ upcast_attention = False,
101
+
102
+ cross_frame_attention_mode = None,
103
+ temporal_position_encoding = False,
104
+ temporal_position_encoding_max_len = 32,
105
+ ):
106
+ super().__init__()
107
+
108
+ inner_dim = num_attention_heads * attention_head_dim
109
+
110
+ self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
111
+ self.proj_in = nn.Linear(in_channels, inner_dim)
112
+
113
+ self.transformer_blocks = nn.ModuleList(
114
+ [
115
+ TemporalTransformerBlock(
116
+ dim=inner_dim,
117
+ num_attention_heads=num_attention_heads,
118
+ attention_head_dim=attention_head_dim,
119
+ attention_block_types=attention_block_types,
120
+ dropout=dropout,
121
+ norm_num_groups=norm_num_groups,
122
+ cross_attention_dim=cross_attention_dim,
123
+ activation_fn=activation_fn,
124
+ attention_bias=attention_bias,
125
+ upcast_attention=upcast_attention,
126
+ cross_frame_attention_mode=cross_frame_attention_mode,
127
+ temporal_position_encoding=temporal_position_encoding,
128
+ temporal_position_encoding_max_len=temporal_position_encoding_max_len,
129
+ )
130
+ for d in range(num_layers)
131
+ ]
132
+ )
133
+ self.proj_out = nn.Linear(inner_dim, in_channels)
134
+
135
+ def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None):
136
+ assert hidden_states.dim() == 5, f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}."
137
+ video_length = hidden_states.shape[2]
138
+ hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
139
+
140
+ batch, channel, height, weight = hidden_states.shape
141
+ residual = hidden_states
142
+
143
+ hidden_states = self.norm(hidden_states)
144
+ inner_dim = hidden_states.shape[1]
145
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
146
+ hidden_states = self.proj_in(hidden_states)
147
+
148
+ # Transformer Blocks
149
+ for block in self.transformer_blocks:
150
+ hidden_states = block(hidden_states, encoder_hidden_states=encoder_hidden_states, video_length=video_length)
151
+
152
+ # output
153
+ hidden_states = self.proj_out(hidden_states)
154
+ hidden_states = hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous()
155
+
156
+ output = hidden_states + residual
157
+ output = rearrange(output, "(b f) c h w -> b c f h w", f=video_length)
158
+
159
+ return output
160
+
161
+
162
+ class TemporalTransformerBlock(nn.Module):
163
+ def __init__(
164
+ self,
165
+ dim,
166
+ num_attention_heads,
167
+ attention_head_dim,
168
+ attention_block_types = ( "Temporal_Self", "Temporal_Self", ),
169
+ dropout = 0.0,
170
+ norm_num_groups = 32,
171
+ cross_attention_dim = 768,
172
+ activation_fn = "geglu",
173
+ attention_bias = False,
174
+ upcast_attention = False,
175
+ cross_frame_attention_mode = None,
176
+ temporal_position_encoding = False,
177
+ temporal_position_encoding_max_len = 32,
178
+ ):
179
+ super().__init__()
180
+
181
+ attention_blocks = []
182
+ norms = []
183
+
184
+ for block_name in attention_block_types:
185
+ attention_blocks.append(
186
+ VersatileAttention(
187
+ attention_mode=block_name.split("_")[0],
188
+ cross_attention_dim=cross_attention_dim if block_name.endswith("_Cross") else None,
189
+
190
+ query_dim=dim,
191
+ heads=num_attention_heads,
192
+ dim_head=attention_head_dim,
193
+ dropout=dropout,
194
+ bias=attention_bias,
195
+ upcast_attention=upcast_attention,
196
+
197
+ cross_frame_attention_mode=cross_frame_attention_mode,
198
+ temporal_position_encoding=temporal_position_encoding,
199
+ temporal_position_encoding_max_len=temporal_position_encoding_max_len,
200
+ )
201
+ )
202
+ norms.append(nn.LayerNorm(dim))
203
+
204
+ self.attention_blocks = nn.ModuleList(attention_blocks)
205
+ self.norms = nn.ModuleList(norms)
206
+
207
+ self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)
208
+ self.ff_norm = nn.LayerNorm(dim)
209
+
210
+
211
+ def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, video_length=None):
212
+ for attention_block, norm in zip(self.attention_blocks, self.norms):
213
+ norm_hidden_states = norm(hidden_states)
214
+ hidden_states = attention_block(
215
+ norm_hidden_states,
216
+ encoder_hidden_states=encoder_hidden_states if attention_block.is_cross_attention else None,
217
+ video_length=video_length,
218
+ ) + hidden_states
219
+
220
+ hidden_states = self.ff(self.ff_norm(hidden_states)) + hidden_states
221
+
222
+ output = hidden_states
223
+ return output
224
+
225
+
226
+ class PositionalEncoding(nn.Module):
227
+ def __init__(
228
+ self,
229
+ d_model,
230
+ dropout = 0.,
231
+ max_len = 32
232
+ ):
233
+ super().__init__()
234
+ self.dropout = nn.Dropout(p=dropout)
235
+ position = torch.arange(max_len).unsqueeze(1)
236
+ div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
237
+ pe = torch.zeros(1, max_len, d_model)
238
+ pe[0, :, 0::2] = torch.sin(position * div_term)
239
+ pe[0, :, 1::2] = torch.cos(position * div_term)
240
+ self.register_buffer('pe', pe)
241
+
242
+ def forward(self, x):
243
+ x = x + self.pe[:, :x.size(1)]
244
+ return self.dropout(x)
245
+
246
+
247
+
248
+ class CrossAttention(nn.Module):
249
+ r"""
250
+ A cross attention layer.
251
+
252
+ Parameters:
253
+ query_dim (`int`): The number of channels in the query.
254
+ cross_attention_dim (`int`, *optional*):
255
+ The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`.
256
+ heads (`int`, *optional*, defaults to 8): The number of heads to use for multi-head attention.
257
+ dim_head (`int`, *optional*, defaults to 64): The number of channels in each head.
258
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
259
+ bias (`bool`, *optional*, defaults to False):
260
+ Set to `True` for the query, key, and value linear layers to contain a bias parameter.
261
+ """
262
+
263
+ def __init__(
264
+ self,
265
+ query_dim: int,
266
+ cross_attention_dim: Optional[int] = None,
267
+ heads: int = 8,
268
+ dim_head: int = 64,
269
+ dropout: float = 0.0,
270
+ bias=False,
271
+ upcast_attention: bool = False,
272
+ upcast_softmax: bool = False,
273
+ added_kv_proj_dim: Optional[int] = None,
274
+ norm_num_groups: Optional[int] = None,
275
+ ):
276
+ super().__init__()
277
+ inner_dim = dim_head * heads
278
+ cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
279
+ self.upcast_attention = upcast_attention
280
+ self.upcast_softmax = upcast_softmax
281
+
282
+ self.scale = dim_head**-0.5
283
+
284
+ self.heads = heads
285
+ # for slice_size > 0 the attention score computation
286
+ # is split across the batch axis to save memory
287
+ # You can set slice_size with `set_attention_slice`
288
+ self.sliceable_head_dim = heads
289
+ self._slice_size = None
290
+ self._use_memory_efficient_attention_xformers = False
291
+ self.added_kv_proj_dim = added_kv_proj_dim
292
+
293
+ if norm_num_groups is not None:
294
+ self.group_norm = nn.GroupNorm(num_channels=inner_dim, num_groups=norm_num_groups, eps=1e-5, affine=True)
295
+ else:
296
+ self.group_norm = None
297
+
298
+ self.to_q = nn.Linear(query_dim, inner_dim, bias=bias)
299
+ self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
300
+ self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
301
+
302
+ if self.added_kv_proj_dim is not None:
303
+ self.add_k_proj = nn.Linear(added_kv_proj_dim, cross_attention_dim)
304
+ self.add_v_proj = nn.Linear(added_kv_proj_dim, cross_attention_dim)
305
+
306
+ self.to_out = nn.ModuleList([])
307
+ self.to_out.append(nn.Linear(inner_dim, query_dim))
308
+ self.to_out.append(nn.Dropout(dropout))
309
+
310
+ def reshape_heads_to_batch_dim(self, tensor):
311
+ batch_size, seq_len, dim = tensor.shape
312
+ head_size = self.heads
313
+ tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
314
+ tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size * head_size, seq_len, dim // head_size)
315
+ return tensor
316
+
317
+ def reshape_batch_dim_to_heads(self, tensor):
318
+ batch_size, seq_len, dim = tensor.shape
319
+ head_size = self.heads
320
+ tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
321
+ tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size)
322
+ return tensor
323
+
324
+ def set_attention_slice(self, slice_size):
325
+ if slice_size is not None and slice_size > self.sliceable_head_dim:
326
+ raise ValueError(f"slice_size {slice_size} has to be smaller or equal to {self.sliceable_head_dim}.")
327
+
328
+ self._slice_size = slice_size
329
+
330
+ def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None):
331
+ batch_size, sequence_length, _ = hidden_states.shape
332
+
333
+ encoder_hidden_states = encoder_hidden_states
334
+
335
+ if self.group_norm is not None:
336
+ hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
337
+
338
+ query = self.to_q(hidden_states)
339
+ dim = query.shape[-1]
340
+ query = self.reshape_heads_to_batch_dim(query)
341
+
342
+ if self.added_kv_proj_dim is not None:
343
+ key = self.to_k(hidden_states)
344
+ value = self.to_v(hidden_states)
345
+ encoder_hidden_states_key_proj = self.add_k_proj(encoder_hidden_states)
346
+ encoder_hidden_states_value_proj = self.add_v_proj(encoder_hidden_states)
347
+
348
+ key = self.reshape_heads_to_batch_dim(key)
349
+ value = self.reshape_heads_to_batch_dim(value)
350
+ encoder_hidden_states_key_proj = self.reshape_heads_to_batch_dim(encoder_hidden_states_key_proj)
351
+ encoder_hidden_states_value_proj = self.reshape_heads_to_batch_dim(encoder_hidden_states_value_proj)
352
+
353
+ key = torch.concat([encoder_hidden_states_key_proj, key], dim=1)
354
+ value = torch.concat([encoder_hidden_states_value_proj, value], dim=1)
355
+ else:
356
+ encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
357
+ key = self.to_k(encoder_hidden_states)
358
+ value = self.to_v(encoder_hidden_states)
359
+
360
+ key = self.reshape_heads_to_batch_dim(key)
361
+ value = self.reshape_heads_to_batch_dim(value)
362
+
363
+ if attention_mask is not None:
364
+ if attention_mask.shape[-1] != query.shape[1]:
365
+ target_length = query.shape[1]
366
+ attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
367
+ attention_mask = attention_mask.repeat_interleave(self.heads, dim=0)
368
+
369
+ # attention, what we cannot get enough of
370
+ if self._use_memory_efficient_attention_xformers:
371
+ hidden_states = self._memory_efficient_attention_xformers(query, key, value, attention_mask)
372
+ # Some versions of xformers return output in fp32, cast it back to the dtype of the input
373
+ hidden_states = hidden_states.to(query.dtype)
374
+ else:
375
+ if self._slice_size is None or query.shape[0] // self._slice_size == 1:
376
+ hidden_states = self._attention(query, key, value, attention_mask)
377
+ else:
378
+ hidden_states = self._sliced_attention(query, key, value, sequence_length, dim, attention_mask)
379
+
380
+ # linear proj
381
+ hidden_states = self.to_out[0](hidden_states)
382
+
383
+ # dropout
384
+ hidden_states = self.to_out[1](hidden_states)
385
+ return hidden_states
386
+
387
+ def _attention(self, query, key, value, attention_mask=None):
388
+ if self.upcast_attention:
389
+ query = query.float()
390
+ key = key.float()
391
+
392
+ attention_scores = torch.baddbmm(
393
+ torch.empty(query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device),
394
+ query,
395
+ key.transpose(-1, -2),
396
+ beta=0,
397
+ alpha=self.scale,
398
+ )
399
+
400
+ if attention_mask is not None:
401
+ attention_scores = attention_scores + attention_mask
402
+
403
+ if self.upcast_softmax:
404
+ attention_scores = attention_scores.float()
405
+
406
+ attention_probs = attention_scores.softmax(dim=-1)
407
+
408
+ # cast back to the original dtype
409
+ attention_probs = attention_probs.to(value.dtype)
410
+
411
+ # compute attention output
412
+ hidden_states = torch.bmm(attention_probs, value)
413
+
414
+ # reshape hidden_states
415
+ hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
416
+ return hidden_states
417
+
418
+ def _sliced_attention(self, query, key, value, sequence_length, dim, attention_mask):
419
+ batch_size_attention = query.shape[0]
420
+ hidden_states = torch.zeros(
421
+ (batch_size_attention, sequence_length, dim // self.heads), device=query.device, dtype=query.dtype
422
+ )
423
+ slice_size = self._slice_size if self._slice_size is not None else hidden_states.shape[0]
424
+ for i in range(hidden_states.shape[0] // slice_size):
425
+ start_idx = i * slice_size
426
+ end_idx = (i + 1) * slice_size
427
+
428
+ query_slice = query[start_idx:end_idx]
429
+ key_slice = key[start_idx:end_idx]
430
+
431
+ if self.upcast_attention:
432
+ query_slice = query_slice.float()
433
+ key_slice = key_slice.float()
434
+
435
+ attn_slice = torch.baddbmm(
436
+ torch.empty(slice_size, query.shape[1], key.shape[1], dtype=query_slice.dtype, device=query.device),
437
+ query_slice,
438
+ key_slice.transpose(-1, -2),
439
+ beta=0,
440
+ alpha=self.scale,
441
+ )
442
+
443
+ if attention_mask is not None:
444
+ attn_slice = attn_slice + attention_mask[start_idx:end_idx]
445
+
446
+ if self.upcast_softmax:
447
+ attn_slice = attn_slice.float()
448
+
449
+ attn_slice = attn_slice.softmax(dim=-1)
450
+
451
+ # cast back to the original dtype
452
+ attn_slice = attn_slice.to(value.dtype)
453
+ attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx])
454
+
455
+ hidden_states[start_idx:end_idx] = attn_slice
456
+
457
+ # reshape hidden_states
458
+ hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
459
+ return hidden_states
460
+
461
+ def _memory_efficient_attention_xformers(self, query, key, value, attention_mask):
462
+ # TODO attention_mask
463
+ query = query.contiguous()
464
+ key = key.contiguous()
465
+ value = value.contiguous()
466
+ hidden_states = xformers.ops.memory_efficient_attention(query, key, value, attn_bias=attention_mask)
467
+ hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
468
+ return hidden_states
469
+
470
+
471
+ class VersatileAttention(CrossAttention):
472
+ def __init__(
473
+ self,
474
+ attention_mode = None,
475
+ cross_frame_attention_mode = None,
476
+ temporal_position_encoding = False,
477
+ temporal_position_encoding_max_len = 32,
478
+ *args, **kwargs
479
+ ):
480
+ super().__init__(*args, **kwargs)
481
+ assert attention_mode == "Temporal"
482
+
483
+ self.attention_mode = attention_mode
484
+ self.is_cross_attention = kwargs["cross_attention_dim"] is not None
485
+
486
+ self.pos_encoder = PositionalEncoding(
487
+ kwargs["query_dim"],
488
+ dropout=0.,
489
+ max_len=temporal_position_encoding_max_len
490
+ ) if (temporal_position_encoding and attention_mode == "Temporal") else None
491
+
492
+ def extra_repr(self):
493
+ return f"(Module Info) Attention_Mode: {self.attention_mode}, Is_Cross_Attention: {self.is_cross_attention}"
494
+
495
+ def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, video_length=None):
496
+ batch_size, sequence_length, _ = hidden_states.shape
497
+
498
+ if self.attention_mode == "Temporal":
499
+ d = hidden_states.shape[1]
500
+ hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=video_length)
501
+
502
+ if self.pos_encoder is not None:
503
+ hidden_states = self.pos_encoder(hidden_states)
504
+
505
+ encoder_hidden_states = repeat(encoder_hidden_states, "b n c -> (b d) n c", d=d) if encoder_hidden_states is not None else encoder_hidden_states
506
+ else:
507
+ raise NotImplementedError
508
+
509
+ encoder_hidden_states = encoder_hidden_states
510
+
511
+ if self.group_norm is not None:
512
+ hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
513
+
514
+ query = self.to_q(hidden_states)
515
+ dim = query.shape[-1]
516
+ query = self.reshape_heads_to_batch_dim(query)
517
+
518
+ if self.added_kv_proj_dim is not None:
519
+ raise NotImplementedError
520
+
521
+ encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
522
+ key = self.to_k(encoder_hidden_states)
523
+ value = self.to_v(encoder_hidden_states)
524
+
525
+ key = self.reshape_heads_to_batch_dim(key)
526
+ value = self.reshape_heads_to_batch_dim(value)
527
+
528
+ if attention_mask is not None:
529
+ if attention_mask.shape[-1] != query.shape[1]:
530
+ target_length = query.shape[1]
531
+ attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
532
+ attention_mask = attention_mask.repeat_interleave(self.heads, dim=0)
533
+
534
+ # attention, what we cannot get enough of
535
+ if self._use_memory_efficient_attention_xformers:
536
+ hidden_states = self._memory_efficient_attention_xformers(query, key, value, attention_mask)
537
+ # Some versions of xformers return output in fp32, cast it back to the dtype of the input
538
+ hidden_states = hidden_states.to(query.dtype)
539
+ else:
540
+ if self._slice_size is None or query.shape[0] // self._slice_size == 1:
541
+ hidden_states = self._attention(query, key, value, attention_mask)
542
+ else:
543
+ hidden_states = self._sliced_attention(query, key, value, sequence_length, dim, attention_mask)
544
+
545
+ # linear proj
546
+ hidden_states = self.to_out[0](hidden_states)
547
+
548
+ # dropout
549
+ hidden_states = self.to_out[1](hidden_states)
550
+
551
+ if self.attention_mode == "Temporal":
552
+ hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=d)
553
+
554
+ return hidden_states
555
+
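End of motion_module.py. The key move in `VersatileAttention` above is the `(b f) d c -> (b d) f c` rearrange: spatial positions are folded into the batch so self-attention runs along the frame axis, with the sinusoidal table from `PositionalEncoding` (max_len 32) added per frame index. A small sketch of just that reshaping and encoding; the tensor sizes are illustrative, not values from this commit's configs.

# Sketch of the temporal reshape + sinusoidal position table (illustrative sizes).
import math
import torch
from einops import rearrange

batch, frames, tokens, channels = 2, 16, 64, 320

hidden_states = torch.randn(batch * frames, tokens, channels)        # (b f) d c
temporal = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=frames)

# same construction as PositionalEncoding above, max_len=32
position = torch.arange(32).unsqueeze(1)
div_term = torch.exp(torch.arange(0, channels, 2) * (-math.log(10000.0) / channels))
pe = torch.zeros(1, 32, channels)
pe[0, :, 0::2] = torch.sin(position * div_term)
pe[0, :, 1::2] = torch.cos(position * div_term)

temporal = temporal + pe[:, :frames]            # every spatial location sees frame indices 0..frames-1
spatial_again = rearrange(temporal, "(b d) f c -> (b f) d c", d=tokens)
print(spatial_again.shape)                      # torch.Size([32, 64, 320])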
animatediff/models/resnet.py ADDED
@@ -0,0 +1,197 @@
1
+ # Adapted from https://github.com/guoyww/AnimateDiff
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+ from einops import rearrange
8
+
9
+
10
+ class InflatedConv3d(nn.Conv2d):
11
+ def forward(self, x):
12
+ video_length = x.shape[2]
13
+
14
+ x = rearrange(x, "b c f h w -> (b f) c h w")
15
+ x = super().forward(x)
16
+ x = rearrange(x, "(b f) c h w -> b c f h w", f=video_length)
17
+
18
+ return x
19
+
20
+
21
+ class Upsample3D(nn.Module):
22
+ def __init__(self, channels, use_conv=False, use_conv_transpose=False, out_channels=None, name="conv"):
23
+ super().__init__()
24
+ self.channels = channels
25
+ self.out_channels = out_channels or channels
26
+ self.use_conv = use_conv
27
+ self.use_conv_transpose = use_conv_transpose
28
+ self.name = name
29
+
30
+ conv = None
31
+ if use_conv_transpose:
32
+ raise NotImplementedError
33
+ elif use_conv:
34
+ self.conv = InflatedConv3d(self.channels, self.out_channels, 3, padding=1)
35
+
36
+ def forward(self, hidden_states, output_size=None):
37
+ assert hidden_states.shape[1] == self.channels
38
+
39
+ if self.use_conv_transpose:
40
+ raise NotImplementedError
41
+
42
+ # Cast to float32 as the 'upsample_nearest2d_out_frame' op does not support bfloat16
43
+ dtype = hidden_states.dtype
44
+ if dtype == torch.bfloat16:
45
+ hidden_states = hidden_states.to(torch.float32)
46
+
47
+ # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
48
+ if hidden_states.shape[0] >= 64:
49
+ hidden_states = hidden_states.contiguous()
50
+
51
+ # if `output_size` is passed we force the interpolation output
52
+ # size and do not make use of `scale_factor=2`
53
+ if output_size is None:
54
+ hidden_states = F.interpolate(hidden_states, scale_factor=[1.0, 2.0, 2.0], mode="nearest")
55
+ else:
56
+ hidden_states = F.interpolate(hidden_states, size=output_size, mode="nearest")
57
+
58
+ # If the input is bfloat16, we cast back to bfloat16
59
+ if dtype == torch.bfloat16:
60
+ hidden_states = hidden_states.to(dtype)
61
+
62
+ # if self.use_conv:
63
+ # if self.name == "conv":
64
+ # hidden_states = self.conv(hidden_states)
65
+ # else:
66
+ # hidden_states = self.Conv2d_0(hidden_states)
67
+ hidden_states = self.conv(hidden_states)
68
+
69
+ return hidden_states
70
+
71
+
72
+ class Downsample3D(nn.Module):
73
+ def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"):
74
+ super().__init__()
75
+ self.channels = channels
76
+ self.out_channels = out_channels or channels
77
+ self.use_conv = use_conv
78
+ self.padding = padding
79
+ stride = 2
80
+ self.name = name
81
+
82
+ if use_conv:
83
+ self.conv = InflatedConv3d(self.channels, self.out_channels, 3, stride=stride, padding=padding)
84
+ else:
85
+ raise NotImplementedError
86
+
87
+ def forward(self, hidden_states):
88
+ assert hidden_states.shape[1] == self.channels
89
+ if self.use_conv and self.padding == 0:
90
+ raise NotImplementedError
91
+
92
+ assert hidden_states.shape[1] == self.channels
93
+ hidden_states = self.conv(hidden_states)
94
+
95
+ return hidden_states
96
+
97
+
98
+ class ResnetBlock3D(nn.Module):
99
+ def __init__(
100
+ self,
101
+ *,
102
+ in_channels,
103
+ out_channels=None,
104
+ conv_shortcut=False,
105
+ dropout=0.0,
106
+ temb_channels=512,
107
+ groups=32,
108
+ groups_out=None,
109
+ pre_norm=True,
110
+ eps=1e-6,
111
+ non_linearity="swish",
112
+ time_embedding_norm="default",
113
+ output_scale_factor=1.0,
114
+ use_in_shortcut=None,
115
+ ):
116
+ super().__init__()
117
+ self.pre_norm = pre_norm
118
+ self.pre_norm = True
119
+ self.in_channels = in_channels
120
+ out_channels = in_channels if out_channels is None else out_channels
121
+ self.out_channels = out_channels
122
+ self.use_conv_shortcut = conv_shortcut
123
+ self.time_embedding_norm = time_embedding_norm
124
+ self.output_scale_factor = output_scale_factor
125
+
126
+ if groups_out is None:
127
+ groups_out = groups
128
+
129
+ self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
130
+
131
+ self.conv1 = InflatedConv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
132
+
133
+ if temb_channels is not None:
134
+ if self.time_embedding_norm == "default":
135
+ time_emb_proj_out_channels = out_channels
136
+ elif self.time_embedding_norm == "scale_shift":
137
+ time_emb_proj_out_channels = out_channels * 2
138
+ else:
139
+ raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ")
140
+
141
+ self.time_emb_proj = torch.nn.Linear(temb_channels, time_emb_proj_out_channels)
142
+ else:
143
+ self.time_emb_proj = None
144
+
145
+ self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)
146
+ self.dropout = torch.nn.Dropout(dropout)
147
+ self.conv2 = InflatedConv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
148
+
149
+ if non_linearity == "swish":
150
+ self.nonlinearity = lambda x: F.silu(x)
151
+ elif non_linearity == "mish":
152
+ self.nonlinearity = Mish()
153
+ elif non_linearity == "silu":
154
+ self.nonlinearity = nn.SiLU()
155
+
156
+ self.use_in_shortcut = self.in_channels != self.out_channels if use_in_shortcut is None else use_in_shortcut
157
+
158
+ self.conv_shortcut = None
159
+ if self.use_in_shortcut:
160
+ self.conv_shortcut = InflatedConv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
161
+
162
+ def forward(self, input_tensor, temb):
163
+ hidden_states = input_tensor
164
+
165
+ hidden_states = self.norm1(hidden_states)
166
+ hidden_states = self.nonlinearity(hidden_states)
167
+
168
+ hidden_states = self.conv1(hidden_states)
169
+
170
+ if temb is not None:
171
+ temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None, None]
172
+
173
+ if temb is not None and self.time_embedding_norm == "default":
174
+ hidden_states = hidden_states + temb
175
+
176
+ hidden_states = self.norm2(hidden_states)
177
+
178
+ if temb is not None and self.time_embedding_norm == "scale_shift":
179
+ scale, shift = torch.chunk(temb, 2, dim=1)
180
+ hidden_states = hidden_states * (1 + scale) + shift
181
+
182
+ hidden_states = self.nonlinearity(hidden_states)
183
+
184
+ hidden_states = self.dropout(hidden_states)
185
+ hidden_states = self.conv2(hidden_states)
186
+
187
+ if self.conv_shortcut is not None:
188
+ input_tensor = self.conv_shortcut(input_tensor)
189
+
190
+ output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
191
+
192
+ return output_tensor
193
+
194
+
195
+ class Mish(torch.nn.Module):
196
+ def forward(self, hidden_states):
197
+ return hidden_states * torch.tanh(torch.nn.functional.softplus(hidden_states))
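End of resnet.py. `InflatedConv3d` is the trick that lets the 3D blocks reuse pretrained 2D Stable Diffusion weights: it is an unmodified `nn.Conv2d` whose forward folds the frame axis into the batch, convolves each frame, and unfolds again. A tiny sketch of that reshaping, with made-up sizes:

# Sketch of frame-wise 2D convolution over a 5D video tensor (illustrative sizes).
import torch
import torch.nn as nn
from einops import rearrange

conv2d = nn.Conv2d(4, 8, kernel_size=3, padding=1)

video = torch.randn(2, 4, 16, 32, 32)                  # b c f h w
frames = rearrange(video, "b c f h w -> (b f) c h w")  # fold frames into the batch
out = conv2d(frames)                                   # plain 2D conv per frame
out = rearrange(out, "(b f) c h w -> b c f h w", f=16)
print(out.shape)                                       # torch.Size([2, 8, 16, 32, 32])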
animatediff/models/unet.py ADDED
@@ -0,0 +1,572 @@
1
+ # Adapted from https://github.com/guoyww/AnimateDiff
2
+
3
+ from dataclasses import dataclass
4
+ from typing import List, Optional, Tuple, Union
5
+
6
+ import os
7
+ import json
8
+ import pdb
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.utils.checkpoint
13
+ try:
14
+ from diffusers.models.cross_attention import AttnProcessor
15
+ except:
16
+ from diffusers.models.attention_processor import AttnProcessor
17
+ from typing import Dict
18
+
19
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
20
+ from diffusers.models import ModelMixin
21
+ from diffusers.loaders import UNet2DConditionLoadersMixin
22
+ from diffusers.utils import BaseOutput, logging
23
+ from diffusers.models.embeddings import TimestepEmbedding, Timesteps
24
+ from .unet_blocks import (
25
+ CrossAttnDownBlock3D,
26
+ CrossAttnUpBlock3D,
27
+ DownBlock3D,
28
+ UNetMidBlock3DCrossAttn,
29
+ UpBlock3D,
30
+ get_down_block,
31
+ get_up_block,
32
+ )
33
+ from .resnet import InflatedConv3d
34
+ from .motion_module import VersatileAttention
35
+ def zero_module(module):
36
+ # Zero out the parameters of a module and return it.
37
+ for p in module.parameters():
38
+ p.detach().zero_()
39
+ return module
40
+
41
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
42
+
43
+
44
+ @dataclass
45
+ class UNet3DConditionOutput(BaseOutput):
46
+ sample: torch.FloatTensor
47
+
48
+
49
+ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
50
+ _supports_gradient_checkpointing = True
51
+
52
+ @register_to_config
53
+ def __init__(
54
+ self,
55
+ sample_size: Optional[int] = None,
56
+ in_channels: int = 4,
57
+ out_channels: int = 4,
58
+ center_input_sample: bool = False,
59
+ flip_sin_to_cos: bool = True,
60
+ freq_shift: int = 0,
61
+ down_block_types: Tuple[str] = (
62
+ "CrossAttnDownBlock3D",
63
+ "CrossAttnDownBlock3D",
64
+ "CrossAttnDownBlock3D",
65
+ "DownBlock3D",
66
+ ),
67
+ mid_block_type: str = "UNetMidBlock3DCrossAttn",
68
+ up_block_types: Tuple[str] = (
69
+ "UpBlock3D",
70
+ "CrossAttnUpBlock3D",
71
+ "CrossAttnUpBlock3D",
72
+ "CrossAttnUpBlock3D"
73
+ ),
74
+ only_cross_attention: Union[bool, Tuple[bool]] = False,
75
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
76
+ layers_per_block: int = 2,
77
+ downsample_padding: int = 1,
78
+ mid_block_scale_factor: float = 1,
79
+ act_fn: str = "silu",
80
+ norm_num_groups: int = 32,
81
+ norm_eps: float = 1e-5,
82
+ cross_attention_dim: int = 1280,
83
+ attention_head_dim: Union[int, Tuple[int]] = 8,
84
+ dual_cross_attention: bool = False,
85
+ use_linear_projection: bool = False,
86
+ class_embed_type: Optional[str] = None,
87
+ num_class_embeds: Optional[int] = None,
88
+ upcast_attention: bool = False,
89
+ resnet_time_scale_shift: str = "default",
90
+
91
+ # Additional
92
+ use_motion_module = True,
93
+ motion_module_resolutions = ( 1,2,4,8 ),
94
+ motion_module_mid_block = False,
95
+ motion_module_decoder_only = False,
96
+ motion_module_type = None,
97
+ motion_module_kwargs = {},
98
+ unet_use_cross_frame_attention = None,
99
+ unet_use_temporal_attention = None,
100
+
101
+ ):
102
+ super().__init__()
103
+
104
+ self.sample_size = sample_size
105
+ time_embed_dim = block_out_channels[0] * 4
106
+
107
+ # Image to Video Conv
108
+ # input
109
+ self.conv_in = InflatedConv3d(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1))
110
+
111
+ # time
112
+ self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
113
+ timestep_input_dim = block_out_channels[0]
114
+
115
+ self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
116
+
117
+ # class embedding
118
+ if class_embed_type is None and num_class_embeds is not None:
119
+ self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
120
+ elif class_embed_type == "timestep":
121
+ self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
122
+ elif class_embed_type == "identity":
123
+ self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
124
+ else:
125
+ self.class_embedding = None
126
+
127
+ self.down_blocks = nn.ModuleList([])
128
+ self.mid_block = None
129
+ self.up_blocks = nn.ModuleList([])
130
+
131
+ if isinstance(only_cross_attention, bool):
132
+ only_cross_attention = [only_cross_attention] * len(down_block_types)
133
+
134
+ if isinstance(attention_head_dim, int):
135
+ attention_head_dim = (attention_head_dim,) * len(down_block_types)
136
+
137
+ # down
138
+ output_channel = block_out_channels[0]
139
+ for i, down_block_type in enumerate(down_block_types):
140
+ res = 2 ** i
141
+ input_channel = output_channel
142
+ output_channel = block_out_channels[i]
143
+ is_final_block = i == len(block_out_channels) - 1
144
+
145
+ down_block = get_down_block(
146
+ down_block_type,
147
+ num_layers=layers_per_block,
148
+ in_channels=input_channel,
149
+ out_channels=output_channel,
150
+ temb_channels=time_embed_dim,
151
+ add_downsample=not is_final_block,
152
+ resnet_eps=norm_eps,
153
+ resnet_act_fn=act_fn,
154
+ resnet_groups=norm_num_groups,
155
+ cross_attention_dim=cross_attention_dim,
156
+ attn_num_head_channels=attention_head_dim[i],
157
+ downsample_padding=downsample_padding,
158
+ dual_cross_attention=dual_cross_attention,
159
+ use_linear_projection=use_linear_projection,
160
+ only_cross_attention=only_cross_attention[i],
161
+ upcast_attention=upcast_attention,
162
+ resnet_time_scale_shift=resnet_time_scale_shift,
163
+
164
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
165
+ unet_use_temporal_attention=unet_use_temporal_attention,
166
+
167
+ use_motion_module=use_motion_module and (res in motion_module_resolutions) and (not motion_module_decoder_only),
168
+ motion_module_type=motion_module_type,
169
+ motion_module_kwargs=motion_module_kwargs,
170
+ )
171
+ self.down_blocks.append(down_block)
172
+
173
+ # mid
174
+ if mid_block_type == "UNetMidBlock3DCrossAttn":
175
+ self.mid_block = UNetMidBlock3DCrossAttn(
176
+ in_channels=block_out_channels[-1],
177
+ temb_channels=time_embed_dim,
178
+ resnet_eps=norm_eps,
179
+ resnet_act_fn=act_fn,
180
+ output_scale_factor=mid_block_scale_factor,
181
+ resnet_time_scale_shift=resnet_time_scale_shift,
182
+ cross_attention_dim=cross_attention_dim,
183
+ attn_num_head_channels=attention_head_dim[-1],
184
+ resnet_groups=norm_num_groups,
185
+ dual_cross_attention=dual_cross_attention,
186
+ use_linear_projection=use_linear_projection,
187
+ upcast_attention=upcast_attention,
188
+
189
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
190
+ unet_use_temporal_attention=unet_use_temporal_attention,
191
+
192
+ use_motion_module=use_motion_module and motion_module_mid_block,
193
+ motion_module_type=motion_module_type,
194
+ motion_module_kwargs=motion_module_kwargs,
195
+ )
196
+ else:
197
+ raise ValueError(f"unknown mid_block_type : {mid_block_type}")
198
+
199
+ # count how many layers upsample the videos
200
+ self.num_upsamplers = 0
201
+
202
+ # up
203
+ reversed_block_out_channels = list(reversed(block_out_channels))
204
+ reversed_attention_head_dim = list(reversed(attention_head_dim))
205
+ only_cross_attention = list(reversed(only_cross_attention))
206
+ output_channel = reversed_block_out_channels[0]
207
+ for i, up_block_type in enumerate(up_block_types):
208
+ res = 2 ** (3 - i)
209
+ is_final_block = i == len(block_out_channels) - 1
210
+
211
+ prev_output_channel = output_channel
212
+ output_channel = reversed_block_out_channels[i]
213
+ input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
214
+
215
+ # add upsample block for all BUT final layer
216
+ if not is_final_block:
217
+ add_upsample = True
218
+ self.num_upsamplers += 1
219
+ else:
220
+ add_upsample = False
221
+
222
+ up_block = get_up_block(
223
+ up_block_type,
224
+ num_layers=layers_per_block + 1,
225
+ in_channels=input_channel,
226
+ out_channels=output_channel,
227
+ prev_output_channel=prev_output_channel,
228
+ temb_channels=time_embed_dim,
229
+ add_upsample=add_upsample,
230
+ resnet_eps=norm_eps,
231
+ resnet_act_fn=act_fn,
232
+ resnet_groups=norm_num_groups,
233
+ cross_attention_dim=cross_attention_dim,
234
+ attn_num_head_channels=reversed_attention_head_dim[i],
235
+ dual_cross_attention=dual_cross_attention,
236
+ use_linear_projection=use_linear_projection,
237
+ only_cross_attention=only_cross_attention[i],
238
+ upcast_attention=upcast_attention,
239
+ resnet_time_scale_shift=resnet_time_scale_shift,
240
+
241
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
242
+ unet_use_temporal_attention=unet_use_temporal_attention,
243
+
244
+ use_motion_module=use_motion_module and (res in motion_module_resolutions),
245
+ motion_module_type=motion_module_type,
246
+ motion_module_kwargs=motion_module_kwargs,
247
+ )
248
+ self.up_blocks.append(up_block)
249
+ prev_output_channel = output_channel
250
+
251
+ # out
252
+ self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps)
253
+ self.conv_act = nn.SiLU()
254
+ self.conv_out = InflatedConv3d(block_out_channels[0], out_channels, kernel_size=3, padding=1)
255
+
256
+ @property
257
+ def attn_processors(self) -> Dict[str, AttnProcessor]:
258
+ r"""
259
+ Returns:
260
+ `dict` of attention processors: A dictionary containing all attention processors used in the model,
261
+ indexed by their weight names.
262
+ """
263
+ # set recursively
264
+ processors = {}
265
+
266
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttnProcessor]):
267
+ if hasattr(module, "set_processor"):
268
+ processors[f"{name}.processor"] = module.processor
269
+
270
+ for sub_name, child in module.named_children():
271
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
272
+
273
+ return processors
274
+
275
+ for name, module in self.named_children():
276
+ fn_recursive_add_processors(name, module, processors)
277
+
278
+ return processors
279
+
280
+ def set_attn_processor(self, processor: Union[AttnProcessor, Dict[str, AttnProcessor]]):
281
+ r"""
282
+ Parameters:
283
+ processor (`dict` of `AttnProcessor` or `AttnProcessor`):
284
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
285
+ of **all** `CrossAttention` layers.
286
+ In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainable attention processors.
287
+
288
+ """
289
+ count = len(self.attn_processors.keys())
290
+
291
+ if isinstance(processor, dict) and len(processor) != count:
292
+ raise ValueError(
293
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
294
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
295
+ )
296
+
297
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
298
+ if hasattr(module, "set_processor"):
299
+ if not isinstance(processor, dict):
300
+ print(f'Set {module}')
301
+ module.set_processor(processor)
302
+ else:
303
+ print(f'Set {module}')
304
+ module.set_processor(processor.pop(f"{name}.processor"))
305
+
306
+ for sub_name, child in module.named_children():
307
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
308
+
309
+ for name, module in self.named_children():
310
+ fn_recursive_attn_processor(name, module, processor)
311
+
312
+ def set_attention_slice(self, slice_size):
313
+ r"""
314
+ Enable sliced attention computation.
315
+
316
+ When this option is enabled, the attention module will split the input tensor in slices, to compute attention
317
+ in several steps. This is useful to save some memory in exchange for a small speed decrease.
318
+
319
+ Args:
320
+ slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
321
+ When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
322
+ `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is
323
+ provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
324
+ must be a multiple of `slice_size`.
325
+ """
326
+ sliceable_head_dims = []
327
+
328
+ def fn_recursive_retrieve_slicable_dims(module: torch.nn.Module):
329
+ if hasattr(module, "set_attention_slice"):
330
+ sliceable_head_dims.append(module.sliceable_head_dim)
331
+
332
+ for child in module.children():
333
+ fn_recursive_retrieve_slicable_dims(child)
334
+
335
+ # retrieve number of attention layers
336
+ for module in self.children():
337
+ fn_recursive_retrieve_slicable_dims(module)
338
+
339
+ num_slicable_layers = len(sliceable_head_dims)
340
+
341
+ if slice_size == "auto":
342
+ # half the attention head size is usually a good trade-off between
343
+ # speed and memory
344
+ slice_size = [dim // 2 for dim in sliceable_head_dims]
345
+ elif slice_size == "max":
346
+ # make smallest slice possible
347
+ slice_size = num_slicable_layers * [1]
348
+
349
+ slice_size = num_slicable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
350
+
351
+ if len(slice_size) != len(sliceable_head_dims):
352
+ raise ValueError(
353
+ f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
354
+ f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
355
+ )
356
+
357
+ for i in range(len(slice_size)):
358
+ size = slice_size[i]
359
+ dim = sliceable_head_dims[i]
360
+ if size is not None and size > dim:
361
+ raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
362
+
363
+ # Recursively walk through all the children.
364
+ # Any children which exposes the set_attention_slice method
365
+ # gets the message
366
+ def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
367
+ if hasattr(module, "set_attention_slice"):
368
+ module.set_attention_slice(slice_size.pop())
369
+
370
+ for child in module.children():
371
+ fn_recursive_set_attention_slice(child, slice_size)
372
+
373
+ reversed_slice_size = list(reversed(slice_size))
374
+ for module in self.children():
375
+ fn_recursive_set_attention_slice(module, reversed_slice_size)
376
+
377
+ def _set_gradient_checkpointing(self, module, value=False):
378
+ if isinstance(module, (CrossAttnDownBlock3D, DownBlock3D, CrossAttnUpBlock3D, UpBlock3D)):
379
+ module.gradient_checkpointing = value
380
+
381
+ def forward(
382
+ self,
383
+ sample: torch.FloatTensor,
384
+ mask_sample: torch.FloatTensor,
385
+ masked_sample: torch.FloatTensor,
386
+ timestep: Union[torch.Tensor, float, int],
387
+ encoder_hidden_states: torch.Tensor,
388
+ class_labels: Optional[torch.Tensor] = None,
389
+ attention_mask: Optional[torch.Tensor] = None,
390
+ image_embeds: Optional[torch.Tensor] = None,
391
+ return_dict: bool = True,
392
+ ) -> Union[UNet3DConditionOutput, Tuple]:
393
+ r"""
394
+ Args:
395
+ sample (`torch.FloatTensor`): (batch, channel, frames, height, width) noisy inputs tensor
396
+ timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
397
+ encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states
398
+ return_dict (`bool`, *optional*, defaults to `True`):
399
+ Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
400
+
401
+ Returns:
402
+ [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
403
+ [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
404
+ returning a tuple, the first element is the sample tensor.
405
+ """
406
+ # image to video b c f h w
407
+ sample = torch.cat([sample, mask_sample, masked_sample], dim=1).to(sample.device)
408
+
409
+ # By default samples have to be at least a multiple of the overall upsampling factor.
410
+ # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
411
+ # However, the upsampling interpolation output size can be forced to fit any upsampling size
412
+ # on the fly if necessary.
413
+
414
+ default_overall_up_factor = 2**self.num_upsamplers
415
+
416
+ # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
417
+ forward_upsample_size = False
418
+ upsample_size = None
419
+
420
+ if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
421
+ logger.info("Forward upsample size to force interpolation output size.")
422
+ forward_upsample_size = True
423
+
424
+ # prepare attention_mask
425
+ if attention_mask is not None:
426
+ attention_mask = (1 - attention_mask.to(sample.dtype)) * - 10000.0
427
+ attention_mask = attention_mask.unsqueeze(1)
428
+
429
+ # center input if necessary
430
+ if self.config.center_input_sample:
431
+ sample = 2 * sample - 1.0
432
+
433
+ # time
434
+ timesteps = timestep
435
+ if not torch.is_tensor(timesteps):
436
+ # This would be a good case for the `match` statement (Python 3.10+)
437
+ is_mps = sample.device.type == "mps"
438
+ if isinstance(timestep, float):
439
+ dtype = torch.float32 if is_mps else torch.float64
440
+ else:
441
+ dtype = torch.int32 if is_mps else torch.int64
442
+ timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
443
+ elif len(timesteps.shape) == 0:
444
+ timesteps = timesteps[None].to(sample.device)
445
+
446
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
447
+ timesteps = timesteps.expand(sample.shape[0])
448
+
449
+ t_emb = self.time_proj(timesteps)
450
+
451
+ # timesteps does not contain any weights and will always return f32 tensors
452
+ # but time_embedding might actually be running in fp16. so we need to cast here.
453
+ # there might be better ways to encapsulate this.
454
+ t_emb = t_emb.to(dtype=self.dtype)
455
+ emb = self.time_embedding(t_emb)
456
+
457
+ if self.class_embedding is not None:
458
+ if class_labels is None:
459
+ raise ValueError("class_labels should be provided when num_class_embeds > 0")
460
+
461
+ if self.config.class_embed_type == "timestep":
462
+ class_labels = self.time_proj(class_labels)
463
+
464
+ class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
465
+ emb = emb + class_emb
466
+
467
+ # prepare for ip-adapter
468
+ if image_embeds is not None:
469
+ image_embeds = self.encoder_hid_proj(
470
+ image_embeds).to(encoder_hidden_states.dtype)
471
+ encoder_hidden_states = torch.cat(
472
+ [encoder_hidden_states, image_embeds], dim=1)
473
+
474
+ # pre-process
475
+ # b c f h w
476
+ # 2 4 16 64 64
477
+ sample = self.conv_in(sample)
478
+ # down
479
+ down_block_res_samples = (sample,)
480
+ for downsample_block in self.down_blocks:
481
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
482
+ sample, res_samples = downsample_block(
483
+ hidden_states=sample,
484
+ temb=emb,
485
+ encoder_hidden_states=encoder_hidden_states,
486
+ attention_mask=attention_mask,
487
+ )
488
+ else:
489
+ sample, res_samples = downsample_block(hidden_states=sample, temb=emb, encoder_hidden_states=encoder_hidden_states)
490
+ down_block_res_samples += res_samples
491
+
492
+ # mid
493
+ sample = self.mid_block(
494
+ sample, emb, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask
495
+ )
496
+
497
+ # up
498
+ for i, upsample_block in enumerate(self.up_blocks):
499
+ is_final_block = i == len(self.up_blocks) - 1
500
+
501
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
502
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
503
+
504
+ # if we have not reached the final block and need to forward the
505
+ # upsample size, we do it here
506
+ if not is_final_block and forward_upsample_size:
507
+ upsample_size = down_block_res_samples[-1].shape[2:]
508
+
509
+ if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
510
+ sample = upsample_block(
511
+ hidden_states=sample,
512
+ temb=emb,
513
+ res_hidden_states_tuple=res_samples,
514
+ encoder_hidden_states=encoder_hidden_states,
515
+ upsample_size=upsample_size,
516
+ attention_mask=attention_mask,
517
+ )
518
+ else:
519
+ sample = upsample_block(
520
+ hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size, encoder_hidden_states=encoder_hidden_states,
521
+ )
522
+
523
+ # post-process
524
+ sample = self.conv_norm_out(sample)
525
+ sample = self.conv_act(sample)
526
+ sample = self.conv_out(sample)
527
+
528
+ if not return_dict:
529
+ return (sample,)
530
+
531
+ return UNet3DConditionOutput(sample=sample)
532
+
533
+ @classmethod
534
+ def from_pretrained_2d(cls, pretrained_model_path, subfolder=None, unet_additional_kwargs=None):
535
+ if subfolder is not None:
536
+ pretrained_model_path = os.path.join(pretrained_model_path, subfolder)
537
+ print(f"loaded temporal unet's pretrained weights from {pretrained_model_path} ...")
538
+
539
+ config_file = os.path.join(pretrained_model_path, 'config.json')
540
+ if not os.path.isfile(config_file):
541
+ raise RuntimeError(f"{config_file} does not exist")
542
+ with open(config_file, "r") as f:
543
+ config = json.load(f)
544
+ config["_class_name"] = cls.__name__
545
+ config["down_block_types"] = [
546
+ "CrossAttnDownBlock3D",
547
+ "CrossAttnDownBlock3D",
548
+ "CrossAttnDownBlock3D",
549
+ "DownBlock3D"
550
+ ]
551
+ config["up_block_types"] = [
552
+ "UpBlock3D",
553
+ "CrossAttnUpBlock3D",
554
+ "CrossAttnUpBlock3D",
555
+ "CrossAttnUpBlock3D"
556
+ ]
557
+
558
+ from diffusers.utils import WEIGHTS_NAME
559
+ model = cls.from_config(config, **unet_additional_kwargs)
560
+ model_file = os.path.join(pretrained_model_path, WEIGHTS_NAME)
561
+ if not os.path.isfile(model_file):
562
+ raise RuntimeError(f"{model_file} does not exist")
563
+ state_dict = torch.load(model_file, map_location="cpu")
564
+
565
+ m, u = model.load_state_dict(state_dict, strict=False)
566
+ print(f"### missing keys: {len(m)}; \n### unexpected keys: {len(u)};")
567
+ # print(f"### missing keys:\n{m}\n### unexpected keys:\n{u}\n")
568
+
569
+ params = [p.numel() if "temporal" in n else 0 for n, p in model.named_parameters()]
570
+ print(f"### Temporal Module Parameters: {sum(params) / 1e6} M")
571
+
572
+ return model
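`from_pretrained_2d` above reads a standard 2D UNet `config.json`, swaps the block types for their 3D counterparts, and loads the 2D weights with `strict=False`, so only the newly added motion-module parameters keep their fresh initialization (they show up as the printed missing keys). A hedged usage sketch; the checkpoint path and the `unet_additional_kwargs` values below are placeholders, not settings taken from this repository's configs.

# Hedged usage sketch; path and kwargs are placeholders.
from animatediff.models.unet import UNet3DConditionModel

# Assumed: a local Stable Diffusion checkpoint in diffusers layout
# (e.g. cloned from huggingface.co/runwayml/stable-diffusion-v1-5).
unet = UNet3DConditionModel.from_pretrained_2d(
    "models/stable-diffusion-v1-5",           # placeholder path
    subfolder="unet",
    unet_additional_kwargs={
        "use_motion_module": True,            # illustrative overrides; the repo's configs hold the real values
        "motion_module_type": "Vanilla",
        "motion_module_kwargs": {},
    },
)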
animatediff/models/unet_blocks.py ADDED
@@ -0,0 +1,733 @@
1
+ # Adapted from https://github.com/guoyww/AnimateDiff
2
+
3
+ import torch
4
+ from torch import nn
5
+
6
+ from .attention import Transformer3DModel
7
+ from .resnet import Downsample3D, ResnetBlock3D, Upsample3D
8
+ from .motion_module import get_motion_module
9
+
10
+ import pdb
11
+
12
+ def get_down_block(
13
+ down_block_type,
14
+ num_layers,
15
+ in_channels,
16
+ out_channels,
17
+ temb_channels,
18
+ add_downsample,
19
+ resnet_eps,
20
+ resnet_act_fn,
21
+ attn_num_head_channels,
22
+ resnet_groups=None,
23
+ cross_attention_dim=None,
24
+ downsample_padding=None,
25
+ dual_cross_attention=False,
26
+ use_linear_projection=False,
27
+ only_cross_attention=False,
28
+ upcast_attention=False,
29
+ resnet_time_scale_shift="default",
30
+
31
+ unet_use_cross_frame_attention=None,
32
+ unet_use_temporal_attention=None,
33
+
34
+ use_motion_module=None,
35
+
36
+ motion_module_type=None,
37
+ motion_module_kwargs=None,
38
+ ):
39
+ down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
40
+ if down_block_type == "DownBlock3D":
41
+ return DownBlock3D(
42
+ num_layers=num_layers,
43
+ in_channels=in_channels,
44
+ out_channels=out_channels,
45
+ temb_channels=temb_channels,
46
+ add_downsample=add_downsample,
47
+ resnet_eps=resnet_eps,
48
+ resnet_act_fn=resnet_act_fn,
49
+ resnet_groups=resnet_groups,
50
+ downsample_padding=downsample_padding,
51
+ resnet_time_scale_shift=resnet_time_scale_shift,
52
+
53
+ use_motion_module=use_motion_module,
54
+ motion_module_type=motion_module_type,
55
+ motion_module_kwargs=motion_module_kwargs,
56
+ )
57
+ elif down_block_type == "CrossAttnDownBlock3D":
58
+ if cross_attention_dim is None:
59
+ raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock3D")
60
+ return CrossAttnDownBlock3D(
61
+ num_layers=num_layers,
62
+ in_channels=in_channels,
63
+ out_channels=out_channels,
64
+ temb_channels=temb_channels,
65
+ add_downsample=add_downsample,
66
+ resnet_eps=resnet_eps,
67
+ resnet_act_fn=resnet_act_fn,
68
+ resnet_groups=resnet_groups,
69
+ downsample_padding=downsample_padding,
70
+ cross_attention_dim=cross_attention_dim,
71
+ attn_num_head_channels=attn_num_head_channels,
72
+ dual_cross_attention=dual_cross_attention,
73
+ use_linear_projection=use_linear_projection,
74
+ only_cross_attention=only_cross_attention,
75
+ upcast_attention=upcast_attention,
76
+ resnet_time_scale_shift=resnet_time_scale_shift,
77
+
78
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
79
+ unet_use_temporal_attention=unet_use_temporal_attention,
80
+
81
+ use_motion_module=use_motion_module,
82
+ motion_module_type=motion_module_type,
83
+ motion_module_kwargs=motion_module_kwargs,
84
+ )
85
+ raise ValueError(f"{down_block_type} does not exist.")
86
+
87
+
88
+ def get_up_block(
89
+ up_block_type,
90
+ num_layers,
91
+ in_channels,
92
+ out_channels,
93
+ prev_output_channel,
94
+ temb_channels,
95
+ add_upsample,
96
+ resnet_eps,
97
+ resnet_act_fn,
98
+ attn_num_head_channels,
99
+ resnet_groups=None,
100
+ cross_attention_dim=None,
101
+ dual_cross_attention=False,
102
+ use_linear_projection=False,
103
+ only_cross_attention=False,
104
+ upcast_attention=False,
105
+ resnet_time_scale_shift="default",
106
+
107
+ unet_use_cross_frame_attention=None,
108
+ unet_use_temporal_attention=None,
109
+
110
+ use_motion_module=None,
111
+ motion_module_type=None,
112
+ motion_module_kwargs=None,
113
+ ):
114
+ up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
115
+ if up_block_type == "UpBlock3D":
116
+ return UpBlock3D(
117
+ num_layers=num_layers,
118
+ in_channels=in_channels,
119
+ out_channels=out_channels,
120
+ prev_output_channel=prev_output_channel,
121
+ temb_channels=temb_channels,
122
+ add_upsample=add_upsample,
123
+ resnet_eps=resnet_eps,
124
+ resnet_act_fn=resnet_act_fn,
125
+ resnet_groups=resnet_groups,
126
+ resnet_time_scale_shift=resnet_time_scale_shift,
127
+
128
+ use_motion_module=use_motion_module,
129
+ motion_module_type=motion_module_type,
130
+ motion_module_kwargs=motion_module_kwargs,
131
+ )
132
+ elif up_block_type == "CrossAttnUpBlock3D":
133
+ if cross_attention_dim is None:
134
+ raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock3D")
135
+ return CrossAttnUpBlock3D(
136
+ num_layers=num_layers,
137
+ in_channels=in_channels,
138
+ out_channels=out_channels,
139
+ prev_output_channel=prev_output_channel,
140
+ temb_channels=temb_channels,
141
+ add_upsample=add_upsample,
142
+ resnet_eps=resnet_eps,
143
+ resnet_act_fn=resnet_act_fn,
144
+ resnet_groups=resnet_groups,
145
+ cross_attention_dim=cross_attention_dim,
146
+ attn_num_head_channels=attn_num_head_channels,
147
+ dual_cross_attention=dual_cross_attention,
148
+ use_linear_projection=use_linear_projection,
149
+ only_cross_attention=only_cross_attention,
150
+ upcast_attention=upcast_attention,
151
+ resnet_time_scale_shift=resnet_time_scale_shift,
152
+
153
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
154
+ unet_use_temporal_attention=unet_use_temporal_attention,
155
+
156
+ use_motion_module=use_motion_module,
157
+ motion_module_type=motion_module_type,
158
+ motion_module_kwargs=motion_module_kwargs,
159
+ )
160
+ raise ValueError(f"{up_block_type} does not exist.")
161
+
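+ # Illustrative factory call (a minimal sketch, not executed anywhere in this module; the
+ # real caller is UNet3DConditionModel, which fills every argument from its config). The
+ # concrete channel counts, cross_attention_dim and the "Vanilla" motion_module_type below
+ # are illustrative assumptions, not values taken from this file:
+ #
+ # up_block = get_up_block(
+ #     "CrossAttnUpBlock3D",
+ #     num_layers=3, in_channels=320, out_channels=640, prev_output_channel=640,
+ #     temb_channels=1280, add_upsample=True, resnet_eps=1e-5, resnet_act_fn="silu",
+ #     attn_num_head_channels=8, cross_attention_dim=768,
+ #     use_motion_module=True, motion_module_type="Vanilla", motion_module_kwargs={},
+ # )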
162
+
163
+ class UNetMidBlock3DCrossAttn(nn.Module):
164
+ def __init__(
165
+ self,
166
+ in_channels: int,
167
+ temb_channels: int,
168
+ dropout: float = 0.0,
169
+ num_layers: int = 1,
170
+ resnet_eps: float = 1e-6,
171
+ resnet_time_scale_shift: str = "default",
172
+ resnet_act_fn: str = "swish",
173
+ resnet_groups: int = 32,
174
+ resnet_pre_norm: bool = True,
175
+ attn_num_head_channels=1,
176
+ output_scale_factor=1.0,
177
+ cross_attention_dim=1280,
178
+ dual_cross_attention=False,
179
+ use_linear_projection=False,
180
+ upcast_attention=False,
181
+
182
+ unet_use_cross_frame_attention=None,
183
+ unet_use_temporal_attention=None,
184
+
185
+ use_motion_module=None,
186
+
187
+ motion_module_type=None,
188
+ motion_module_kwargs=None,
189
+ ):
190
+ super().__init__()
191
+
192
+ self.has_cross_attention = True
193
+ self.attn_num_head_channels = attn_num_head_channels
194
+ resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
195
+
196
+ # there is always at least one resnet
197
+ resnets = [
198
+ ResnetBlock3D(
199
+ in_channels=in_channels,
200
+ out_channels=in_channels,
201
+ temb_channels=temb_channels,
202
+ eps=resnet_eps,
203
+ groups=resnet_groups,
204
+ dropout=dropout,
205
+ time_embedding_norm=resnet_time_scale_shift,
206
+ non_linearity=resnet_act_fn,
207
+ output_scale_factor=output_scale_factor,
208
+ pre_norm=resnet_pre_norm,
209
+ )
210
+ ]
211
+ attentions = []
212
+ motion_modules = []
213
+
214
+ for _ in range(num_layers):
215
+ if dual_cross_attention:
216
+ raise NotImplementedError
217
+ attentions.append(
218
+ Transformer3DModel(
219
+ attn_num_head_channels,
220
+ in_channels // attn_num_head_channels,
221
+ in_channels=in_channels,
222
+ num_layers=1,
223
+ cross_attention_dim=cross_attention_dim,
224
+ norm_num_groups=resnet_groups,
225
+ use_linear_projection=use_linear_projection,
226
+ upcast_attention=upcast_attention,
227
+
228
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
229
+ unet_use_temporal_attention=unet_use_temporal_attention,
230
+ )
231
+ )
232
+ motion_modules.append(
233
+ get_motion_module(
234
+ in_channels=in_channels,
235
+ motion_module_type=motion_module_type,
236
+ motion_module_kwargs=motion_module_kwargs,
237
+ ) if use_motion_module else None
238
+ )
239
+ resnets.append(
240
+ ResnetBlock3D(
241
+ in_channels=in_channels,
242
+ out_channels=in_channels,
243
+ temb_channels=temb_channels,
244
+ eps=resnet_eps,
245
+ groups=resnet_groups,
246
+ dropout=dropout,
247
+ time_embedding_norm=resnet_time_scale_shift,
248
+ non_linearity=resnet_act_fn,
249
+ output_scale_factor=output_scale_factor,
250
+ pre_norm=resnet_pre_norm,
251
+ )
252
+ )
253
+
254
+ self.attentions = nn.ModuleList(attentions)
255
+ self.resnets = nn.ModuleList(resnets)
256
+ self.motion_modules = nn.ModuleList(motion_modules)
257
+
258
+ def forward(self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None):
259
+ hidden_states = self.resnets[0](hidden_states, temb)
260
+ for attn, resnet, motion_module in zip(self.attentions, self.resnets[1:], self.motion_modules):
261
+ hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
262
+ hidden_states = motion_module(hidden_states, temb, encoder_hidden_states=encoder_hidden_states) if motion_module is not None else hidden_states
263
+ hidden_states = resnet(hidden_states, temb)
264
+
265
+ return hidden_states
266
+
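+ # Shape note (assumed from the 3D blocks in this file and the rearranges used elsewhere in
+ # the repo): hidden_states moves through the mid block as a 5-D tensor
+ # (batch, channels, frames, height, width); Transformer3DModel presumably applies spatial
+ # attention per frame, while the optional motion module attends across the frame axis.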
267
+
268
+ class CrossAttnDownBlock3D(nn.Module):
269
+ def __init__(
270
+ self,
271
+ in_channels: int,
272
+ out_channels: int,
273
+ temb_channels: int,
274
+ dropout: float = 0.0,
275
+ num_layers: int = 1,
276
+ resnet_eps: float = 1e-6,
277
+ resnet_time_scale_shift: str = "default",
278
+ resnet_act_fn: str = "swish",
279
+ resnet_groups: int = 32,
280
+ resnet_pre_norm: bool = True,
281
+ attn_num_head_channels=1,
282
+ cross_attention_dim=1280,
283
+ output_scale_factor=1.0,
284
+ downsample_padding=1,
285
+ add_downsample=True,
286
+ dual_cross_attention=False,
287
+ use_linear_projection=False,
288
+ only_cross_attention=False,
289
+ upcast_attention=False,
290
+
291
+ unet_use_cross_frame_attention=None,
292
+ unet_use_temporal_attention=None,
293
+
294
+ use_motion_module=None,
295
+
296
+ motion_module_type=None,
297
+ motion_module_kwargs=None,
298
+ ):
299
+ super().__init__()
300
+ resnets = []
301
+ attentions = []
302
+ motion_modules = []
303
+
304
+ self.has_cross_attention = True
305
+ self.attn_num_head_channels = attn_num_head_channels
306
+
307
+ for i in range(num_layers):
308
+ in_channels = in_channels if i == 0 else out_channels
309
+ resnets.append(
310
+ ResnetBlock3D(
311
+ in_channels=in_channels,
312
+ out_channels=out_channels,
313
+ temb_channels=temb_channels,
314
+ eps=resnet_eps,
315
+ groups=resnet_groups,
316
+ dropout=dropout,
317
+ time_embedding_norm=resnet_time_scale_shift,
318
+ non_linearity=resnet_act_fn,
319
+ output_scale_factor=output_scale_factor,
320
+ pre_norm=resnet_pre_norm,
321
+ )
322
+ )
323
+ if dual_cross_attention:
324
+ raise NotImplementedError
325
+ attentions.append(
326
+ Transformer3DModel(
327
+ attn_num_head_channels,
328
+ out_channels // attn_num_head_channels,
329
+ in_channels=out_channels,
330
+ num_layers=1,
331
+ cross_attention_dim=cross_attention_dim,
332
+ norm_num_groups=resnet_groups,
333
+ use_linear_projection=use_linear_projection,
334
+ only_cross_attention=only_cross_attention,
335
+ upcast_attention=upcast_attention,
336
+
337
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
338
+ unet_use_temporal_attention=unet_use_temporal_attention,
339
+ )
340
+ )
341
+ motion_modules.append(
342
+ get_motion_module(
343
+ in_channels=out_channels,
344
+ motion_module_type=motion_module_type,
345
+ motion_module_kwargs=motion_module_kwargs,
346
+ ) if use_motion_module else None
347
+ )
348
+
349
+ self.attentions = nn.ModuleList(attentions)
350
+ self.resnets = nn.ModuleList(resnets)
351
+ self.motion_modules = nn.ModuleList(motion_modules)
352
+
353
+ if add_downsample:
354
+ self.downsamplers = nn.ModuleList(
355
+ [
356
+ Downsample3D(
357
+ out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
358
+ )
359
+ ]
360
+ )
361
+ else:
362
+ self.downsamplers = None
363
+
364
+ self.gradient_checkpointing = False
365
+
366
+ def forward(self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None):
367
+ output_states = ()
368
+
369
+ for resnet, attn, motion_module in zip(self.resnets, self.attentions, self.motion_modules):
370
+ if self.training and self.gradient_checkpointing:
371
+
372
+ def create_custom_forward(module, return_dict=None):
373
+ def custom_forward(*inputs):
374
+ if return_dict is not None:
375
+ return module(*inputs, return_dict=return_dict)
376
+ else:
377
+ return module(*inputs)
378
+
379
+ return custom_forward
380
+
381
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
382
+ hidden_states = torch.utils.checkpoint.checkpoint(
383
+ create_custom_forward(attn, return_dict=False),
384
+ hidden_states,
385
+ encoder_hidden_states,
386
+ )[0]
387
+ if motion_module is not None:
388
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(motion_module), hidden_states.requires_grad_(), temb, encoder_hidden_states)
389
+
390
+ else:
391
+ hidden_states = resnet(hidden_states, temb)
392
+ hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
393
+
394
+ # add motion module
395
+ hidden_states = motion_module(hidden_states, temb, encoder_hidden_states=encoder_hidden_states) if motion_module is not None else hidden_states
396
+
397
+ output_states += (hidden_states,)
398
+
399
+ if self.downsamplers is not None:
400
+ for downsampler in self.downsamplers:
401
+ hidden_states = downsampler(hidden_states)
402
+
403
+ output_states += (hidden_states,)
404
+
405
+ return hidden_states, output_states
406
+
407
+
408
+ class DownBlock3D(nn.Module):
409
+ def __init__(
410
+ self,
411
+ in_channels: int,
412
+ out_channels: int,
413
+ temb_channels: int,
414
+ dropout: float = 0.0,
415
+ num_layers: int = 1,
416
+ resnet_eps: float = 1e-6,
417
+ resnet_time_scale_shift: str = "default",
418
+ resnet_act_fn: str = "swish",
419
+ resnet_groups: int = 32,
420
+ resnet_pre_norm: bool = True,
421
+ output_scale_factor=1.0,
422
+ add_downsample=True,
423
+ downsample_padding=1,
424
+
425
+ use_motion_module=None,
426
+ motion_module_type=None,
427
+ motion_module_kwargs=None,
428
+ ):
429
+ super().__init__()
430
+ resnets = []
431
+ motion_modules = []
432
+
433
+ for i in range(num_layers):
434
+ in_channels = in_channels if i == 0 else out_channels
435
+ resnets.append(
436
+ ResnetBlock3D(
437
+ in_channels=in_channels,
438
+ out_channels=out_channels,
439
+ temb_channels=temb_channels,
440
+ eps=resnet_eps,
441
+ groups=resnet_groups,
442
+ dropout=dropout,
443
+ time_embedding_norm=resnet_time_scale_shift,
444
+ non_linearity=resnet_act_fn,
445
+ output_scale_factor=output_scale_factor,
446
+ pre_norm=resnet_pre_norm,
447
+ )
448
+ )
449
+ motion_modules.append(
450
+ get_motion_module(
451
+ in_channels=out_channels,
452
+ motion_module_type=motion_module_type,
453
+ motion_module_kwargs=motion_module_kwargs,
454
+ ) if use_motion_module else None
455
+ )
456
+
457
+ self.resnets = nn.ModuleList(resnets)
458
+ self.motion_modules = nn.ModuleList(motion_modules)
459
+
460
+ if add_downsample:
461
+ self.downsamplers = nn.ModuleList(
462
+ [
463
+ Downsample3D(
464
+ out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
465
+ )
466
+ ]
467
+ )
468
+ else:
469
+ self.downsamplers = None
470
+
471
+ self.gradient_checkpointing = False
472
+
473
+ def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
474
+ output_states = ()
475
+
476
+ for resnet, motion_module in zip(self.resnets, self.motion_modules):
477
+ if self.training and self.gradient_checkpointing:
478
+ def create_custom_forward(module):
479
+ def custom_forward(*inputs):
480
+ return module(*inputs)
481
+
482
+ return custom_forward
483
+
484
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
485
+ if motion_module is not None:
486
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(motion_module), hidden_states.requires_grad_(), temb, encoder_hidden_states)
487
+ else:
488
+ hidden_states = resnet(hidden_states, temb)
489
+
490
+ # add motion module
491
+ hidden_states = motion_module(hidden_states, temb, encoder_hidden_states=encoder_hidden_states) if motion_module is not None else hidden_states
492
+
493
+ output_states += (hidden_states,)
494
+
495
+ if self.downsamplers is not None:
496
+ for downsampler in self.downsamplers:
497
+ hidden_states = downsampler(hidden_states)
498
+
499
+ output_states += (hidden_states,)
500
+
501
+ return hidden_states, output_states
502
+
503
+
504
+ class CrossAttnUpBlock3D(nn.Module):
505
+ def __init__(
506
+ self,
507
+ in_channels: int,
508
+ out_channels: int,
509
+ prev_output_channel: int,
510
+ temb_channels: int,
511
+ dropout: float = 0.0,
512
+ num_layers: int = 1,
513
+ resnet_eps: float = 1e-6,
514
+ resnet_time_scale_shift: str = "default",
515
+ resnet_act_fn: str = "swish",
516
+ resnet_groups: int = 32,
517
+ resnet_pre_norm: bool = True,
518
+ attn_num_head_channels=1,
519
+ cross_attention_dim=1280,
520
+ output_scale_factor=1.0,
521
+ add_upsample=True,
522
+ dual_cross_attention=False,
523
+ use_linear_projection=False,
524
+ only_cross_attention=False,
525
+ upcast_attention=False,
526
+
527
+ unet_use_cross_frame_attention=None,
528
+ unet_use_temporal_attention=None,
529
+
530
+ use_motion_module=None,
531
+
532
+ motion_module_type=None,
533
+ motion_module_kwargs=None,
534
+ ):
535
+ super().__init__()
536
+ resnets = []
537
+ attentions = []
538
+ motion_modules = []
539
+
540
+ self.has_cross_attention = True
541
+ self.attn_num_head_channels = attn_num_head_channels
542
+
543
+ for i in range(num_layers):
544
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
545
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
546
+
547
+ resnets.append(
548
+ ResnetBlock3D(
549
+ in_channels=resnet_in_channels + res_skip_channels,
550
+ out_channels=out_channels,
551
+ temb_channels=temb_channels,
552
+ eps=resnet_eps,
553
+ groups=resnet_groups,
554
+ dropout=dropout,
555
+ time_embedding_norm=resnet_time_scale_shift,
556
+ non_linearity=resnet_act_fn,
557
+ output_scale_factor=output_scale_factor,
558
+ pre_norm=resnet_pre_norm,
559
+ )
560
+ )
561
+ if dual_cross_attention:
562
+ raise NotImplementedError
563
+ attentions.append(
564
+ Transformer3DModel(
565
+ attn_num_head_channels,
566
+ out_channels // attn_num_head_channels,
567
+ in_channels=out_channels,
568
+ num_layers=1,
569
+ cross_attention_dim=cross_attention_dim,
570
+ norm_num_groups=resnet_groups,
571
+ use_linear_projection=use_linear_projection,
572
+ only_cross_attention=only_cross_attention,
573
+ upcast_attention=upcast_attention,
574
+
575
+ unet_use_cross_frame_attention=unet_use_cross_frame_attention,
576
+ unet_use_temporal_attention=unet_use_temporal_attention,
577
+ )
578
+ )
579
+ motion_modules.append(
580
+ get_motion_module(
581
+ in_channels=out_channels,
582
+ motion_module_type=motion_module_type,
583
+ motion_module_kwargs=motion_module_kwargs,
584
+ ) if use_motion_module else None
585
+ )
586
+
587
+ self.attentions = nn.ModuleList(attentions)
588
+ self.resnets = nn.ModuleList(resnets)
589
+ self.motion_modules = nn.ModuleList(motion_modules)
590
+
591
+ if add_upsample:
592
+ self.upsamplers = nn.ModuleList([Upsample3D(out_channels, use_conv=True, out_channels=out_channels)])
593
+ else:
594
+ self.upsamplers = None
595
+
596
+ self.gradient_checkpointing = False
597
+
598
+ def forward(
599
+ self,
600
+ hidden_states,
601
+ res_hidden_states_tuple,
602
+ temb=None,
603
+ encoder_hidden_states=None,
604
+ upsample_size=None,
605
+ attention_mask=None,
606
+ ):
607
+ for resnet, attn, motion_module in zip(self.resnets, self.attentions, self.motion_modules):
608
+ # pop res hidden states
609
+ res_hidden_states = res_hidden_states_tuple[-1]
610
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
611
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
612
+
613
+ if self.training and self.gradient_checkpointing:
614
+
615
+ def create_custom_forward(module, return_dict=None):
616
+ def custom_forward(*inputs):
617
+ if return_dict is not None:
618
+ return module(*inputs, return_dict=return_dict)
619
+ else:
620
+ return module(*inputs)
621
+
622
+ return custom_forward
623
+
624
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
625
+ hidden_states = torch.utils.checkpoint.checkpoint(
626
+ create_custom_forward(attn, return_dict=False),
627
+ hidden_states,
628
+ encoder_hidden_states,
629
+ )[0]
630
+ if motion_module is not None:
631
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(motion_module), hidden_states.requires_grad_(), temb, encoder_hidden_states)
632
+
633
+ else:
634
+ hidden_states = resnet(hidden_states, temb)
635
+ hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
636
+
637
+ # add motion module
638
+ hidden_states = motion_module(hidden_states, temb, encoder_hidden_states=encoder_hidden_states) if motion_module is not None else hidden_states
639
+
640
+ if self.upsamplers is not None:
641
+ for upsampler in self.upsamplers:
642
+ hidden_states = upsampler(hidden_states, upsample_size)
643
+
644
+ return hidden_states
645
+
646
+
647
+ class UpBlock3D(nn.Module):
648
+ def __init__(
649
+ self,
650
+ in_channels: int,
651
+ prev_output_channel: int,
652
+ out_channels: int,
653
+ temb_channels: int,
654
+ dropout: float = 0.0,
655
+ num_layers: int = 1,
656
+ resnet_eps: float = 1e-6,
657
+ resnet_time_scale_shift: str = "default",
658
+ resnet_act_fn: str = "swish",
659
+ resnet_groups: int = 32,
660
+ resnet_pre_norm: bool = True,
661
+ output_scale_factor=1.0,
662
+ add_upsample=True,
663
+
664
+ use_motion_module=None,
665
+ motion_module_type=None,
666
+ motion_module_kwargs=None,
667
+ ):
668
+ super().__init__()
669
+ resnets = []
670
+ motion_modules = []
671
+
672
+ for i in range(num_layers):
673
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
674
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
675
+
676
+ resnets.append(
677
+ ResnetBlock3D(
678
+ in_channels=resnet_in_channels + res_skip_channels,
679
+ out_channels=out_channels,
680
+ temb_channels=temb_channels,
681
+ eps=resnet_eps,
682
+ groups=resnet_groups,
683
+ dropout=dropout,
684
+ time_embedding_norm=resnet_time_scale_shift,
685
+ non_linearity=resnet_act_fn,
686
+ output_scale_factor=output_scale_factor,
687
+ pre_norm=resnet_pre_norm,
688
+ )
689
+ )
690
+ motion_modules.append(
691
+ get_motion_module(
692
+ in_channels=out_channels,
693
+ motion_module_type=motion_module_type,
694
+ motion_module_kwargs=motion_module_kwargs,
695
+ ) if use_motion_module else None
696
+ )
697
+
698
+ self.resnets = nn.ModuleList(resnets)
699
+ self.motion_modules = nn.ModuleList(motion_modules)
700
+
701
+ if add_upsample:
702
+ self.upsamplers = nn.ModuleList([Upsample3D(out_channels, use_conv=True, out_channels=out_channels)])
703
+ else:
704
+ self.upsamplers = None
705
+
706
+ self.gradient_checkpointing = False
707
+
708
+ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None, encoder_hidden_states=None,):
709
+ for resnet, motion_module in zip(self.resnets, self.motion_modules):
710
+ # pop res hidden states
711
+ res_hidden_states = res_hidden_states_tuple[-1]
712
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
713
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
714
+
715
+ if self.training and self.gradient_checkpointing:
716
+ def create_custom_forward(module):
717
+ def custom_forward(*inputs):
718
+ return module(*inputs)
719
+
720
+ return custom_forward
721
+
722
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
723
+ if motion_module is not None:
724
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(motion_module), hidden_states.requires_grad_(), temb, encoder_hidden_states)
725
+ else:
726
+ hidden_states = resnet(hidden_states, temb)
727
+ hidden_states = motion_module(hidden_states, temb, encoder_hidden_states=encoder_hidden_states) if motion_module is not None else hidden_states
728
+
729
+ if self.upsamplers is not None:
730
+ for upsampler in self.upsamplers:
731
+ hidden_states = upsampler(hidden_states, upsample_size)
732
+
733
+ return hidden_states
animatediff/pipelines/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .i2v_pipeline import I2VPipeline
2
+ from .pipeline_animation import AnimationPipeline
3
+ from .validation_pipeline import ValidationPipeline
4
+
5
+ __all__ = ['I2VPipeline', 'AnimationPipeline', 'ValidationPipeline']
animatediff/pipelines/i2v_pipeline.py ADDED
@@ -0,0 +1,775 @@
1
+ # Adapted from https://github.com/showlab/Tune-A-Video/blob/main/tuneavideo/pipelines/pipeline_tuneavideo.py
2
+ import inspect
3
+ import os.path as osp
4
+ from dataclasses import dataclass
5
+ from typing import Callable, List, Optional, Union
6
+
7
+ import numpy as np
8
+ import torch
9
+ from diffusers.configuration_utils import FrozenDict
10
+ from diffusers.loaders import IPAdapterMixin
11
+ from diffusers.models import AutoencoderKL
12
+ from diffusers.pipelines import DiffusionPipeline
13
+ from diffusers.schedulers import (DDIMScheduler, DPMSolverMultistepScheduler,
14
+ EulerAncestralDiscreteScheduler,
15
+ EulerDiscreteScheduler, LMSDiscreteScheduler,
16
+ PNDMScheduler)
17
+ from diffusers.utils import (BaseOutput, deprecate, is_accelerate_available,
18
+ logging)
19
+ from diffusers.utils.import_utils import is_xformers_available
20
+ from einops import rearrange
21
+ from omegaconf import OmegaConf
22
+ from packaging import version
23
+ from safetensors import safe_open
24
+ from tqdm import tqdm
25
+ from transformers import (CLIPImageProcessor, CLIPTextModel, CLIPTokenizer,
26
+ CLIPVisionModelWithProjection)
27
+
28
+ from animatediff.models.resnet import InflatedConv3d
29
+ from animatediff.models.unet import UNet3DConditionModel
30
+ from animatediff.utils.convert_from_ckpt import (convert_ldm_clip_checkpoint,
31
+ convert_ldm_unet_checkpoint,
32
+ convert_ldm_vae_checkpoint)
33
+ from animatediff.utils.convert_lora_safetensor_to_diffusers import \
34
+ convert_lora_model_level
35
+ from animatediff.utils.util import prepare_mask_coef_by_statistics
36
+
37
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
38
+
39
+
40
+ DEFAULT_N_PROMPT = ('wrong white balance, dark, sketches,worst quality,'
41
+ 'low quality, deformed, distorted, disfigured, bad eyes, '
42
+ 'wrong lips,weird mouth, bad teeth, mutated hands and fingers, '
43
+ 'bad anatomy,wrong anatomy, amputation, extra limb, '
44
+ 'missing limb, floating,limbs, disconnected limbs, mutation, '
45
+ 'ugly, disgusting, bad_pictures, negative_hand-neg')
46
+
47
+
48
+ @dataclass
49
+ class AnimationPipelineOutput(BaseOutput):
50
+ videos: Union[torch.Tensor, np.ndarray]
51
+
52
+
53
+ class I2VPipeline(DiffusionPipeline, IPAdapterMixin):
54
+ _optional_components = []
55
+
56
+ def __init__(
57
+ self,
58
+ vae: AutoencoderKL,
59
+ text_encoder: CLIPTextModel,
60
+ tokenizer: CLIPTokenizer,
61
+ unet: UNet3DConditionModel,
62
+ scheduler: Union[
63
+ DDIMScheduler,
64
+ PNDMScheduler,
65
+ LMSDiscreteScheduler,
66
+ EulerDiscreteScheduler,
67
+ EulerAncestralDiscreteScheduler,
68
+ DPMSolverMultistepScheduler,
69
+ ],
70
+ feature_extractor: CLIPImageProcessor = None,
71
+ image_encoder: CLIPVisionModelWithProjection = None,
72
+ ):
73
+ super().__init__()
74
+
75
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
76
+ deprecation_message = (
77
+ f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
78
+ f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
79
+ "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
80
+ " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
81
+ " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
82
+ " file"
83
+ )
84
+ deprecate("steps_offset!=1", "1.0.0",
85
+ deprecation_message, standard_warn=False)
86
+ new_config = dict(scheduler.config)
87
+ new_config["steps_offset"] = 1
88
+ scheduler._internal_dict = FrozenDict(new_config)
89
+
90
+ if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
91
+ deprecation_message = (
92
+ f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
93
+ " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
94
+ " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
95
+ " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
96
+ " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
97
+ )
98
+ deprecate("clip_sample not set", "1.0.0",
99
+ deprecation_message, standard_warn=False)
100
+ new_config = dict(scheduler.config)
101
+ new_config["clip_sample"] = False
102
+ scheduler._internal_dict = FrozenDict(new_config)
103
+
104
+ is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
105
+ version.parse(unet.config._diffusers_version).base_version
106
+ ) < version.parse("0.9.0.dev0")
107
+ is_unet_sample_size_less_64 = hasattr(
108
+ unet.config, "sample_size") and unet.config.sample_size < 64
109
+ if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
110
+ deprecation_message = (
111
+ "The configuration file of the unet has set the default `sample_size` to smaller than"
112
+ " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
113
+ " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
114
+ " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
115
+ " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
116
+ " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
117
+ " in the config might lead to incorrect results in future versions. If you have downloaded this"
118
+ " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
119
+ " the `unet/config.json` file"
120
+ )
121
+ deprecate("sample_size<64", "1.0.0",
122
+ deprecation_message, standard_warn=False)
123
+ new_config = dict(unet.config)
124
+ new_config["sample_size"] = 64
125
+ unet._internal_dict = FrozenDict(new_config)
126
+
127
+ self.register_modules(
128
+ vae=vae,
129
+ text_encoder=text_encoder,
130
+ tokenizer=tokenizer,
131
+ unet=unet,
132
+ image_encoder=image_encoder,
133
+ feature_extractor=feature_extractor,
134
+ scheduler=scheduler,
135
+ )
136
+ self.vae_scale_factor = 2 ** (
137
+ len(self.vae.config.block_out_channels) - 1)
138
+ self.use_ip_adapter = False
139
+ self.st_motion = None
140
+
141
+ def set_st_motion(self, st_motion: List):
142
+ """Set style transfer motion."""
143
+ self.st_motion = st_motion
144
+
145
+ @classmethod
146
+ def build_pipeline(cls,
147
+ base_cfg,
148
+ base_model: str,
149
+ unet_path: str,
150
+ dreambooth_path: Optional[str] = None,
151
+ lora_path: Optional[str] = None,
152
+ lora_alpha: int = 0,
153
+ vae_path: Optional[str] = None,
154
+ ip_adapter_path: Optional[str] = None,
155
+ ip_adapter_scale: float = 0.0,
156
+ only_load_vae_decoder: bool = False,
157
+ only_load_vae_encoder: bool = False) -> 'I2VPipeline':
158
+ """Method to build pipeline in a faster way~
159
+ Args:
160
+ base_cfg: The config to build model
161
+ base_model: The model id used to initialize the Stable Diffusion weights
162
+ unet_path: Path for i2v unet
163
+
164
+ dreambooth_path: path for dreambooth model
165
+ lora_path: path for lora model
166
+ lora_alpha: value for lora scale
167
+
168
+ only_load_vae_decoder: Only load VAE decoder from dreambooth / VAE ckpt
169
+ and maintain the original encoder.
170
+
171
+ """
172
+ # build unet
173
+ unet = UNet3DConditionModel.from_pretrained_2d(
174
+ base_model, subfolder="unet",
175
+ unet_additional_kwargs=OmegaConf.to_container(
176
+ base_cfg.unet_additional_kwargs))
177
+
178
+ old_weights = unet.conv_in.weight
179
+ old_bias = unet.conv_in.bias
180
+ new_conv1 = InflatedConv3d(
181
+ 9, old_weights.shape[0],
182
+ kernel_size=unet.conv_in.kernel_size,
183
+ stride=unet.conv_in.stride,
184
+ padding=unet.conv_in.padding,
185
+ bias=True if old_bias is not None else False)
186
+ param = torch.zeros((320, 5, 3, 3), requires_grad=True)
187
+ new_conv1.weight = torch.nn.Parameter(
188
+ torch.cat((old_weights, param), dim=1))
189
+ if old_bias is not None:
190
+ new_conv1.bias = old_bias
191
+ unet.conv_in = new_conv1
192
+ unet.config["in_channels"] = 9
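+ # The inflated conv_in now expects 9 input channels; judging from __call__ below these are
+ # the 4 noisy latent channels, 1 mask channel and 4 masked-image latent channels
+ # (4 + 1 + 4 = 9), so `param` above contributes the 5 extra, zero-initialized input
+ # channels on top of the pretrained 4-channel weights.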
193
+
194
+ unet_ckpt = torch.load(unet_path, map_location='cpu')
195
+ # filter unet ckpt, only load motion module and conv_inv
196
+ unet_ckpt = {k: v for k, v in unet_ckpt.items()
197
+ if 'motion_module' in k or 'conv_in' in k}
198
+ print('Prefix in loaded UNet checkpoint:')
199
+ print(set([k.split('.')[0] for k in unet_ckpt.keys()]))
200
+ unet.load_state_dict(unet_ckpt, strict=False)
201
+
202
+ # load vae, tokenizer, text encoder
203
+ vae = AutoencoderKL.from_pretrained(base_model, subfolder="vae")
204
+ tokenizer = CLIPTokenizer.from_pretrained(
205
+ base_model, subfolder="tokenizer")
206
+ text_encoder = CLIPTextModel.from_pretrained(
207
+ base_model, subfolder="text_encoder")
208
+ noise_scheduler = DDIMScheduler(
209
+ **OmegaConf.to_container(base_cfg.noise_scheduler_kwargs))
210
+
211
+ if dreambooth_path and dreambooth_path.upper() != 'NONE':
212
+
213
+ print(" >>> Begin loading DreamBooth >>>")
214
+ base_model_state_dict = {}
215
+ with safe_open(dreambooth_path, framework="pt", device="cpu") as f:
216
+ for key in f.keys():
217
+ base_model_state_dict[key] = f.get_tensor(key)
218
+
219
+ # load unet
220
+ converted_unet_checkpoint = convert_ldm_unet_checkpoint(
221
+ base_model_state_dict, unet.config)
222
+
223
+ old_value = converted_unet_checkpoint['conv_in.weight']
224
+ new_param = unet_ckpt['conv_in.weight'][:, 4:, :, :].clone().cpu()
225
+ new_value = torch.nn.Parameter(
226
+ torch.cat((old_value, new_param), dim=1))
227
+ converted_unet_checkpoint['conv_in.weight'] = new_value
228
+ unet.load_state_dict(converted_unet_checkpoint, strict=False)
229
+
230
+ # load vae
231
+ converted_vae_checkpoint = convert_ldm_vae_checkpoint(
232
+ base_model_state_dict, vae.config,
233
+ only_decoder=only_load_vae_decoder,
234
+ only_encoder=only_load_vae_encoder,)
235
+ need_strict = not (only_load_vae_decoder or only_load_vae_encoder)
236
+ vae.load_state_dict(converted_vae_checkpoint, strict=need_strict)
237
+ print('Prefix in loaded VAE checkpoint: ')
238
+ print(set([k.split('.')[0]
239
+ for k in converted_vae_checkpoint.keys()]))
240
+
241
+ # load text encoder
242
+ text_encoder_checkpoint = convert_ldm_clip_checkpoint(
243
+ base_model_state_dict)
244
+ if text_encoder_checkpoint:
245
+ text_encoder.load_state_dict(text_encoder_checkpoint)
246
+
247
+ print(" <<< Loaded DreamBooth <<<")
248
+
249
+ if vae_path:
250
+ print(' >>> Begin loading VAE >>>')
251
+ vae_state_dict = {}
252
+ if vae_path.endswith('safetensors'):
253
+ with safe_open(vae_path, framework="pt", device="cpu") as f:
254
+ for key in f.keys():
255
+ vae_state_dict[key] = f.get_tensor(key)
256
+ elif vae_path.endswith('ckpt') or vae_path.endswith('pt'):
257
+ vae_state_dict = torch.load(vae_path, map_location='cpu')
258
+ if 'state_dict' in vae_state_dict:
259
+ vae_state_dict = vae_state_dict['state_dict']
260
+
261
+ vae_state_dict = {
262
+ f'first_stage_model.{k}': v for k, v in vae_state_dict.items()}
263
+
264
+ converted_vae_checkpoint = convert_ldm_vae_checkpoint(
265
+ vae_state_dict, vae.config,
266
+ only_decoder=only_load_vae_decoder,
267
+ only_encoder=only_load_vae_encoder,)
268
+ print('Prefix in loaded VAE checkpoint: ')
269
+ print(set([k.split('.')[0]
270
+ for k in converted_vae_checkpoint.keys()]))
271
+ need_strict = not (only_load_vae_decoder or only_load_vae_encoder)
272
+ vae.load_state_dict(converted_vae_checkpoint, strict=need_strict)
273
+ print(" <<< Loaded VAE <<<")
274
+
275
+ if lora_path:
276
+
277
+ print(" >>> Begin loading LoRA >>>")
278
+
279
+ lora_dict = {}
280
+ with safe_open(lora_path, framework='pt', device='cpu') as file:
281
+ for k in file.keys():
282
+ lora_dict[k] = file.get_tensor(k)
283
+ unet, text_encoder = convert_lora_model_level(
284
+ lora_dict, unet, text_encoder, alpha=lora_alpha)
285
+
286
+ print(" <<< Loaded LoRA <<<")
287
+
288
+ # move model to device
289
+ device = torch.device('cuda')
290
+ unet_dtype = torch.float16
291
+ tenc_dtype = torch.float16
292
+ vae_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32
293
+
294
+ unet = unet.to(device=device, dtype=unet_dtype)
295
+ text_encoder = text_encoder.to(device=device, dtype=tenc_dtype)
296
+ vae = vae.to(device=device, dtype=vae_dtype)
297
+ print(f'Set Unet to {unet_dtype}')
298
+ print(f'Set text encoder to {tenc_dtype}')
299
+ print(f'Set vae to {vae_dtype}')
300
+
301
+ if is_xformers_available():
302
+ unet.enable_xformers_memory_efficient_attention()
303
+
304
+ pipeline = cls(unet=unet,
305
+ vae=vae,
306
+ tokenizer=tokenizer,
307
+ text_encoder=text_encoder,
308
+ scheduler=noise_scheduler)
309
+
310
+ # ip_adapter_path = 'h94/IP-Adapter'
311
+ if ip_adapter_path and ip_adapter_scale > 0:
312
+ ip_adapter_name = 'ip-adapter_sd15.bin'
313
+ # only an online repo id needs the `models` subfolder; a local directory does not
314
+ if not osp.isdir(ip_adapter_path):
315
+ subfolder = 'models'
316
+ else:
317
+ subfolder = ''
318
+ pipeline.load_ip_adapter(
319
+ ip_adapter_path, subfolder, ip_adapter_name)
320
+ pipeline.set_ip_adapter_scale(ip_adapter_scale)
321
+ pipeline.use_ip_adapter = True
322
+ print(f'Load IP-Adapter, scale: {ip_adapter_scale}')
323
+
324
+ # text_inversion_path = './models/TextualInversion/easynegative.safetensors'
325
+ # if text_inversion_path:
326
+ # pipeline.load_textual_inversion(text_inversion_path, 'easynegative')
327
+
328
+ return pipeline
329
+
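+ # Minimal usage sketch (the paths and the base-model id below are placeholders/assumptions,
+ # not values shipped with this repo):
+ #
+ # base_cfg = OmegaConf.load('path/to/inference_config.yaml')
+ # pipe = I2VPipeline.build_pipeline(
+ #     base_cfg,
+ #     base_model='runwayml/stable-diffusion-v1-5',
+ #     unet_path='path/to/i2v_unet.ckpt',
+ # )
+ # out = pipe(image=np.asarray(Image.open('cond.png').convert('RGB')),
+ #            prompt='a cat', video_length=16, height=512, width=512)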
330
+ def enable_vae_slicing(self):
331
+ self.vae.enable_slicing()
332
+
333
+ def disable_vae_slicing(self):
334
+ self.vae.disable_slicing()
335
+
336
+ def enable_sequential_cpu_offload(self, gpu_id=0):
337
+ if is_accelerate_available():
338
+ from accelerate import cpu_offload
339
+ else:
340
+ raise ImportError(
341
+ "Please install accelerate via `pip install accelerate`")
342
+
343
+ device = torch.device(f"cuda:{gpu_id}")
344
+
345
+ for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
346
+ if cpu_offloaded_model is not None:
347
+ cpu_offload(cpu_offloaded_model, device)
348
+
349
+ @property
350
+ def _execution_device(self):
351
+ if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
352
+ return self.device
353
+ for module in self.unet.modules():
354
+ if (
355
+ hasattr(module, "_hf_hook")
356
+ and hasattr(module._hf_hook, "execution_device")
357
+ and module._hf_hook.execution_device is not None
358
+ ):
359
+ return torch.device(module._hf_hook.execution_device)
360
+ return self.device
361
+
362
+ def _encode_prompt(self, prompt, device, num_videos_per_prompt, do_classifier_free_guidance, negative_prompt):
363
+ batch_size = len(prompt) if isinstance(prompt, list) else 1
364
+
365
+ text_inputs = self.tokenizer(
366
+ prompt,
367
+ padding="max_length",
368
+ max_length=self.tokenizer.model_max_length,
369
+ truncation=True,
370
+ return_tensors="pt",
371
+ )
372
+ text_input_ids = text_inputs.input_ids
373
+ untruncated_ids = self.tokenizer(
374
+ prompt, padding="longest", return_tensors="pt").input_ids
375
+
376
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
377
+ removed_text = self.tokenizer.batch_decode(
378
+ untruncated_ids[:, self.tokenizer.model_max_length - 1: -1])
379
+ logger.warning(
380
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
381
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
382
+ )
383
+
384
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
385
+ attention_mask = text_inputs.attention_mask.to(device)
386
+ else:
387
+ attention_mask = None
388
+
389
+ text_embeddings = self.text_encoder(
390
+ text_input_ids.to(device),
391
+ attention_mask=attention_mask,
392
+ )
393
+ text_embeddings = text_embeddings[0]
394
+
395
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
396
+ bs_embed, seq_len, _ = text_embeddings.shape
397
+ text_embeddings = text_embeddings.repeat(1, num_videos_per_prompt, 1)
398
+ text_embeddings = text_embeddings.view(
399
+ bs_embed * num_videos_per_prompt, seq_len, -1)
400
+
401
+ # get unconditional embeddings for classifier free guidance
402
+ if do_classifier_free_guidance:
403
+ uncond_tokens: List[str]
404
+ if negative_prompt is None:
405
+ uncond_tokens = [""] * batch_size
406
+ elif type(prompt) is not type(negative_prompt):
407
+ raise TypeError(
408
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
409
+ f" {type(prompt)}."
410
+ )
411
+ elif isinstance(negative_prompt, str):
412
+ uncond_tokens = [negative_prompt]
413
+ elif batch_size != len(negative_prompt):
414
+ raise ValueError(
415
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
416
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
417
+ " the batch size of `prompt`."
418
+ )
419
+ else:
420
+ uncond_tokens = negative_prompt
421
+
422
+ max_length = text_input_ids.shape[-1]
423
+ uncond_input = self.tokenizer(
424
+ uncond_tokens,
425
+ padding="max_length",
426
+ max_length=max_length,
427
+ truncation=True,
428
+ return_tensors="pt",
429
+ )
430
+
431
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
432
+ attention_mask = uncond_input.attention_mask.to(device)
433
+ else:
434
+ attention_mask = None
435
+
436
+ uncond_embeddings = self.text_encoder(
437
+ uncond_input.input_ids.to(device),
438
+ attention_mask=attention_mask,
439
+ )
440
+ uncond_embeddings = uncond_embeddings[0]
441
+
442
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
443
+ seq_len = uncond_embeddings.shape[1]
444
+ uncond_embeddings = uncond_embeddings.repeat(
445
+ 1, num_videos_per_prompt, 1)
446
+ uncond_embeddings = uncond_embeddings.view(
447
+ batch_size * num_videos_per_prompt, seq_len, -1)
448
+
449
+ # For classifier free guidance, we need to do two forward passes.
450
+ # Here we concatenate the unconditional and text embeddings into a single batch
451
+ # to avoid doing two forward passes
452
+ text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
453
+
454
+ return text_embeddings
455
+
456
+ def decode_latents(self, latents):
457
+ video_length = latents.shape[2]
458
+ latents = 1 / 0.18215 * latents
459
+ latents = rearrange(latents, "b c f h w -> (b f) c h w")
460
+ # video = self.vae.decode(latents).sample
461
+ video = []
462
+ for frame_idx in tqdm(range(latents.shape[0])):
463
+ video.append(self.vae.decode(
464
+ latents[frame_idx:frame_idx+1]).sample)
465
+ video = torch.cat(video)
466
+ video = rearrange(video, "(b f) c h w -> b c f h w", f=video_length)
467
+ video = (video / 2 + 0.5).clamp(0, 1)
468
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
469
+ video = video.cpu().float().numpy()
470
+ return video
471
+
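+ # decode_latents returns a float32 numpy array of shape
+ # (batch, channels, frames, height, width) with values in [0, 1]; __call__ optionally
+ # converts it back to a torch tensor when output_type == "tensor".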
472
+ def prepare_extra_step_kwargs(self, generator, eta):
473
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
474
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
475
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
476
+ # and should be between [0, 1]
477
+
478
+ accepts_eta = "eta" in set(inspect.signature(
479
+ self.scheduler.step).parameters.keys())
480
+ extra_step_kwargs = {}
481
+ if accepts_eta:
482
+ extra_step_kwargs["eta"] = eta
483
+
484
+ # check if the scheduler accepts generator
485
+ accepts_generator = "generator" in set(
486
+ inspect.signature(self.scheduler.step).parameters.keys())
487
+ if accepts_generator:
488
+ extra_step_kwargs["generator"] = generator
489
+ return extra_step_kwargs
490
+
491
+ def check_inputs(self, prompt, height, width, callback_steps):
492
+ if not isinstance(prompt, str) and not isinstance(prompt, list):
493
+ raise ValueError(
494
+ f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
495
+
496
+ if height % 8 != 0 or width % 8 != 0:
497
+ raise ValueError(
498
+ f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
499
+
500
+ if (callback_steps is None) or (
501
+ callback_steps is not None and (not isinstance(
502
+ callback_steps, int) or callback_steps <= 0)
503
+ ):
504
+ raise ValueError(
505
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
506
+ f" {type(callback_steps)}."
507
+ )
508
+
509
+ def get_timesteps(self, num_inference_steps, strength, device):
510
+ # get the original timestep using init_timestep
511
+ init_timestep = min(
512
+ int(num_inference_steps * strength), num_inference_steps)
513
+
514
+ t_start = max(num_inference_steps - init_timestep, 0)
515
+ timesteps = self.scheduler.timesteps[t_start:]
516
+
517
+ return timesteps, num_inference_steps - t_start
518
+
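+ # Worked example: with num_inference_steps=50 and strength=0.8, init_timestep=40 and
+ # t_start=10, so only the last 40 scheduler timesteps are used (a smaller strength keeps
+ # the result closer to the conditioning image); strength=1 runs the full schedule.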
519
+ def prepare_latents(self, add_noise_time_step, batch_size, num_channels_latents, video_length, height, width, dtype, device, generator, latents=None):
520
+ shape = (batch_size, num_channels_latents, video_length, height //
521
+ self.vae_scale_factor, width // self.vae_scale_factor)
522
+
523
+ if isinstance(generator, list) and len(generator) != batch_size:
524
+ raise ValueError(
525
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
526
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
527
+ )
528
+ if latents is None:
529
+ rand_device = "cpu" if device.type == "mps" else device
530
+
531
+ if isinstance(generator, list):
532
+ shape = shape
533
+ # shape = (1,) + shape[1:]
534
+ latents = [
535
+ torch.randn(
536
+ shape, generator=generator[i], device=rand_device, dtype=dtype)
537
+ for i in range(batch_size)
538
+ ]
539
+ latents = torch.cat(latents, dim=0).to(device)
540
+ else:
541
+ latents = torch.randn(
542
+ shape, generator=generator, device=rand_device, dtype=dtype).to(device)
543
+ else:
544
+ if latents.shape != shape:
545
+ raise ValueError(
546
+ f"Unexpected latents shape, got {latents.shape}, expected {shape}")
547
+ latents = latents.to(device)
548
+
549
+ return latents
550
+
551
+ def encode_image(self, image, device, num_images_per_prompt):
552
+ """Encode image for ip-adapter. Copied from
553
+ https://github.com/huggingface/diffusers/blob/f9487783228cd500a21555da3346db40e8f05992/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py#L492-L514 # noqa
554
+ """
555
+ dtype = next(self.image_encoder.parameters()).dtype
556
+
557
+ if not isinstance(image, torch.Tensor):
558
+ image = self.feature_extractor(
559
+ image, return_tensors="pt").pixel_values
560
+
561
+ image = image.to(device=device, dtype=dtype)
562
+ image_embeds = self.image_encoder(image).image_embeds
563
+ image_embeds = image_embeds.repeat_interleave(
564
+ num_images_per_prompt, dim=0)
565
+
566
+ uncond_image_embeds = torch.zeros_like(image_embeds)
567
+ return image_embeds, uncond_image_embeds
568
+
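+ # A zero tensor serves as the unconditional image embedding; __call__ concatenates it with
+ # the conditional embedding so that classifier-free guidance also applies to the
+ # IP-Adapter image conditioning.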
569
+ @torch.no_grad()
570
+ def __call__(
571
+ self,
572
+ image: np.ndarray,
573
+ prompt: Union[str, List[str]],
574
+ video_length: Optional[int],
575
+ height: Optional[int] = None,
576
+ width: Optional[int] = None,
577
+ global_inf_num: int = 0,
578
+ num_inference_steps: int = 50,
579
+ guidance_scale: float = 7.5,
580
+ negative_prompt: Optional[Union[str, List[str]]] = None,
581
+ num_videos_per_prompt: Optional[int] = 1,
582
+ eta: float = 0.0,
583
+ generator: Optional[Union[torch.Generator,
584
+ List[torch.Generator]]] = None,
585
+ latents: Optional[torch.FloatTensor] = None,
586
+ output_type: Optional[str] = "tensor",
587
+ return_dict: bool = True,
588
+ callback: Optional[Callable[[
589
+ int, int, torch.FloatTensor], None]] = None,
590
+ callback_steps: Optional[int] = 1,
591
+
592
+ cond_frame: int = 0,
593
+ mask_sim_template_idx: int = 0,
594
+ ip_adapter_scale: float = 0,
595
+ strength: float = 1,
596
+ is_real_img: bool = False,
597
+ progress_fn=None,
598
+ **kwargs,
599
+ ):
600
+ # Default height and width to unet
601
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
602
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
603
+
604
+ assert strength > 0 and strength <= 1, (
605
+ f'"strength" for img2vid must be in (0, 1], but received {strength}.')
606
+
607
+ # Check inputs. Raise error if not correct
608
+ self.check_inputs(prompt, height, width, callback_steps)
609
+
610
+ # Define call parameters
611
+ # batch_size = 1 if isinstance(prompt, str) else len(prompt)
612
+ batch_size = 1
613
+ if latents is not None:
614
+ batch_size = latents.shape[0]
615
+ if isinstance(prompt, list):
616
+ batch_size = len(prompt)
617
+
618
+ device = self._execution_device
619
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
620
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
621
+ # corresponds to doing no classifier free guidance.
622
+ do_classifier_free_guidance = guidance_scale > 1.0
623
+
624
+ # Encode input prompt
625
+ prompt = prompt if isinstance(prompt, list) else [prompt] * batch_size
626
+
627
+ if negative_prompt is None:
628
+ negative_prompt = DEFAULT_N_PROMPT
629
+ negative_prompt = negative_prompt if isinstance(negative_prompt, list) else [
630
+ negative_prompt] * batch_size
631
+ text_embeddings = self._encode_prompt(
632
+ prompt, device, num_videos_per_prompt, do_classifier_free_guidance, negative_prompt
633
+ )
634
+
635
+ # Prepare timesteps
636
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
637
+ timesteps, num_inference_steps = self.get_timesteps(
638
+ num_inference_steps, strength, device)
639
+ latent_timestep = timesteps[:1].repeat(batch_size)
640
+
641
+ # Prepare latent variables
642
+ num_channels_latents = self.unet.in_channels
643
+ latents = self.prepare_latents(
644
+ latent_timestep,
645
+ batch_size * num_videos_per_prompt,
646
+ 4,
647
+ video_length,
648
+ height,
649
+ width,
650
+ text_embeddings.dtype,
651
+ device,
652
+ generator,
653
+ latents,
654
+ )
655
+
656
+ shape = (batch_size, num_channels_latents, video_length, height //
657
+ self.vae_scale_factor, width // self.vae_scale_factor)
658
+
659
+ raw_image = image.copy()
660
+ image = torch.from_numpy(image)[None, ...].permute(0, 3, 1, 2)
661
+ image = image / 255 # [0, 1]
662
+ image = image * 2 - 1 # [-1, 1]
663
+ image = image.to(device=device, dtype=self.vae.dtype)
664
+
665
+ if isinstance(generator, list):
666
+ image_latent = [
667
+ self.vae.encode(image[k: k + 1]).latent_dist.sample(generator[k]) for k in range(batch_size)
668
+ ]
669
+ image_latent = torch.cat(image_latent, dim=0)
670
+ else:
671
+ image_latent = self.vae.encode(image).latent_dist.sample(generator)
672
+
673
+ image_latent = image_latent.to(device=device, dtype=self.unet.dtype)
674
+ image_latent = torch.nn.functional.interpolate(
675
+ image_latent, size=[shape[-2], shape[-1]])
676
+ image_latent_padding = image_latent.clone() * 0.18215
677
+ mask = torch.zeros((shape[0], 1, shape[2], shape[3], shape[4])).to(
678
+ device=device, dtype=self.unet.dtype)
679
+
680
+ # prepare mask
681
+ # NOTE: pass specific st_motion for real image style transfer
682
+ if mask_sim_template_idx == -1 and is_real_img:
683
+ mask_coef = prepare_mask_coef_by_statistics(
684
+ video_length, cond_frame, mask_sim_template_idx, self.st_motion)
685
+ else:
686
+ mask_coef = prepare_mask_coef_by_statistics(
687
+ video_length, cond_frame, mask_sim_template_idx)
688
+
689
+ masked_image = torch.zeros(shape[0], 4, shape[2], shape[3], shape[4]).to(
690
+ device=device, dtype=self.unet.dtype)
691
+ for f in range(video_length):
692
+ mask[:, :, f, :, :] = mask_coef[f]
693
+ masked_image[:, :, f, :, :] = image_latent_padding.clone()
694
+
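+ # mask_coef holds one scalar per frame, broadcast over the spatial dimensions: every frame
+ # is paired with the same (scaled) conditioning-image latent, and the per-frame coefficient
+ # roughly controls how strongly that frame should stay close to the input image.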
695
+ # Prepare extra step kwargs.
696
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
697
+ mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
698
+ masked_image = torch.cat(
699
+ [masked_image] * 2) if do_classifier_free_guidance else masked_image
700
+ # Denoising loop
701
+ num_warmup_steps = len(timesteps) - \
702
+ num_inference_steps * self.scheduler.order
703
+
704
+ # prepare for ip-adapter
705
+ if self.use_ip_adapter:
706
+ image_embeds, neg_image_embeds = self.encode_image(
707
+ raw_image, device, num_videos_per_prompt)
708
+ image_embeds = torch.cat([neg_image_embeds, image_embeds])
709
+ image_embeds = image_embeds.to(device, self.unet.dtype)
710
+
711
+ self.set_ip_adapter_scale(ip_adapter_scale)
712
+ print(f'Set IP-Adapter Scale as {ip_adapter_scale}')
713
+
714
+ else:
715
+ image_embeds = None
716
+
717
+ # if strength < 1, start from the masked-image latents with added noise instead of pure Gaussian latents
718
+ if strength < 1:
719
+ noise = torch.randn_like(latents)
720
+ latents = self.scheduler.add_noise(
721
+ masked_image[0], noise, timesteps[0])
722
+
723
+ if progress_fn is None:
724
+ progress_bar = tqdm(timesteps)
725
+ terminal_pbar = None
726
+ else:
727
+ progress_bar = progress_fn.tqdm(timesteps)
728
+ terminal_pbar = tqdm(total=len(timesteps))
729
+
730
+ for i, t in enumerate(progress_bar):
731
+ # expand the latents if we are doing classifier free guidance
732
+ latent_model_input = torch.cat(
733
+ [latents] * 2) if do_classifier_free_guidance else latents
734
+ latent_model_input = self.scheduler.scale_model_input(
735
+ latent_model_input, t)
736
+
737
+ # predict the noise residual
738
+ noise_pred = self.unet(
739
+ latent_model_input,
740
+ mask,
741
+ masked_image,
742
+ t,
743
+ encoder_hidden_states=text_embeddings,
744
+ image_embeds=image_embeds
745
+ )['sample']
746
+
747
+ # perform guidance
748
+ if do_classifier_free_guidance:
749
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
750
+ noise_pred = noise_pred_uncond + guidance_scale * \
751
+ (noise_pred_text - noise_pred_uncond)
752
+
753
+ # compute the previous noisy sample x_t -> x_t-1
754
+ latents = self.scheduler.step(
755
+ noise_pred, t, latents, **extra_step_kwargs).prev_sample
756
+
757
+ # call the callback, if provided
758
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
759
+ if callback is not None and i % callback_steps == 0:
760
+ callback(i, t, latents)
761
+
762
+ if terminal_pbar is not None:
763
+ terminal_pbar.update(1)
764
+
765
+ # Post-processing
766
+ video = self.decode_latents(latents.to(device, dtype=self.vae.dtype))
767
+
768
+ # Convert to tensor
769
+ if output_type == "tensor":
770
+ video = torch.from_numpy(video)
771
+
772
+ if not return_dict:
773
+ return video
774
+
775
+ return AnimationPipelineOutput(videos=video)
animatediff/pipelines/pipeline_animation.py ADDED
@@ -0,0 +1,446 @@
1
+ # Adapted from https://github.com/showlab/Tune-A-Video/blob/main/tuneavideo/pipelines/pipeline_tuneavideo.py
2
+
3
+ import inspect
4
+ from typing import Callable, List, Optional, Union
5
+ from dataclasses import dataclass
6
+
7
+ import numpy as np
8
+ import torch
9
+ from tqdm import tqdm
10
+
11
+ from diffusers.utils import is_accelerate_available
12
+ from packaging import version
13
+ from transformers import CLIPTextModel, CLIPTokenizer
14
+
15
+ from diffusers.configuration_utils import FrozenDict
16
+ from diffusers.models import AutoencoderKL
17
+ from diffusers.pipelines import DiffusionPipeline
18
+ from diffusers.schedulers import (
19
+ DDIMScheduler,
20
+ DPMSolverMultistepScheduler,
21
+ EulerAncestralDiscreteScheduler,
22
+ EulerDiscreteScheduler,
23
+ LMSDiscreteScheduler,
24
+ PNDMScheduler,
25
+ )
26
+ from diffusers.utils import deprecate, logging, BaseOutput
27
+
28
+ from einops import rearrange
29
+
30
+ from ..models.unet import UNet3DConditionModel
31
+
32
+
33
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
34
+
35
+
36
+ @dataclass
37
+ class AnimationPipelineOutput(BaseOutput):
38
+ videos: Union[torch.Tensor, np.ndarray]
39
+
40
+
41
+ class AnimationPipeline(DiffusionPipeline):
42
+ _optional_components = []
43
+
44
+ def __init__(
45
+ self,
46
+ vae: AutoencoderKL,
47
+ text_encoder: CLIPTextModel,
48
+ tokenizer: CLIPTokenizer,
49
+ unet: UNet3DConditionModel,
50
+ scheduler: Union[
51
+ DDIMScheduler,
52
+ PNDMScheduler,
53
+ LMSDiscreteScheduler,
54
+ EulerDiscreteScheduler,
55
+ EulerAncestralDiscreteScheduler,
56
+ DPMSolverMultistepScheduler,
57
+ ],
58
+ ):
59
+ super().__init__()
60
+
61
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
62
+ deprecation_message = (
63
+ f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
64
+ f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
65
+ "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
66
+ " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
67
+ " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
68
+ " file"
69
+ )
70
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
71
+ new_config = dict(scheduler.config)
72
+ new_config["steps_offset"] = 1
73
+ scheduler._internal_dict = FrozenDict(new_config)
74
+
75
+ if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
76
+ deprecation_message = (
77
+ f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
78
+ " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
79
+ " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
80
+ " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
81
+ " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
82
+ )
83
+ deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
84
+ new_config = dict(scheduler.config)
85
+ new_config["clip_sample"] = False
86
+ scheduler._internal_dict = FrozenDict(new_config)
87
+
88
+ is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
89
+ version.parse(unet.config._diffusers_version).base_version
90
+ ) < version.parse("0.9.0.dev0")
91
+ is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
92
+ if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
93
+ deprecation_message = (
94
+ "The configuration file of the unet has set the default `sample_size` to smaller than"
95
+ " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
96
+ " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
97
+ " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
98
+ " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
99
+ " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
100
+ " in the config might lead to incorrect results in future versions. If you have downloaded this"
101
+ " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
102
+ " the `unet/config.json` file"
103
+ )
104
+ deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
105
+ new_config = dict(unet.config)
106
+ new_config["sample_size"] = 64
107
+ unet._internal_dict = FrozenDict(new_config)
108
+
109
+ self.register_modules(
110
+ vae=vae,
111
+ text_encoder=text_encoder,
112
+ tokenizer=tokenizer,
113
+ unet=unet,
114
+ scheduler=scheduler,
115
+ )
116
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
117
+
118
+ def enable_vae_slicing(self):
119
+ self.vae.enable_slicing()
120
+
121
+ def disable_vae_slicing(self):
122
+ self.vae.disable_slicing()
123
+
124
+ def enable_sequential_cpu_offload(self, gpu_id=0):
125
+ if is_accelerate_available():
126
+ from accelerate import cpu_offload
127
+ else:
128
+ raise ImportError("Please install accelerate via `pip install accelerate`")
129
+
130
+ device = torch.device(f"cuda:{gpu_id}")
131
+
132
+ for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
133
+ if cpu_offloaded_model is not None:
134
+ cpu_offload(cpu_offloaded_model, device)
135
+
136
+
137
+ @property
138
+ def _execution_device(self):
139
+ if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
140
+ return self.device
141
+ for module in self.unet.modules():
142
+ if (
143
+ hasattr(module, "_hf_hook")
144
+ and hasattr(module._hf_hook, "execution_device")
145
+ and module._hf_hook.execution_device is not None
146
+ ):
147
+ return torch.device(module._hf_hook.execution_device)
148
+ return self.device
149
+
150
+ def _encode_prompt(self, prompt, device, num_videos_per_prompt, do_classifier_free_guidance, negative_prompt):
151
+ batch_size = len(prompt) if isinstance(prompt, list) else 1
152
+
153
+ text_inputs = self.tokenizer(
154
+ prompt,
155
+ padding="max_length",
156
+ max_length=self.tokenizer.model_max_length,
157
+ truncation=True,
158
+ return_tensors="pt",
159
+ )
160
+ text_input_ids = text_inputs.input_ids
161
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
162
+
163
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
164
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
165
+ logger.warning(
166
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
167
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
168
+ )
169
+
170
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
171
+ attention_mask = text_inputs.attention_mask.to(device)
172
+ else:
173
+ attention_mask = None
174
+
175
+ text_embeddings = self.text_encoder(
176
+ text_input_ids.to(device),
177
+ attention_mask=attention_mask,
178
+ )
179
+ text_embeddings = text_embeddings[0]
180
+
181
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
182
+ bs_embed, seq_len, _ = text_embeddings.shape
183
+ text_embeddings = text_embeddings.repeat(1, num_videos_per_prompt, 1)
184
+ text_embeddings = text_embeddings.view(bs_embed * num_videos_per_prompt, seq_len, -1)
185
+
186
+ # get unconditional embeddings for classifier free guidance
187
+ if do_classifier_free_guidance:
188
+ uncond_tokens: List[str]
189
+ if negative_prompt is None:
190
+ uncond_tokens = [""] * batch_size
191
+ elif type(prompt) is not type(negative_prompt):
192
+ raise TypeError(
193
+ f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
194
+ f" {type(prompt)}."
195
+ )
196
+ elif isinstance(negative_prompt, str):
197
+ uncond_tokens = [negative_prompt]
198
+ elif batch_size != len(negative_prompt):
199
+ raise ValueError(
200
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
201
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
202
+ " the batch size of `prompt`."
203
+ )
204
+ else:
205
+ uncond_tokens = negative_prompt
206
+
207
+ max_length = text_input_ids.shape[-1]
208
+ uncond_input = self.tokenizer(
209
+ uncond_tokens,
210
+ padding="max_length",
211
+ max_length=max_length,
212
+ truncation=True,
213
+ return_tensors="pt",
214
+ )
215
+
216
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
217
+ attention_mask = uncond_input.attention_mask.to(device)
218
+ else:
219
+ attention_mask = None
220
+
221
+ uncond_embeddings = self.text_encoder(
222
+ uncond_input.input_ids.to(device),
223
+ attention_mask=attention_mask,
224
+ )
225
+ uncond_embeddings = uncond_embeddings[0]
226
+
227
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
228
+ seq_len = uncond_embeddings.shape[1]
229
+ uncond_embeddings = uncond_embeddings.repeat(1, num_videos_per_prompt, 1)
230
+ uncond_embeddings = uncond_embeddings.view(batch_size * num_videos_per_prompt, seq_len, -1)
231
+
232
+ # For classifier free guidance, we need to do two forward passes.
233
+ # Here we concatenate the unconditional and text embeddings into a single batch
234
+ # to avoid doing two forward passes
235
+ text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
236
+
237
+ return text_embeddings
238
+
239
+ def decode_latents(self, latents):
240
+ video_length = latents.shape[2]
241
+ latents = 1 / 0.18215 * latents
242
+ latents = rearrange(latents, "b c f h w -> (b f) c h w")
243
+ # video = self.vae.decode(latents).sample
244
+ video = []
245
+ for frame_idx in tqdm(range(latents.shape[0])):
246
+ video.append(self.vae.decode(latents[frame_idx:frame_idx+1]).sample)
247
+ video = torch.cat(video)
248
+ video = rearrange(video, "(b f) c h w -> b c f h w", f=video_length)
249
+ video = (video / 2 + 0.5).clamp(0, 1)
250
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
251
+ video = video.cpu().float().numpy()
252
+ return video
253
+
254
+ def prepare_extra_step_kwargs(self, generator, eta):
255
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
256
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
257
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
258
+ # and should be between [0, 1]
259
+
260
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
261
+ extra_step_kwargs = {}
262
+ if accepts_eta:
263
+ extra_step_kwargs["eta"] = eta
264
+
265
+ # check if the scheduler accepts generator
266
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
267
+ if accepts_generator:
268
+ extra_step_kwargs["generator"] = generator
269
+ return extra_step_kwargs
270
+
271
+ def check_inputs(self, prompt, height, width, callback_steps):
272
+ if not isinstance(prompt, str) and not isinstance(prompt, list):
273
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
274
+
275
+ if height % 8 != 0 or width % 8 != 0:
276
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
277
+
278
+ if (callback_steps is None) or (
279
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
280
+ ):
281
+ raise ValueError(
282
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
283
+ f" {type(callback_steps)}."
284
+ )
285
+
286
+ def prepare_latents(self, batch_size, num_channels_latents, video_length, height, width, dtype, device, generator, latents=None):
287
+ shape = (batch_size, num_channels_latents, video_length, height // self.vae_scale_factor, width // self.vae_scale_factor)
288
+ if isinstance(generator, list) and len(generator) != batch_size:
289
+ raise ValueError(
290
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
291
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
292
+ )
293
+ if latents is None:
294
+ rand_device = "cpu" if device.type == "mps" else device
295
+
296
+ if isinstance(generator, list):
297
+ shape = shape
298
+ # shape = (1,) + shape[1:]
299
+ latents = [
300
+ torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype)
301
+ for i in range(batch_size)
302
+ ]
303
+ latents = torch.cat(latents, dim=0).to(device)
304
+ else:
305
+ latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype).to(device)
306
+ else:
307
+ if latents.shape != shape:
308
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
309
+ latents = latents.to(device)
310
+
311
+ # scale the initial noise by the standard deviation required by the scheduler
312
+ latents = latents * self.scheduler.init_noise_sigma
313
+ return latents
314
+
315
+ @torch.no_grad()
316
+ def __call__(
317
+ self,
318
+ prompt: Union[str, List[str]],
319
+ video_length: Optional[int],
320
+ height: Optional[int] = None,
321
+ width: Optional[int] = None,
322
+ num_inference_steps: int = 50,
323
+ guidance_scale: float = 7.5,
324
+ negative_prompt: Optional[Union[str, List[str]]] = None,
325
+ num_videos_per_prompt: Optional[int] = 1,
326
+ eta: float = 0.0,
327
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
328
+ latents: Optional[torch.FloatTensor] = None,
329
+ output_type: Optional[str] = "tensor",
330
+ return_dict: bool = True,
331
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
332
+ callback_steps: Optional[int] = 1,
333
+ **kwargs,
334
+ ):
335
+ # Default height and width to unet
336
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
337
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
338
+
339
+ # Check inputs. Raise error if not correct
340
+ self.check_inputs(prompt, height, width, callback_steps)
341
+
342
+ # Define call parameters
343
+ # batch_size = 1 if isinstance(prompt, str) else len(prompt)
344
+ batch_size = 1
345
+ if latents is not None:
346
+ batch_size = latents.shape[0]
347
+ if isinstance(prompt, list):
348
+ batch_size = len(prompt)
349
+
350
+ device = self._execution_device
351
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
352
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
353
+ # corresponds to doing no classifier free guidance.
354
+ do_classifier_free_guidance = guidance_scale > 1.0
355
+
356
+ # Encode input prompt
357
+ prompt = prompt if isinstance(prompt, list) else [prompt] * batch_size
358
+ if negative_prompt is not None:
359
+ negative_prompt = negative_prompt if isinstance(negative_prompt, list) else [negative_prompt] * batch_size
360
+ text_embeddings = self._encode_prompt(
361
+ prompt, device, num_videos_per_prompt, do_classifier_free_guidance, negative_prompt
362
+ )
363
+
364
+ # Prepare timesteps
365
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
366
+ timesteps = self.scheduler.timesteps
367
+
368
+ # Prepare latent variables
369
+ num_channels_latents = self.unet.in_channels
370
+ latents = self.prepare_latents(
371
+ batch_size * num_videos_per_prompt,
372
+ num_channels_latents,
373
+ video_length,
374
+ height,
375
+ width,
376
+ text_embeddings.dtype,
377
+ device,
378
+ generator,
379
+ latents,
380
+ )
381
+ latents_dtype = latents.dtype
382
+
383
+ # Prepare extra step kwargs.
384
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
385
+
386
+ # Denoising loop
387
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
388
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
389
+ for i, t in enumerate(timesteps):
390
+
391
+ # import os
392
+ # import cv2 as cv
393
+ # feature_path = f'feature/timestep{t}'
394
+ # if not os.path.exists(feature_path):
395
+ # os.makedirs(feature_path)
396
+ # # latents B C F H W -> B C H W -> B H W -> H W
397
+ # #
398
+ # features = latents.sum(dim=1, keepdim=False)
399
+ # features = [features[:,frame,:,:] for frame in range(video_length)]
400
+ # features = [feature.squeeze(0) for feature in features]
401
+ # features = [feature.detach().cpu().numpy() for feature in features]
402
+
403
+ # features = [((feature - feature.min()) / (feature.max() - feature.min()) * 255) for feature in features]
404
+
405
+ # for feature_num in range(len(features)):
406
+ # cv.imwrite(f'{feature_path}/{feature_num}.jpg', features[feature_num])
407
+
408
+ # expand the latents if we are doing classifier free guidance
409
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
410
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
411
+
412
+ # predict the noise residual
413
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample.to(dtype=latents_dtype)
414
+ # noise_pred = []
415
+ # import pdb
416
+ # pdb.set_trace()
417
+ # for batch_idx in range(latent_model_input.shape[0]):
418
+ # noise_pred_single = self.unet(latent_model_input[batch_idx:batch_idx+1], t, encoder_hidden_states=text_embeddings[batch_idx:batch_idx+1]).sample.to(dtype=latents_dtype)
419
+ # noise_pred.append(noise_pred_single)
420
+ # noise_pred = torch.cat(noise_pred)
421
+
422
+ # perform guidance
423
+ if do_classifier_free_guidance:
424
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
425
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
426
+
427
+ # compute the previous noisy sample x_t -> x_t-1
428
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
429
+
430
+ # call the callback, if provided
431
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
432
+ progress_bar.update()
433
+ if callback is not None and i % callback_steps == 0:
434
+ callback(i, t, latents)
435
+
436
+ # Post-processing
437
+ video = self.decode_latents(latents)
438
+
439
+ # Convert to tensor
440
+ if output_type == "tensor":
441
+ video = torch.from_numpy(video)
442
+
443
+ if not return_dict:
444
+ return video
445
+
446
+ return AnimationPipelineOutput(videos=video)
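A minimal usage sketch for the AnimationPipeline above (illustrative, not part of this commit). It assumes a Stable Diffusion v1.5-style checkpoint layout on the Hub and an already-constructed `unet` of type UNet3DConditionModel with the motion module loaded; the model id, prompt, seed, and output path are placeholders.

import torch
from diffusers import AutoencoderKL, DDIMScheduler
from transformers import CLIPTextModel, CLIPTokenizer

from animatediff.pipelines.pipeline_animation import AnimationPipeline
from animatediff.utils.util import save_videos_grid

base = "runwayml/stable-diffusion-v1-5"  # assumed base checkpoint
vae = AutoencoderKL.from_pretrained(base, subfolder="vae")
tokenizer = CLIPTokenizer.from_pretrained(base, subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained(base, subfolder="text_encoder")
scheduler = DDIMScheduler.from_pretrained(base, subfolder="scheduler")

# `unet` is assumed to be a UNet3DConditionModel built from animatediff/models/unet.py
# (e.g. via a from_pretrained_2d-style helper, if this repo provides one).
pipeline = AnimationPipeline(
    vae=vae, text_encoder=text_encoder, tokenizer=tokenizer,
    unet=unet, scheduler=scheduler,
).to("cuda")

out = pipeline(
    prompt="a corgi running on the beach, best quality",
    negative_prompt="low quality, blurry",
    video_length=16, height=512, width=512,
    num_inference_steps=25, guidance_scale=7.5,
    generator=torch.Generator("cuda").manual_seed(42),
)
# out.videos is a (batch, channel, frame, height, width) tensor in [0, 1]
save_videos_grid(out.videos, "samples/corgi.gif")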
animatediff/pipelines/validation_pipeline.py ADDED
@@ -0,0 +1,504 @@
1
+ # Adapted from https://github.com/showlab/Tune-A-Video/blob/main/tuneavideo/pipelines/pipeline_tuneavideo.py
2
+
3
+ import inspect
4
+ from typing import Callable, List, Optional, Union
5
+ from dataclasses import dataclass
6
+ import random
7
+ import argparse
8
+
9
+ import numpy as np
10
+ import torch
11
+ from tqdm import tqdm
12
+ from omegaconf import OmegaConf
13
+
14
+ from diffusers.utils import is_accelerate_available
15
+ from packaging import version
16
+ from transformers import CLIPTextModel, CLIPTokenizer
17
+
18
+ import os
19
+ from safetensors import safe_open
20
+
21
+ from diffusers.configuration_utils import FrozenDict
22
+ from diffusers.models import AutoencoderKL
23
+ from diffusers.pipelines import DiffusionPipeline
24
+ from diffusers.schedulers import (
25
+ DDIMScheduler,
26
+ DPMSolverMultistepScheduler,
27
+ EulerAncestralDiscreteScheduler,
28
+ EulerDiscreteScheduler,
29
+ LMSDiscreteScheduler,
30
+ PNDMScheduler,
31
+ )
32
+ from diffusers.utils import deprecate, logging, BaseOutput
33
+
34
+ from einops import rearrange
35
+
36
+ from animatediff.models.unet import UNet3DConditionModel
37
+
38
+ from animatediff.utils.convert_from_ckpt import convert_ldm_unet_checkpoint, convert_ldm_clip_checkpoint, convert_ldm_vae_checkpoint
39
+ from animatediff.utils.convert_lora_safetensor_to_diffusers import convert_lora
40
+
41
+ from animatediff.utils.util import prepare_mask_coef, save_videos_grid
42
+ from animatediff.models.resnet import InflatedConv3d
43
+
44
+ from PIL import Image
45
+
46
+ PIL_INTERPOLATION = {
47
+ "linear": Image.Resampling.BILINEAR,
48
+ "bilinear": Image.Resampling.BILINEAR,
49
+ "bicubic": Image.Resampling.BICUBIC,
50
+ "lanczos": Image.Resampling.LANCZOS,
51
+ "nearest": Image.Resampling.NEAREST,
52
+ }
53
+ def preprocess_image(image):
54
+ if isinstance(image, torch.Tensor):
55
+ return image
56
+ elif isinstance(image, Image.Image):
57
+ image = [image]
58
+
59
+ if isinstance(image[0], Image.Image):
60
+ w, h = image[0].size
61
+ w, h = map(lambda x: x - x % 8, (w, h)) # resize to integer multiple of 8
62
+
63
+ image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
64
+ image = np.concatenate(image, axis=0)
65
+ if len(image.shape) == 3:
66
+ image = image.reshape(image.shape[0], image.shape[1], image.shape[2], 1)
67
+ image = np.array(image).astype(np.float32) / 255.0
68
+ image = image.transpose(0, 3, 1, 2)
69
+ image = 2.0 * image - 1.0
70
+ image = torch.from_numpy(image)
71
+ elif isinstance(image[0], torch.Tensor):
72
+ image = torch.cat(image, dim=0)
73
+ return image
74
+
75
+
76
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
77
+
78
+
79
+ @dataclass
80
+ class AnimationPipelineOutput(BaseOutput):
81
+ videos: Union[torch.Tensor, np.ndarray]
82
+
83
+
84
+ class ValidationPipeline(DiffusionPipeline):
85
+ _optional_components = []
86
+
87
+ def __init__(
88
+ self,
89
+ vae: AutoencoderKL,
90
+ text_encoder: CLIPTextModel,
91
+ tokenizer: CLIPTokenizer,
92
+ unet: UNet3DConditionModel,
93
+ scheduler: Union[
94
+ DDIMScheduler,
95
+ PNDMScheduler,
96
+ LMSDiscreteScheduler,
97
+ EulerDiscreteScheduler,
98
+ EulerAncestralDiscreteScheduler,
99
+ DPMSolverMultistepScheduler,
100
+ ],
101
+ ):
102
+ super().__init__()
103
+
104
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
105
+ deprecation_message = (
106
+ f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
107
+ f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
108
+ "to update the config accordingly as leaving `steps_offset` might lead to incorrect results"
109
+ " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
110
+ " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
111
+ " file"
112
+ )
113
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
114
+ new_config = dict(scheduler.config)
115
+ new_config["steps_offset"] = 1
116
+ scheduler._internal_dict = FrozenDict(new_config)
117
+
118
+ if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
119
+ deprecation_message = (
120
+ f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
121
+ " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
122
+ " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
123
+ " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
124
+ " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
125
+ )
126
+ deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
127
+ new_config = dict(scheduler.config)
128
+ new_config["clip_sample"] = False
129
+ scheduler._internal_dict = FrozenDict(new_config)
130
+
131
+ is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
132
+ version.parse(unet.config._diffusers_version).base_version
133
+ ) < version.parse("0.9.0.dev0")
134
+ is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
135
+ if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
136
+ deprecation_message = (
137
+ "The configuration file of the unet has set the default `sample_size` to smaller than"
138
+ " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
139
+ " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
140
+ " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
141
+ " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
142
+ " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
143
+ " in the config might lead to incorrect results in future versions. If you have downloaded this"
144
+ " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
145
+ " the `unet/config.json` file"
146
+ )
147
+ deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
148
+ new_config = dict(unet.config)
149
+ new_config["sample_size"] = 64
150
+ unet._internal_dict = FrozenDict(new_config)
151
+
152
+ self.register_modules(
153
+ vae=vae,
154
+ text_encoder=text_encoder,
155
+ tokenizer=tokenizer,
156
+ unet=unet,
157
+ scheduler=scheduler,
158
+ )
159
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
160
+
161
+ def enable_vae_slicing(self):
162
+ self.vae.enable_slicing()
163
+
164
+ def disable_vae_slicing(self):
165
+ self.vae.disable_slicing()
166
+
167
+ def enable_sequential_cpu_offload(self, gpu_id=0):
168
+ if is_accelerate_available():
169
+ from accelerate import cpu_offload
170
+ else:
171
+ raise ImportError("Please install accelerate via `pip install accelerate`")
172
+
173
+ device = torch.device(f"cuda:{gpu_id}")
174
+
175
+ for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
176
+ if cpu_offloaded_model is not None:
177
+ cpu_offload(cpu_offloaded_model, device)
178
+
179
+
180
+ @property
181
+ def _execution_device(self):
182
+ if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
183
+ return self.device
184
+ for module in self.unet.modules():
185
+ if (
186
+ hasattr(module, "_hf_hook")
187
+ and hasattr(module._hf_hook, "execution_device")
188
+ and module._hf_hook.execution_device is not None
189
+ ):
190
+ return torch.device(module._hf_hook.execution_device)
191
+ return self.device
192
+
193
+ def _encode_prompt(self, prompt, device, num_videos_per_prompt, do_classifier_free_guidance, negative_prompt):
194
+ batch_size = len(prompt) if isinstance(prompt, list) else 1
195
+
196
+ text_inputs = self.tokenizer(
197
+ prompt,
198
+ padding="max_length",
199
+ max_length=self.tokenizer.model_max_length,
200
+ truncation=True,
201
+ return_tensors="pt",
202
+ )
203
+ text_input_ids = text_inputs.input_ids
204
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
205
+
206
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
207
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
208
+ logger.warning(
209
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
210
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
211
+ )
212
+
213
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
214
+ attention_mask = text_inputs.attention_mask.to(device)
215
+ else:
216
+ attention_mask = None
217
+
218
+ text_embeddings = self.text_encoder(
219
+ text_input_ids.to(device),
220
+ attention_mask=attention_mask,
221
+ )
222
+ text_embeddings = text_embeddings[0]
223
+
224
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
225
+ bs_embed, seq_len, _ = text_embeddings.shape
226
+ text_embeddings = text_embeddings.repeat(1, num_videos_per_prompt, 1)
227
+ text_embeddings = text_embeddings.view(bs_embed * num_videos_per_prompt, seq_len, -1)
228
+
229
+ # get unconditional embeddings for classifier free guidance
230
+ if do_classifier_free_guidance:
231
+ uncond_tokens: List[str]
232
+ if negative_prompt is None:
233
+ uncond_tokens = [""] * batch_size
234
+ elif type(prompt) is not type(negative_prompt):
235
+ raise TypeError(
236
+ f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
237
+ f" {type(prompt)}."
238
+ )
239
+ elif isinstance(negative_prompt, str):
240
+ uncond_tokens = [negative_prompt]
241
+ elif batch_size != len(negative_prompt):
242
+ raise ValueError(
243
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
244
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
245
+ " the batch size of `prompt`."
246
+ )
247
+ else:
248
+ uncond_tokens = negative_prompt
249
+
250
+ max_length = text_input_ids.shape[-1]
251
+ uncond_input = self.tokenizer(
252
+ uncond_tokens,
253
+ padding="max_length",
254
+ max_length=max_length,
255
+ truncation=True,
256
+ return_tensors="pt",
257
+ )
258
+
259
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
260
+ attention_mask = uncond_input.attention_mask.to(device)
261
+ else:
262
+ attention_mask = None
263
+
264
+ uncond_embeddings = self.text_encoder(
265
+ uncond_input.input_ids.to(device),
266
+ attention_mask=attention_mask,
267
+ )
268
+ uncond_embeddings = uncond_embeddings[0]
269
+
270
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
271
+ seq_len = uncond_embeddings.shape[1]
272
+ uncond_embeddings = uncond_embeddings.repeat(1, num_videos_per_prompt, 1)
273
+ uncond_embeddings = uncond_embeddings.view(batch_size * num_videos_per_prompt, seq_len, -1)
274
+
275
+ # For classifier free guidance, we need to do two forward passes.
276
+ # Here we concatenate the unconditional and text embeddings into a single batch
277
+ # to avoid doing two forward passes
278
+ text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
279
+
280
+ return text_embeddings
281
+
282
+ def decode_latents(self, latents):
283
+ video_length = latents.shape[2]
284
+ latents = 1 / 0.18215 * latents
285
+ latents = rearrange(latents, "b c f h w -> (b f) c h w")
286
+ # video = self.vae.decode(latents).sample
287
+ video = []
288
+ for frame_idx in tqdm(range(latents.shape[0])):
289
+ video.append(self.vae.decode(latents[frame_idx:frame_idx+1]).sample)
290
+ video = torch.cat(video)
291
+ video = rearrange(video, "(b f) c h w -> b c f h w", f=video_length)
292
+ video = (video / 2 + 0.5).clamp(0, 1)
293
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
294
+ video = video.cpu().float().numpy()
295
+ return video
296
+
297
+ def prepare_extra_step_kwargs(self, generator, eta):
298
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
299
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
300
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
301
+ # and should be between [0, 1]
302
+
303
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
304
+ extra_step_kwargs = {}
305
+ if accepts_eta:
306
+ extra_step_kwargs["eta"] = eta
307
+
308
+ # check if the scheduler accepts generator
309
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
310
+ if accepts_generator:
311
+ extra_step_kwargs["generator"] = generator
312
+ return extra_step_kwargs
313
+
314
+ def check_inputs(self, prompt, height, width, callback_steps):
315
+ if not isinstance(prompt, str) and not isinstance(prompt, list):
316
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
317
+
318
+ if height % 8 != 0 or width % 8 != 0:
319
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
320
+
321
+ if (callback_steps is None) or (
322
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
323
+ ):
324
+ raise ValueError(
325
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
326
+ f" {type(callback_steps)}."
327
+ )
328
+
329
+ def prepare_latents(self, batch_size, num_channels_latents, video_length, height, width, dtype, device, generator, latents=None):
330
+ shape = (batch_size, num_channels_latents, video_length, height // self.vae_scale_factor, width // self.vae_scale_factor)
331
+
332
+ if isinstance(generator, list) and len(generator) != batch_size:
333
+ raise ValueError(
334
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
335
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
336
+ )
337
+ if latents is None:
338
+ rand_device = "cpu" if device.type == "mps" else device
339
+
340
+ if isinstance(generator, list):
341
+ shape = shape
342
+ # shape = (1,) + shape[1:]
343
+ latents = [
344
+ torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype)
345
+ for i in range(batch_size)
346
+ ]
347
+ latents = torch.cat(latents, dim=0).to(device)
348
+ else:
349
+ latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype).to(device)
350
+ else:
351
+ if latents.shape != shape:
352
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
353
+ latents = latents.to(device)
354
+
355
+ # scale the initial noise by the standard deviation required by the scheduler
356
+ latents = latents * self.scheduler.init_noise_sigma
357
+ return latents
358
+
359
+ @torch.no_grad()
360
+ def __call__(
361
+ self,
362
+ prompt: Union[str, List[str]],
363
+ use_image: bool,
364
+ video_length: Optional[int],
365
+ height: Optional[int] = None,
366
+ width: Optional[int] = None,
367
+ num_inference_steps: int = 50,
368
+ guidance_scale: float = 7.5,
369
+ negative_prompt: Optional[Union[str, List[str]]] = None,
370
+ num_videos_per_prompt: Optional[int] = 1,
371
+ eta: float = 0.0,
372
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
373
+ latents: Optional[torch.FloatTensor] = None,
374
+ output_type: Optional[str] = "tensor",
375
+ return_dict: bool = True,
376
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
377
+ callback_steps: Optional[int] = 1,
378
+ **kwargs,
379
+ ):
380
+ # Default height and width to unet
381
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
382
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
383
+
384
+ # Check inputs. Raise error if not correct
385
+ self.check_inputs(prompt, height, width, callback_steps)
386
+
387
+ # Define call parameters
388
+ # batch_size = 1 if isinstance(prompt, str) else len(prompt)
389
+ batch_size = 1
390
+ if latents is not None:
391
+ batch_size = latents.shape[0]
392
+ if isinstance(prompt, list):
393
+ batch_size = len(prompt)
394
+
395
+ device = self._execution_device
396
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
397
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
398
+ # corresponds to doing no classifier free guidance.
399
+ do_classifier_free_guidance = guidance_scale > 1.0
400
+
401
+ # Encode input prompt
402
+ prompt = prompt if isinstance(prompt, list) else [prompt] * batch_size
403
+ if negative_prompt is not None:
404
+ negative_prompt = negative_prompt if isinstance(negative_prompt, list) else [negative_prompt] * batch_size
405
+ text_embeddings = self._encode_prompt(
406
+ prompt, device, num_videos_per_prompt, do_classifier_free_guidance, negative_prompt
407
+ )
408
+
409
+ # Prepare timesteps
410
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
411
+ timesteps = self.scheduler.timesteps
412
+
413
+ # Prepare latent variables
414
+ num_channels_latents = self.unet.in_channels
415
+ latents = self.prepare_latents(
416
+ batch_size * num_videos_per_prompt,
417
+ num_channels_latents,
418
+ video_length,
419
+ height,
420
+ width,
421
+ text_embeddings.dtype,
422
+ device,
423
+ generator,
424
+ latents,
425
+ )
426
+ latents_dtype = latents.dtype
427
+
428
+ if use_image != False:
429
+ shape = (batch_size, num_channels_latents, video_length, height // self.vae_scale_factor, width // self.vae_scale_factor)
430
+
431
+ image = Image.open(f'test_image/init_image{use_image}.png').convert('RGB')
432
+ image = preprocess_image(image).to(device)
433
+ if isinstance(generator, list):
434
+ image_latent = [
435
+ self.vae.encode(image[k : k + 1]).latent_dist.sample(generator[k]) for k in range(batch_size)
436
+ ]
437
+ image_latent = torch.cat(image_latent, dim=0).to(device=device)
438
+ else:
439
+ image_latent = self.vae.encode(image).latent_dist.sample(generator).to(device=device)
440
+
441
+ image_latent = torch.nn.functional.interpolate(image_latent, size=[shape[-2], shape[-1]])
442
+ image_latent_padding = image_latent.clone() * 0.18215
443
+ mask = torch.zeros((shape[0], 1, shape[2], shape[3], shape[4])).to(device)
444
+ mask_coef = prepare_mask_coef(video_length, 0, kwargs['mask_sim_range'])
445
+
446
+ add_noise = torch.randn(shape).to(device)
447
+ masked_image = torch.zeros(shape).to(device)
448
+ for f in range(video_length):
449
+ mask[:,:,f,:,:] = mask_coef[f]
450
+ masked_image[:,:,f,:,:] = image_latent_padding.clone()
451
+ mask = mask.to(device)
452
+ else:
453
+ shape = (batch_size, num_channels_latents, video_length, height // self.vae_scale_factor, width // self.vae_scale_factor)
454
+ add_noise = torch.zeros_like(latents).to(device)
455
+ masked_image = add_noise
456
+ mask = torch.zeros((shape[0], 1, shape[2], shape[3], shape[4])).to(device)
457
+
458
+ # Prepare extra step kwargs.
459
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
460
+ mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
461
+ masked_image = torch.cat([masked_image] * 2) if do_classifier_free_guidance else masked_image
462
+ # Denoising loop
463
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
464
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
465
+ for i, t in enumerate(timesteps):
466
+ # expand the latents if we are doing classifier free guidance
467
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
468
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
469
+
470
+ # predict the noise residual
471
+ noise_pred = self.unet(latent_model_input, mask, masked_image, t, encoder_hidden_states=text_embeddings).sample.to(dtype=latents_dtype)
472
+ # noise_pred = []
473
+ # import pdb
474
+ # pdb.set_trace()
475
+ # for batch_idx in range(latent_model_input.shape[0]):
476
+ # noise_pred_single = self.unet(latent_model_input[batch_idx:batch_idx+1], t, encoder_hidden_states=text_embeddings[batch_idx:batch_idx+1]).sample.to(dtype=latents_dtype)
477
+ # noise_pred.append(noise_pred_single)
478
+ # noise_pred = torch.cat(noise_pred)
479
+
480
+ # perform guidance
481
+ if do_classifier_free_guidance:
482
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
483
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
484
+
485
+ # compute the previous noisy sample x_t -> x_t-1
486
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
487
+
488
+ # call the callback, if provided
489
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
490
+ progress_bar.update()
491
+ if callback is not None and i % callback_steps == 0:
492
+ callback(i, t, latents)
493
+
494
+ # Post-processing
495
+ video = self.decode_latents(latents)
496
+
497
+ # Convert to tensor
498
+ if output_type == "tensor":
499
+ video = torch.from_numpy(video)
500
+
501
+ if not return_dict:
502
+ return video
503
+
504
+ return AnimationPipelineOutput(videos=video)
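The ValidationPipeline differs from AnimationPipeline mainly in the image-conditioned branch of `__call__`: when `use_image` is truthy it loads `test_image/init_image{use_image}.png`, encodes it with the VAE, and blends that latent into every frame using per-frame coefficients from `prepare_mask_coef`, passing the resulting mask and masked image to the inflated UNet. A hedged call sketch follows; `pipeline` is assumed to be assembled like the AnimationPipeline example above, and the value and format of `mask_sim_range` are assumptions about `prepare_mask_coef`, not something pinned down in this file.

import torch
from animatediff.utils.util import save_videos_grid

out = pipeline(                          # assumed: a ValidationPipeline built as in the previous sketch
    prompt="a girl smiling, best quality",
    use_image=1,                         # reads test_image/init_image1.png (hard-coded path in __call__)
    video_length=16, height=512, width=512,
    num_inference_steps=25, guidance_scale=7.5,
    generator=torch.Generator("cuda").manual_seed(42),
    mask_sim_range=[0.2, 1.0],           # hypothetical similarity range forwarded to prepare_mask_coef
)
save_videos_grid(out.videos, "samples/validation.gif")  # (B, C, F, H, W) tensor in [0, 1]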
animatediff/utils/convert_from_ckpt.py ADDED
@@ -0,0 +1,964 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Conversion script for the Stable Diffusion checkpoints."""
16
+
17
+ import re
18
+ from io import BytesIO
19
+ from typing import Optional
20
+
21
+ import requests
22
+ import torch
23
+ from transformers import (
24
+ AutoFeatureExtractor,
25
+ BertTokenizerFast,
26
+ CLIPImageProcessor,
27
+ CLIPTextModel,
28
+ CLIPTextModelWithProjection,
29
+ CLIPTokenizer,
30
+ CLIPVisionConfig,
31
+ CLIPVisionModelWithProjection,
32
+ )
33
+
34
+ from diffusers.models import (
35
+ AutoencoderKL,
36
+ PriorTransformer,
37
+ UNet2DConditionModel,
38
+ )
39
+ from diffusers.schedulers import (
40
+ DDIMScheduler,
41
+ DDPMScheduler,
42
+ DPMSolverMultistepScheduler,
43
+ EulerAncestralDiscreteScheduler,
44
+ EulerDiscreteScheduler,
45
+ HeunDiscreteScheduler,
46
+ LMSDiscreteScheduler,
47
+ PNDMScheduler,
48
+ UnCLIPScheduler,
49
+ )
50
+ from diffusers.utils.import_utils import BACKENDS_MAPPING
51
+
52
+
53
+ def shave_segments(path, n_shave_prefix_segments=1):
54
+ """
55
+ Removes segments. Positive values shave the first segments, negative shave the last segments.
56
+ """
57
+ if n_shave_prefix_segments >= 0:
58
+ return ".".join(path.split(".")[n_shave_prefix_segments:])
59
+ else:
60
+ return ".".join(path.split(".")[:n_shave_prefix_segments])
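For reference, `shave_segments` simply drops dot-separated segments from a checkpoint key: a non-negative count removes leading segments, a negative count removes trailing ones. A small sketch, assuming the function is imported from this module:

from animatediff.utils.convert_from_ckpt import shave_segments

key = "model.diffusion_model.input_blocks.3.0.in_layers.2.weight"
print(shave_segments(key, 2))   # "input_blocks.3.0.in_layers.2.weight"
print(shave_segments(key, -1))  # "model.diffusion_model.input_blocks.3.0.in_layers.2"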
61
+
62
+
63
+ def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
64
+ """
65
+ Updates paths inside resnets to the new naming scheme (local renaming)
66
+ """
67
+ mapping = []
68
+ for old_item in old_list:
69
+ new_item = old_item.replace("in_layers.0", "norm1")
70
+ new_item = new_item.replace("in_layers.2", "conv1")
71
+
72
+ new_item = new_item.replace("out_layers.0", "norm2")
73
+ new_item = new_item.replace("out_layers.3", "conv2")
74
+
75
+ new_item = new_item.replace("emb_layers.1", "time_emb_proj")
76
+ new_item = new_item.replace("skip_connection", "conv_shortcut")
77
+
78
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
79
+
80
+ mapping.append({"old": old_item, "new": new_item})
81
+
82
+ return mapping
83
+
84
+
85
+ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
86
+ """
87
+ Updates paths inside resnets to the new naming scheme (local renaming)
88
+ """
89
+ mapping = []
90
+ for old_item in old_list:
91
+ new_item = old_item
92
+
93
+ new_item = new_item.replace("nin_shortcut", "conv_shortcut")
94
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
95
+
96
+ mapping.append({"old": old_item, "new": new_item})
97
+
98
+ return mapping
99
+
100
+
101
+ def renew_attention_paths(old_list, n_shave_prefix_segments=0):
102
+ """
103
+ Updates paths inside attentions to the new naming scheme (local renaming)
104
+ """
105
+ mapping = []
106
+ for old_item in old_list:
107
+ new_item = old_item
108
+
109
+ # new_item = new_item.replace('norm.weight', 'group_norm.weight')
110
+ # new_item = new_item.replace('norm.bias', 'group_norm.bias')
111
+
112
+ # new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
113
+ # new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')
114
+
115
+ # new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
116
+
117
+ mapping.append({"old": old_item, "new": new_item})
118
+
119
+ return mapping
120
+
121
+
122
+ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
123
+ """
124
+ Updates paths inside attentions to the new naming scheme (local renaming)
125
+ """
126
+ mapping = []
127
+ for old_item in old_list:
128
+ new_item = old_item
129
+
130
+ new_item = new_item.replace("norm.weight", "group_norm.weight")
131
+ new_item = new_item.replace("norm.bias", "group_norm.bias")
132
+
133
+ new_item = new_item.replace("q.weight", "to_q.weight")
134
+ new_item = new_item.replace("q.bias", "to_q.bias")
135
+
136
+ new_item = new_item.replace("k.weight", "to_k.weight")
137
+ new_item = new_item.replace("k.bias", "to_k.bias")
138
+
139
+ new_item = new_item.replace("v.weight", "to_v.weight")
140
+ new_item = new_item.replace("v.bias", "to_v.bias")
141
+
142
+ new_item = new_item.replace("proj_out.weight", "to_out.0.weight")
143
+ new_item = new_item.replace("proj_out.bias", "to_out.0.bias")
144
+
145
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
146
+
147
+ mapping.append({"old": old_item, "new": new_item})
148
+ return mapping
149
+
150
+
151
+ def assign_to_checkpoint(
152
+ paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
153
+ ):
154
+ """
155
+ This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits
156
+ attention layers, and takes into account additional replacements that may arise.
157
+
158
+ Assigns the weights to the new checkpoint.
159
+ """
160
+ assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
161
+
162
+ # Splits the attention layers into three variables.
163
+ if attention_paths_to_split is not None:
164
+ for path, path_map in attention_paths_to_split.items():
165
+ old_tensor = old_checkpoint[path]
166
+ channels = old_tensor.shape[0] // 3
167
+
168
+ target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
169
+
170
+ num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
171
+
172
+ old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
173
+ query, key, value = old_tensor.split(channels // num_heads, dim=1)
174
+
175
+ checkpoint[path_map["query"]] = query.reshape(target_shape)
176
+ checkpoint[path_map["key"]] = key.reshape(target_shape)
177
+ checkpoint[path_map["value"]] = value.reshape(target_shape)
178
+
179
+ for path in paths:
180
+ new_path = path["new"]
181
+
182
+ # These have already been assigned
183
+ if attention_paths_to_split is not None and new_path in attention_paths_to_split:
184
+ continue
185
+
186
+ # Global renaming happens here
187
+ new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
188
+ new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
189
+ new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")
190
+
191
+ if additional_replacements is not None:
192
+ for replacement in additional_replacements:
193
+ new_path = new_path.replace(replacement["old"], replacement["new"])
194
+
195
+ # proj_attn.weight has to be converted from conv 1D to linear
196
+ if "proj_attn.weight" in new_path:
197
+ checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
198
+ elif 'to_out.0.weight' in new_path:
199
+ checkpoint[new_path] = old_checkpoint[path['old']].squeeze()
200
+ elif any([qkv in new_path for qkv in ['to_q', 'to_k', 'to_v']]):
201
+ checkpoint[new_path] = old_checkpoint[path['old']].squeeze()
202
+ else:
203
+ checkpoint[new_path] = old_checkpoint[path["old"]]
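Taken together, `renew_resnet_paths` renames a key locally (e.g. `in_layers.2` -> `conv1`) and `assign_to_checkpoint` then applies the block-level replacement supplied via `additional_replacements` and copies the tensor under the new name. A small worked sketch, assuming both functions are imported from this module and using a dummy tensor:

import torch
from animatediff.utils.convert_from_ckpt import assign_to_checkpoint, renew_resnet_paths

old = {"input_blocks.1.0.in_layers.2.weight": torch.zeros(320, 320, 3, 3)}
paths = renew_resnet_paths(list(old))  # local rename: in_layers.2 -> conv1
new = {}
assign_to_checkpoint(
    paths, new, old,
    additional_replacements=[{"old": "input_blocks.1.0", "new": "down_blocks.0.resnets.0"}],
)
assert "down_blocks.0.resnets.0.conv1.weight" in new  # global rename applied on top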
204
+
205
+
206
+ def conv_attn_to_linear(checkpoint):
207
+ keys = list(checkpoint.keys())
208
+ attn_keys = ["query.weight", "key.weight", "value.weight"]
209
+ for key in keys:
210
+ if ".".join(key.split(".")[-2:]) in attn_keys:
211
+ if checkpoint[key].ndim > 2:
212
+ checkpoint[key] = checkpoint[key][:, :, 0, 0]
213
+ elif "proj_attn.weight" in key:
214
+ if checkpoint[key].ndim > 2:
215
+ checkpoint[key] = checkpoint[key][:, :, 0]
216
+
217
+
218
+ def create_unet_diffusers_config(original_config, image_size: int, controlnet=False):
219
+ """
220
+ Creates a config for the diffusers based on the config of the LDM model.
221
+ """
222
+ if controlnet:
223
+ unet_params = original_config.model.params.control_stage_config.params
224
+ else:
225
+ unet_params = original_config.model.params.unet_config.params
226
+
227
+ vae_params = original_config.model.params.first_stage_config.params.ddconfig
228
+
229
+ block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult]
230
+
231
+ down_block_types = []
232
+ resolution = 1
233
+ for i in range(len(block_out_channels)):
234
+ block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D"
235
+ down_block_types.append(block_type)
236
+ if i != len(block_out_channels) - 1:
237
+ resolution *= 2
238
+
239
+ up_block_types = []
240
+ for i in range(len(block_out_channels)):
241
+ block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D"
242
+ up_block_types.append(block_type)
243
+ resolution //= 2
244
+
245
+ vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1)
246
+
247
+ head_dim = unet_params.num_heads if "num_heads" in unet_params else None
248
+ use_linear_projection = (
249
+ unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False
250
+ )
251
+ if use_linear_projection:
252
+ # stable diffusion 2-base-512 and 2-768
253
+ if head_dim is None:
254
+ head_dim = [5, 10, 20, 20]
255
+
256
+ class_embed_type = None
257
+ projection_class_embeddings_input_dim = None
258
+
259
+ if "num_classes" in unet_params:
260
+ if unet_params.num_classes == "sequential":
261
+ class_embed_type = "projection"
262
+ assert "adm_in_channels" in unet_params
263
+ projection_class_embeddings_input_dim = unet_params.adm_in_channels
264
+ else:
265
+ raise NotImplementedError(f"Unknown conditional unet num_classes config: {unet_params.num_classes}")
266
+
267
+ config = {
268
+ "sample_size": image_size // vae_scale_factor,
269
+ "in_channels": unet_params.in_channels,
270
+ "down_block_types": tuple(down_block_types),
271
+ "block_out_channels": tuple(block_out_channels),
272
+ "layers_per_block": unet_params.num_res_blocks,
273
+ "cross_attention_dim": unet_params.context_dim,
274
+ "attention_head_dim": head_dim,
275
+ "use_linear_projection": use_linear_projection,
276
+ "class_embed_type": class_embed_type,
277
+ "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim,
278
+ }
279
+
280
+ if not controlnet:
281
+ config["out_channels"] = unet_params.out_channels
282
+ config["up_block_types"] = tuple(up_block_types)
283
+
284
+ return config
285
+
286
+
287
+ def create_vae_diffusers_config(original_config, image_size: int):
288
+ """
289
+ Creates a config for the diffusers based on the config of the LDM model.
290
+ """
291
+ vae_params = original_config.model.params.first_stage_config.params.ddconfig
292
+ _ = original_config.model.params.first_stage_config.params.embed_dim
293
+
294
+ block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult]
295
+ down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)
296
+ up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels)
297
+
298
+ config = {
299
+ "sample_size": image_size,
300
+ "in_channels": vae_params.in_channels,
301
+ "out_channels": vae_params.out_ch,
302
+ "down_block_types": tuple(down_block_types),
303
+ "up_block_types": tuple(up_block_types),
304
+ "block_out_channels": tuple(block_out_channels),
305
+ "latent_channels": vae_params.z_channels,
306
+ "layers_per_block": vae_params.num_res_blocks,
307
+ }
308
+ return config
309
+
310
+
311
+ def create_diffusers_schedular(original_config):
312
+ schedular = DDIMScheduler(
313
+ num_train_timesteps=original_config.model.params.timesteps,
314
+ beta_start=original_config.model.params.linear_start,
315
+ beta_end=original_config.model.params.linear_end,
316
+ beta_schedule="scaled_linear",
317
+ )
318
+ return schedular
319
+
320
+
321
+ def create_ldm_bert_config(original_config):
322
+ bert_params = original_config.model.params.cond_stage_config.params
323
+ config = LDMBertConfig(
324
+ d_model=bert_params.n_embed,
325
+ encoder_layers=bert_params.n_layer,
326
+ encoder_ffn_dim=bert_params.n_embed * 4,
327
+ )
328
+ return config
329
+
330
+
331
+ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False, controlnet=False):
332
+ """
333
+ Takes a state dict and a config, and returns a converted checkpoint.
334
+ """
335
+
336
+ # extract state_dict for UNet
337
+ unet_state_dict = {}
338
+ keys = list(checkpoint.keys())
339
+
340
+ if controlnet:
341
+ unet_key = "control_model."
342
+ else:
343
+ unet_key = "model.diffusion_model."
344
+
345
+ # at least 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
346
+ if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
347
+ print(f"Checkpoint {path} has both EMA and non-EMA weights.")
348
+ print(
349
+ "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
350
+ " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
351
+ )
352
+ for key in keys:
353
+ if key.startswith("model.diffusion_model"):
354
+ flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
355
+ unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
356
+ else:
357
+ if sum(k.startswith("model_ema") for k in keys) > 100:
358
+ print(
359
+ "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
360
+ " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
361
+ )
362
+
363
+ for key in keys:
364
+ if key.startswith(unet_key):
365
+ unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)
366
+
367
+ new_checkpoint = {}
368
+
369
+ new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
370
+ new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
371
+ new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
372
+ new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
373
+
374
+ if config["class_embed_type"] is None:
375
+ # No parameters to port
376
+ ...
377
+ elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection":
378
+ new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"]
379
+ new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"]
380
+ new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"]
381
+ new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"]
382
+ else:
383
+ raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}")
384
+
385
+ new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
386
+ new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
387
+
388
+ if not controlnet:
389
+ new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
390
+ new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
391
+ new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
392
+ new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
393
+
394
+ # Retrieves the keys for the input blocks only
395
+ num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
396
+ input_blocks = {
397
+ layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
398
+ for layer_id in range(num_input_blocks)
399
+ }
400
+
401
+ # Retrieves the keys for the middle blocks only
402
+ num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
403
+ middle_blocks = {
404
+ layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
405
+ for layer_id in range(num_middle_blocks)
406
+ }
407
+
408
+ # Retrieves the keys for the output blocks only
409
+ num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
410
+ output_blocks = {
411
+ layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
412
+ for layer_id in range(num_output_blocks)
413
+ }
414
+
415
+ for i in range(1, num_input_blocks):
416
+ block_id = (i - 1) // (config["layers_per_block"] + 1)
417
+ layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
418
+
419
+ resnets = [
420
+ key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
421
+ ]
422
+ attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
423
+
424
+ if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
425
+ new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
426
+ f"input_blocks.{i}.0.op.weight"
427
+ )
428
+ new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
429
+ f"input_blocks.{i}.0.op.bias"
430
+ )
431
+
432
+ paths = renew_resnet_paths(resnets)
433
+ meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
434
+ assign_to_checkpoint(
435
+ paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
436
+ )
437
+
438
+ if len(attentions):
439
+ paths = renew_attention_paths(attentions)
440
+ meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
441
+ assign_to_checkpoint(
442
+ paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
443
+ )
444
+
445
+ resnet_0 = middle_blocks[0]
446
+ attentions = middle_blocks[1]
447
+ resnet_1 = middle_blocks[2]
448
+
449
+ resnet_0_paths = renew_resnet_paths(resnet_0)
450
+ assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
451
+
452
+ resnet_1_paths = renew_resnet_paths(resnet_1)
453
+ assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
454
+
455
+ attentions_paths = renew_attention_paths(attentions)
456
+ meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
457
+ assign_to_checkpoint(
458
+ attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
459
+ )
460
+
461
+ for i in range(num_output_blocks):
462
+ block_id = i // (config["layers_per_block"] + 1)
463
+ layer_in_block_id = i % (config["layers_per_block"] + 1)
464
+ output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
465
+ output_block_list = {}
466
+
467
+ for layer in output_block_layers:
468
+ layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
469
+ if layer_id in output_block_list:
470
+ output_block_list[layer_id].append(layer_name)
471
+ else:
472
+ output_block_list[layer_id] = [layer_name]
473
+
474
+ if len(output_block_list) > 1:
475
+ resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
476
+ attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
477
+
478
+ resnet_0_paths = renew_resnet_paths(resnets)
479
+ paths = renew_resnet_paths(resnets)
480
+
481
+ meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
482
+ assign_to_checkpoint(
483
+ paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
484
+ )
485
+
486
+ output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
487
+ if ["conv.bias", "conv.weight"] in output_block_list.values():
488
+ index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
489
+ new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
490
+ f"output_blocks.{i}.{index}.conv.weight"
491
+ ]
492
+ new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
493
+ f"output_blocks.{i}.{index}.conv.bias"
494
+ ]
495
+
496
+ # Clear attentions as they have been attributed above.
497
+ if len(attentions) == 2:
498
+ attentions = []
499
+
500
+ if len(attentions):
501
+ paths = renew_attention_paths(attentions)
502
+ meta_path = {
503
+ "old": f"output_blocks.{i}.1",
504
+ "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
505
+ }
506
+ assign_to_checkpoint(
507
+ paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
508
+ )
509
+ else:
510
+ resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
511
+ for path in resnet_0_paths:
512
+ old_path = ".".join(["output_blocks", str(i), path["old"]])
513
+ new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
514
+
515
+ new_checkpoint[new_path] = unet_state_dict[old_path]
516
+
517
+ if controlnet:
518
+ # conditioning embedding
519
+
520
+ orig_index = 0
521
+
522
+ new_checkpoint["controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop(
523
+ f"input_hint_block.{orig_index}.weight"
524
+ )
525
+ new_checkpoint["controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop(
526
+ f"input_hint_block.{orig_index}.bias"
527
+ )
528
+
529
+ orig_index += 2
530
+
531
+ diffusers_index = 0
532
+
533
+ while diffusers_index < 6:
534
+ new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop(
535
+ f"input_hint_block.{orig_index}.weight"
536
+ )
537
+ new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop(
538
+ f"input_hint_block.{orig_index}.bias"
539
+ )
540
+ diffusers_index += 1
541
+ orig_index += 2
542
+
543
+ new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop(
544
+ f"input_hint_block.{orig_index}.weight"
545
+ )
546
+ new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop(
547
+ f"input_hint_block.{orig_index}.bias"
548
+ )
549
+
550
+ # down blocks
551
+ for i in range(num_input_blocks):
552
+ new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop(f"zero_convs.{i}.0.weight")
553
+ new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop(f"zero_convs.{i}.0.bias")
554
+
555
+ # mid block
556
+ new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop("middle_block_out.0.weight")
557
+ new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop("middle_block_out.0.bias")
558
+
559
+ return new_checkpoint
560
+
561
+
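For reference, a minimal usage sketch of this UNet converter (the paths and the "state_dict" key are assumptions; `create_unet_diffusers_config` is defined earlier in this file):

    import torch
    from omegaconf import OmegaConf
    from diffusers import UNet2DConditionModel

    # Assumed inputs: the original LDM YAML config and a .ckpt whose weights sit under "state_dict".
    original_config = OmegaConf.load("v1-inference.yaml")
    checkpoint = torch.load("model.ckpt", map_location="cpu")["state_dict"]

    unet_config = create_unet_diffusers_config(original_config, image_size=512)
    unet_state_dict = convert_ldm_unet_checkpoint(checkpoint, unet_config, path="model.ckpt")

    unet = UNet2DConditionModel(**unet_config)
    unet.load_state_dict(unet_state_dict)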
562
+ def convert_ldm_vae_checkpoint(checkpoint, config, only_decoder=False, only_encoder=False):
563
+ # extract state dict for VAE
564
+ vae_state_dict = {}
565
+ vae_key = "first_stage_model."
566
+ keys = list(checkpoint.keys())
567
+ for key in keys:
568
+ if key.startswith(vae_key):
569
+ vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key)
570
+
571
+ new_checkpoint = {}
572
+
573
+ new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
574
+ new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
575
+ new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
576
+ new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
577
+ new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
578
+ new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
579
+
580
+ new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
581
+ new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
582
+ new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
583
+ new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
584
+ new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
585
+ new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
586
+
587
+ new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
588
+ new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
589
+ new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
590
+ new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
591
+
592
+ # Retrieves the keys for the encoder down blocks only
593
+ num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
594
+ down_blocks = {
595
+ layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
596
+ }
597
+
598
+ # Retrieves the keys for the decoder up blocks only
599
+ num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
600
+ up_blocks = {
601
+ layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
602
+ }
603
+
604
+ for i in range(num_down_blocks):
605
+ resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
606
+
607
+ if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
608
+ new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
609
+ f"encoder.down.{i}.downsample.conv.weight"
610
+ )
611
+ new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
612
+ f"encoder.down.{i}.downsample.conv.bias"
613
+ )
614
+
615
+ paths = renew_vae_resnet_paths(resnets)
616
+ meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
617
+ assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
618
+
619
+ mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
620
+ num_mid_res_blocks = 2
621
+ for i in range(1, num_mid_res_blocks + 1):
622
+ resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
623
+
624
+ paths = renew_vae_resnet_paths(resnets)
625
+ meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
626
+ assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
627
+
628
+ mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
629
+ paths = renew_vae_attention_paths(mid_attentions)
630
+ meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
631
+ assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
632
+ conv_attn_to_linear(new_checkpoint)
633
+
634
+ for i in range(num_up_blocks):
635
+ block_id = num_up_blocks - 1 - i
636
+ resnets = [
637
+ key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
638
+ ]
639
+
640
+ if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
641
+ new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
642
+ f"decoder.up.{block_id}.upsample.conv.weight"
643
+ ]
644
+ new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
645
+ f"decoder.up.{block_id}.upsample.conv.bias"
646
+ ]
647
+
648
+ paths = renew_vae_resnet_paths(resnets)
649
+ meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
650
+ assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
651
+
652
+ mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
653
+ num_mid_res_blocks = 2
654
+ for i in range(1, num_mid_res_blocks + 1):
655
+ resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
656
+
657
+ paths = renew_vae_resnet_paths(resnets)
658
+ meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
659
+ assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
660
+
661
+ mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
662
+ paths = renew_vae_attention_paths(mid_attentions)
663
+ meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
664
+ assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
665
+ conv_attn_to_linear(new_checkpoint)
666
+
667
+ if only_decoder:
668
+ new_checkpoint = {k: v for k, v in new_checkpoint.items() if k.startswith('decoder') or k.startswith('post_quant')}
669
+ elif only_encoder:
670
+ new_checkpoint = {k: v for k, v in new_checkpoint.items() if k.startswith('encoder') or k.startswith('quant')}
671
+
672
+ return new_checkpoint
673
+
674
+
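Continuing the sketch above for the VAE converter, using `only_decoder` to port just a fine-tuned decoder (`create_vae_diffusers_config` is assumed to be defined earlier in this file, as in the upstream diffusers script):

    from diffusers import AutoencoderKL

    vae_config = create_vae_diffusers_config(original_config, image_size=512)
    decoder_state_dict = convert_ldm_vae_checkpoint(checkpoint, vae_config, only_decoder=True)

    vae = AutoencoderKL(**vae_config)
    # strict=False because the encoder / quant_conv keys were filtered out above.
    vae.load_state_dict(decoder_state_dict, strict=False)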
675
+ def convert_ldm_bert_checkpoint(checkpoint, config):
676
+ def _copy_attn_layer(hf_attn_layer, pt_attn_layer):
677
+ hf_attn_layer.q_proj.weight.data = pt_attn_layer.to_q.weight
678
+ hf_attn_layer.k_proj.weight.data = pt_attn_layer.to_k.weight
679
+ hf_attn_layer.v_proj.weight.data = pt_attn_layer.to_v.weight
680
+
681
+ hf_attn_layer.out_proj.weight = pt_attn_layer.to_out.weight
682
+ hf_attn_layer.out_proj.bias = pt_attn_layer.to_out.bias
683
+
684
+ def _copy_linear(hf_linear, pt_linear):
685
+ hf_linear.weight = pt_linear.weight
686
+ hf_linear.bias = pt_linear.bias
687
+
688
+ def _copy_layer(hf_layer, pt_layer):
689
+ # copy layer norms
690
+ _copy_linear(hf_layer.self_attn_layer_norm, pt_layer[0][0])
691
+ _copy_linear(hf_layer.final_layer_norm, pt_layer[1][0])
692
+
693
+ # copy attn
694
+ _copy_attn_layer(hf_layer.self_attn, pt_layer[0][1])
695
+
696
+ # copy MLP
697
+ pt_mlp = pt_layer[1][1]
698
+ _copy_linear(hf_layer.fc1, pt_mlp.net[0][0])
699
+ _copy_linear(hf_layer.fc2, pt_mlp.net[2])
700
+
701
+ def _copy_layers(hf_layers, pt_layers):
702
+ for i, hf_layer in enumerate(hf_layers):
703
+ if i != 0:
704
+ i += i
705
+ pt_layer = pt_layers[i : i + 2]
706
+ _copy_layer(hf_layer, pt_layer)
707
+
708
+ hf_model = LDMBertModel(config).eval()
709
+
710
+ # copy embeds
711
+ hf_model.model.embed_tokens.weight = checkpoint.transformer.token_emb.weight
712
+ hf_model.model.embed_positions.weight.data = checkpoint.transformer.pos_emb.emb.weight
713
+
714
+ # copy layer norm
715
+ _copy_linear(hf_model.model.layer_norm, checkpoint.transformer.norm)
716
+
717
+ # copy hidden layers
718
+ _copy_layers(hf_model.model.layers, checkpoint.transformer.attn_layers.layers)
719
+
720
+ _copy_linear(hf_model.to_logits, checkpoint.transformer.to_logits)
721
+
722
+ return hf_model
723
+
724
+
725
+ def convert_ldm_clip_checkpoint(checkpoint):
726
+ keys = list(checkpoint.keys())
727
+
728
+ text_model_dict = {}
729
+ for key in keys:
730
+ if key.startswith("cond_stage_model.transformer"):
731
+ text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key]
732
+
733
+ return text_model_dict
734
+
735
+
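The CLIP text-encoder converter simply strips the `cond_stage_model.transformer.` prefix; a sketch of loading its output (strictness depends on whether the installed transformers version still expects a `position_ids` buffer):

    from transformers import CLIPTextModel

    text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
    text_state_dict = convert_ldm_clip_checkpoint(checkpoint)
    text_encoder.load_state_dict(text_state_dict, strict=False)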
736
+ textenc_conversion_lst = [
737
+ ("cond_stage_model.model.positional_embedding", "text_model.embeddings.position_embedding.weight"),
738
+ ("cond_stage_model.model.token_embedding.weight", "text_model.embeddings.token_embedding.weight"),
739
+ ("cond_stage_model.model.ln_final.weight", "text_model.final_layer_norm.weight"),
740
+ ("cond_stage_model.model.ln_final.bias", "text_model.final_layer_norm.bias"),
741
+ ]
742
+ textenc_conversion_map = {x[0]: x[1] for x in textenc_conversion_lst}
743
+
744
+ textenc_transformer_conversion_lst = [
745
+ # (stable-diffusion, HF Diffusers)
746
+ ("resblocks.", "text_model.encoder.layers."),
747
+ ("ln_1", "layer_norm1"),
748
+ ("ln_2", "layer_norm2"),
749
+ (".c_fc.", ".fc1."),
750
+ (".c_proj.", ".fc2."),
751
+ (".attn", ".self_attn"),
752
+ ("ln_final.", "transformer.text_model.final_layer_norm."),
753
+ ("token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"),
754
+ ("positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"),
755
+ ]
756
+ protected = {re.escape(x[0]): x[1] for x in textenc_transformer_conversion_lst}
757
+ textenc_pattern = re.compile("|".join(protected.keys()))
758
+
759
+
760
+ def convert_paint_by_example_checkpoint(checkpoint):
761
+ config = CLIPVisionConfig.from_pretrained("openai/clip-vit-large-patch14")
762
+ model = PaintByExampleImageEncoder(config)
763
+
764
+ keys = list(checkpoint.keys())
765
+
766
+ text_model_dict = {}
767
+
768
+ for key in keys:
769
+ if key.startswith("cond_stage_model.transformer"):
770
+ text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key]
771
+
772
+ # load clip vision
773
+ model.model.load_state_dict(text_model_dict)
774
+
775
+ # load mapper
776
+ keys_mapper = {
777
+ k[len("cond_stage_model.mapper.res") :]: v
778
+ for k, v in checkpoint.items()
779
+ if k.startswith("cond_stage_model.mapper")
780
+ }
781
+
782
+ MAPPING = {
783
+ "attn.c_qkv": ["attn1.to_q", "attn1.to_k", "attn1.to_v"],
784
+ "attn.c_proj": ["attn1.to_out.0"],
785
+ "ln_1": ["norm1"],
786
+ "ln_2": ["norm3"],
787
+ "mlp.c_fc": ["ff.net.0.proj"],
788
+ "mlp.c_proj": ["ff.net.2"],
789
+ }
790
+
791
+ mapped_weights = {}
792
+ for key, value in keys_mapper.items():
793
+ prefix = key[: len("blocks.i")]
794
+ suffix = key.split(prefix)[-1].split(".")[-1]
795
+ name = key.split(prefix)[-1].split(suffix)[0][1:-1]
796
+ mapped_names = MAPPING[name]
797
+
798
+ num_splits = len(mapped_names)
799
+ for i, mapped_name in enumerate(mapped_names):
800
+ new_name = ".".join([prefix, mapped_name, suffix])
801
+ shape = value.shape[0] // num_splits
802
+ mapped_weights[new_name] = value[i * shape : (i + 1) * shape]
803
+
804
+ model.mapper.load_state_dict(mapped_weights)
805
+
806
+ # load final layer norm
807
+ model.final_layer_norm.load_state_dict(
808
+ {
809
+ "bias": checkpoint["cond_stage_model.final_ln.bias"],
810
+ "weight": checkpoint["cond_stage_model.final_ln.weight"],
811
+ }
812
+ )
813
+
814
+ # load final proj
815
+ model.proj_out.load_state_dict(
816
+ {
817
+ "bias": checkpoint["proj_out.bias"],
818
+ "weight": checkpoint["proj_out.weight"],
819
+ }
820
+ )
821
+
822
+ # load uncond vector
823
+ model.uncond_vector.data = torch.nn.Parameter(checkpoint["learnable_vector"])
824
+ return model
825
+
826
+
827
+ def convert_open_clip_checkpoint(checkpoint):
828
+ text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder")
829
+
830
+ keys = list(checkpoint.keys())
831
+
832
+ text_model_dict = {}
833
+
834
+ if "cond_stage_model.model.text_projection" in checkpoint:
835
+ d_model = int(checkpoint["cond_stage_model.model.text_projection"].shape[0])
836
+ else:
837
+ d_model = 1024
838
+
839
+ text_model_dict["text_model.embeddings.position_ids"] = text_model.text_model.embeddings.get_buffer("position_ids")
840
+
841
+ for key in keys:
842
+ if "resblocks.23" in key: # Diffusers drops the final layer and only uses the penultimate layer
843
+ continue
844
+ if key in textenc_conversion_map:
845
+ text_model_dict[textenc_conversion_map[key]] = checkpoint[key]
846
+ if key.startswith("cond_stage_model.model.transformer."):
847
+ new_key = key[len("cond_stage_model.model.transformer.") :]
848
+ if new_key.endswith(".in_proj_weight"):
849
+ new_key = new_key[: -len(".in_proj_weight")]
850
+ new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
851
+ text_model_dict[new_key + ".q_proj.weight"] = checkpoint[key][:d_model, :]
852
+ text_model_dict[new_key + ".k_proj.weight"] = checkpoint[key][d_model : d_model * 2, :]
853
+ text_model_dict[new_key + ".v_proj.weight"] = checkpoint[key][d_model * 2 :, :]
854
+ elif new_key.endswith(".in_proj_bias"):
855
+ new_key = new_key[: -len(".in_proj_bias")]
856
+ new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
857
+ text_model_dict[new_key + ".q_proj.bias"] = checkpoint[key][:d_model]
858
+ text_model_dict[new_key + ".k_proj.bias"] = checkpoint[key][d_model : d_model * 2]
859
+ text_model_dict[new_key + ".v_proj.bias"] = checkpoint[key][d_model * 2 :]
860
+ else:
861
+ new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
862
+
863
+ text_model_dict[new_key] = checkpoint[key]
864
+
865
+ text_model.load_state_dict(text_model_dict)
866
+
867
+ return text_model
868
+
869
+
870
+ def stable_unclip_image_encoder(original_config):
871
+ """
872
+ Returns the image processor and clip image encoder for the img2img unclip pipeline.
873
+
874
+ We currently know of two types of stable unclip models which separately use the clip and the openclip image
875
+ encoders.
876
+ """
877
+
878
+ image_embedder_config = original_config.model.params.embedder_config
879
+
880
+ sd_clip_image_embedder_class = image_embedder_config.target
881
+ sd_clip_image_embedder_class = sd_clip_image_embedder_class.split(".")[-1]
882
+
883
+ if sd_clip_image_embedder_class == "ClipImageEmbedder":
884
+ clip_model_name = image_embedder_config.params.model
885
+
886
+ if clip_model_name == "ViT-L/14":
887
+ feature_extractor = CLIPImageProcessor()
888
+ image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
889
+ else:
890
+ raise NotImplementedError(f"Unknown CLIP checkpoint name in stable diffusion checkpoint {clip_model_name}")
891
+
892
+ elif sd_clip_image_embedder_class == "FrozenOpenCLIPImageEmbedder":
893
+ feature_extractor = CLIPImageProcessor()
894
+ image_encoder = CLIPVisionModelWithProjection.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
895
+ else:
896
+ raise NotImplementedError(
897
+ f"Unknown CLIP image embedder class in stable diffusion checkpoint {sd_clip_image_embedder_class}"
898
+ )
899
+
900
+ return feature_extractor, image_encoder
901
+
902
+
903
+ def stable_unclip_image_noising_components(
904
+ original_config, clip_stats_path: Optional[str] = None, device: Optional[str] = None
905
+ ):
906
+ """
907
+ Returns the noising components for the img2img and txt2img unclip pipelines.
908
+
909
+ Converts the stability noise augmentor into
910
+ 1. a `StableUnCLIPImageNormalizer` for holding the CLIP stats
911
+ 2. a `DDPMScheduler` for holding the noise schedule
912
+
913
+ If the noise augmentor config specifies a clip stats path, the `clip_stats_path` must be provided.
914
+ """
915
+ noise_aug_config = original_config.model.params.noise_aug_config
916
+ noise_aug_class = noise_aug_config.target
917
+ noise_aug_class = noise_aug_class.split(".")[-1]
918
+
919
+ if noise_aug_class == "CLIPEmbeddingNoiseAugmentation":
920
+ noise_aug_config = noise_aug_config.params
921
+ embedding_dim = noise_aug_config.timestep_dim
922
+ max_noise_level = noise_aug_config.noise_schedule_config.timesteps
923
+ beta_schedule = noise_aug_config.noise_schedule_config.beta_schedule
924
+
925
+ image_normalizer = StableUnCLIPImageNormalizer(embedding_dim=embedding_dim)
926
+ image_noising_scheduler = DDPMScheduler(num_train_timesteps=max_noise_level, beta_schedule=beta_schedule)
927
+
928
+ if "clip_stats_path" in noise_aug_config:
929
+ if clip_stats_path is None:
930
+ raise ValueError("This stable unclip config requires a `clip_stats_path`")
931
+
932
+ clip_mean, clip_std = torch.load(clip_stats_path, map_location=device)
933
+ clip_mean = clip_mean[None, :]
934
+ clip_std = clip_std[None, :]
935
+
936
+ clip_stats_state_dict = {
937
+ "mean": clip_mean,
938
+ "std": clip_std,
939
+ }
940
+
941
+ image_normalizer.load_state_dict(clip_stats_state_dict)
942
+ else:
943
+ raise NotImplementedError(f"Unknown noise augmentor class: {noise_aug_class}")
944
+
945
+ return image_normalizer, image_noising_scheduler
946
+
947
+
948
+ def convert_controlnet_checkpoint(
949
+ checkpoint, original_config, checkpoint_path, image_size, upcast_attention, extract_ema
950
+ ):
951
+ ctrlnet_config = create_unet_diffusers_config(original_config, image_size=image_size, controlnet=True)
952
+ ctrlnet_config["upcast_attention"] = upcast_attention
953
+
954
+ ctrlnet_config.pop("sample_size")
955
+
956
+ controlnet_model = ControlNetModel(**ctrlnet_config)
957
+
958
+ converted_ctrl_checkpoint = convert_ldm_unet_checkpoint(
959
+ checkpoint, ctrlnet_config, path=checkpoint_path, extract_ema=extract_ema, controlnet=True
960
+ )
961
+
962
+ controlnet_model.load_state_dict(converted_ctrl_checkpoint)
963
+
964
+ return controlnet_model
animatediff/utils/convert_lora_safetensor_to_diffusers.py ADDED
@@ -0,0 +1,208 @@
1
+ # coding=utf-8
2
+ # Copyright 2023, Haofan Wang, Qixun Wang, All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """ Conversion script for the LoRA's safetensors checkpoints. """
17
+
18
+ import argparse
19
+
20
+ import torch
21
+ from safetensors.torch import load_file
22
+
23
+ from diffusers import StableDiffusionPipeline
24
+ import pdb
25
+
26
+ def convert_lora(pipeline, state_dict, LORA_PREFIX_UNET="lora_unet", LORA_PREFIX_TEXT_ENCODER="lora_te", alpha=0.6):
27
+ # load base model
28
+ # pipeline = StableDiffusionPipeline.from_pretrained(base_model_path, torch_dtype=torch.float32)
29
+
30
+ # load LoRA weight from .safetensors
31
+ # state_dict = load_file(checkpoint_path)
32
+
33
+ visited = []
34
+
35
+ # directly update weight in diffusers model
36
+ for key in state_dict:
37
+ # It is suggested to print the key; it usually looks like the example below:
38
+ # "lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight"
39
+
40
+ # alpha has already been set beforehand, so just skip these entries
41
+ if ".alpha" in key or key in visited:
42
+ continue
43
+
44
+ if "text" in key:
45
+ layer_infos = key.split(".")[0].split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_")
46
+ curr_layer = pipeline.text_encoder
47
+ else:
48
+ layer_infos = key.split(".")[0].split(LORA_PREFIX_UNET + "_")[-1].split("_")
49
+ curr_layer = pipeline.unet
50
+
51
+ # find the target layer
52
+ temp_name = layer_infos.pop(0)
53
+ while len(layer_infos) > -1:
54
+ try:
55
+ curr_layer = curr_layer.__getattr__(temp_name)
56
+ if len(layer_infos) > 0:
57
+ temp_name = layer_infos.pop(0)
58
+ elif len(layer_infos) == 0:
59
+ break
60
+ except Exception:
61
+ if len(temp_name) > 0:
62
+ temp_name += "_" + layer_infos.pop(0)
63
+ else:
64
+ temp_name = layer_infos.pop(0)
65
+
66
+ pair_keys = []
67
+ if "lora_down" in key:
68
+ pair_keys.append(key.replace("lora_down", "lora_up"))
69
+ pair_keys.append(key)
70
+ else:
71
+ pair_keys.append(key)
72
+ pair_keys.append(key.replace("lora_up", "lora_down"))
73
+
74
+ # update weight
75
+ if len(state_dict[pair_keys[0]].shape) == 4:
76
+ weight_up = state_dict[pair_keys[0]].squeeze(3).squeeze(2).to(torch.float32)
77
+ weight_down = state_dict[pair_keys[1]].squeeze(3).squeeze(2).to(torch.float32)
78
+ curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).unsqueeze(2).unsqueeze(3).to(curr_layer.weight.data.device)
79
+ else:
80
+ weight_up = state_dict[pair_keys[0]].to(torch.float32)
81
+ weight_down = state_dict[pair_keys[1]].to(torch.float32)
82
+ curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).to(curr_layer.weight.data.device)
83
+
84
+ # update visited list
85
+ for item in pair_keys:
86
+ visited.append(item)
87
+
88
+ return pipeline
89
+
90
+
91
+ def convert_lora_model_level(state_dict, unet, text_encoder=None, LORA_PREFIX_UNET="lora_unet", LORA_PREFIX_TEXT_ENCODER="lora_te", alpha=0.6):
92
+ """convert lora in model level instead of pipeline leval
93
+ """
94
+
95
+ visited = []
96
+
97
+ # directly update weight in diffusers model
98
+ for key in state_dict:
99
+ # It is suggested to print the key; it usually looks like the example below:
100
+ # "lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight"
101
+
102
+ # alpha has already been set beforehand, so just skip these entries
103
+ if ".alpha" in key or key in visited:
104
+ continue
105
+
106
+ if "text" in key:
107
+ layer_infos = key.split(".")[0].split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_")
108
+ assert text_encoder is not None, (
109
+ 'text_encoder must be passed since lora contains text encoder layers')
110
+ curr_layer = text_encoder
111
+ else:
112
+ layer_infos = key.split(".")[0].split(LORA_PREFIX_UNET + "_")[-1].split("_")
113
+ curr_layer = unet
114
+
115
+ # find the target layer
116
+ temp_name = layer_infos.pop(0)
117
+ while len(layer_infos) > -1:
118
+ try:
119
+ curr_layer = curr_layer.__getattr__(temp_name)
120
+ if len(layer_infos) > 0:
121
+ temp_name = layer_infos.pop(0)
122
+ elif len(layer_infos) == 0:
123
+ break
124
+ except Exception:
125
+ if len(temp_name) > 0:
126
+ temp_name += "_" + layer_infos.pop(0)
127
+ else:
128
+ temp_name = layer_infos.pop(0)
129
+
130
+ pair_keys = []
131
+ if "lora_down" in key:
132
+ pair_keys.append(key.replace("lora_down", "lora_up"))
133
+ pair_keys.append(key)
134
+ else:
135
+ pair_keys.append(key)
136
+ pair_keys.append(key.replace("lora_up", "lora_down"))
137
+
138
+ # update weight
139
+ # NOTE: handle LyCORIS/LoCon conv weights, may have bugs :(
140
+ if 'conv_in' in pair_keys[0]:
141
+ weight_up = state_dict[pair_keys[0]].to(torch.float32)
142
+ weight_down = state_dict[pair_keys[1]].to(torch.float32)
143
+ weight_up = weight_up.view(weight_up.size(0), -1)
144
+ weight_down = weight_down.view(weight_down.size(0), -1)
145
+ shape = [e for e in curr_layer.weight.data.shape]
146
+ shape[1] = 4
147
+ curr_layer.weight.data[:, :4, ...] += alpha * (weight_up @ weight_down).view(*shape)
148
+ elif 'conv' in pair_keys[0]:
149
+ weight_up = state_dict[pair_keys[0]].to(torch.float32)
150
+ weight_down = state_dict[pair_keys[1]].to(torch.float32)
151
+ weight_up = weight_up.view(weight_up.size(0), -1)
152
+ weight_down = weight_down.view(weight_down.size(0), -1)
153
+ shape = [e for e in curr_layer.weight.data.shape]
154
+ curr_layer.weight.data += alpha * (weight_up @ weight_down).view(*shape)
155
+ elif len(state_dict[pair_keys[0]].shape) == 4:
156
+ weight_up = state_dict[pair_keys[0]].squeeze(3).squeeze(2).to(torch.float32)
157
+ weight_down = state_dict[pair_keys[1]].squeeze(3).squeeze(2).to(torch.float32)
158
+ curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).unsqueeze(2).unsqueeze(3).to(curr_layer.weight.data.device)
159
+ else:
160
+ weight_up = state_dict[pair_keys[0]].to(torch.float32)
161
+ weight_down = state_dict[pair_keys[1]].to(torch.float32)
162
+ curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).to(curr_layer.weight.data.device)
163
+
164
+ # update visited list
165
+ for item in pair_keys:
166
+ visited.append(item)
167
+
168
+ return unet, text_encoder
169
+
170
+
171
+ if __name__ == "__main__":
172
+ parser = argparse.ArgumentParser()
173
+
174
+ parser.add_argument(
175
+ "--base_model_path", default=None, type=str, required=True, help="Path to the base model in diffusers format."
176
+ )
177
+ parser.add_argument(
178
+ "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
179
+ )
180
+ parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
181
+ parser.add_argument(
182
+ "--lora_prefix_unet", default="lora_unet", type=str, help="The prefix of UNet weight in safetensors"
183
+ )
184
+ parser.add_argument(
185
+ "--lora_prefix_text_encoder",
186
+ default="lora_te",
187
+ type=str,
188
+ help="The prefix of text encoder weight in safetensors",
189
+ )
190
+ parser.add_argument("--alpha", default=0.75, type=float, help="The merging ratio in W = W0 + alpha * deltaW")
191
+ parser.add_argument(
192
+ "--to_safetensors", action="store_true", help="Whether to store pipeline in safetensors format or not."
193
+ )
194
+ parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)")
195
+
196
+ args = parser.parse_args()
197
+
198
+ base_model_path = args.base_model_path
199
+ checkpoint_path = args.checkpoint_path
200
+ dump_path = args.dump_path
201
+ lora_prefix_unet = args.lora_prefix_unet
202
+ lora_prefix_text_encoder = args.lora_prefix_text_encoder
203
+ alpha = args.alpha
204
+
205
+ pipe = StableDiffusionPipeline.from_pretrained(base_model_path, torch_dtype=torch.float32)
+ state_dict = load_file(checkpoint_path)
+ pipe = convert_lora(pipe, state_dict, lora_prefix_unet, lora_prefix_text_encoder, alpha)
206
+
207
+ pipe = pipe.to(args.device)
208
+ pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors)
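A minimal sketch of calling the model-level variant directly (the .safetensors path is an assumption; `unet` / `text_encoder` are assumed to be already-built diffusers / transformers modules):

    from safetensors.torch import load_file

    lora_state_dict = load_file("some_lora.safetensors")
    unet, text_encoder = convert_lora_model_level(
        lora_state_dict, unet, text_encoder=text_encoder, alpha=0.8)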
animatediff/utils/util.py ADDED
@@ -0,0 +1,255 @@
1
+ import os
2
+ import imageio
3
+ import numpy as np
4
+ from typing import Union, Optional
5
+
6
+ import torch
7
+ import torchvision
8
+ import torch.distributed as dist
9
+
10
+ from tqdm import tqdm
11
+ from einops import rearrange
12
+ import cv2
13
+ import math
14
+ import moviepy.editor as mpy
15
+ from PIL import Image
16
+
17
+ # We recommend using the following affinity scores (motion magnitude).
18
+ # You are also encouraged to construct your own score list.
19
+ # RANGE_LIST = [
20
+ # [1.0, 0.9, 0.85, 0.85, 0.85, 0.8], # 0 Small Motion
21
+ # [1.0, 0.8, 0.8, 0.8, 0.79, 0.78, 0.75], # Moderate Motion
22
+ # [1.0, 0.8, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.6, 0.5, 0.5], # Large Motion
23
+ # # [1.0, 0.8, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.6], # Large Motion
24
+ # # [1.0, 0.65, 0.6], # candidate moderate
25
+ # # [1.0, 0.65, 0.6, 0.6, 0.6, 0.5, 0.5, 0.5, 0.5, 0.4], # candidate large
26
+ # [1.0 , 0.9 , 0.85, 0.85, 0.85, 0.8 , 0.8 , 0.8 , 0.8 , 0.8 , 0.8 , 0.8 , 0.85, 0.85, 0.9 , 1.0 ], # Loop
27
+ # [1.0 , 0.8 , 0.8 , 0.8 , 0.79, 0.78, 0.75, 0.75, 0.75, 0.75, 0.75, 0.78, 0.79, 0.8 , 0.8 , 1.0 ], # Loop
28
+ # [1.0 , 0.8 , 0.7 , 0.7 , 0.7 , 0.7 , 0.6 , 0.5 , 0.5 , 0.6 , 0.7 , 0.7 , 0.7 , 0.7 , 0.8 , 1.0 ], # Loop
29
+ # # [1.0], # Static
30
+ # # [0],
31
+ # # [0.6, 0.5, 0.5, 0.45, 0.45, 0.4], # Style Transfer Test
32
+ # # [0.4, 0.3, 0.3, 0.25, 0.25, 0.2], # Style Transfer
33
+ # [0.5, 0.2], # Style Transfer Large Motion
34
+ # [0.5, 0.4, 0.4, 0.4, 0.35, 0.35, 0.3, 0.25, 0.2], # Style Transfer Moderate Motion
35
+ # [0.5, 0.4, 0.4, 0.4, 0.35, 0.3], # Style Transfer Candidate Small Motion
36
+ # ]
37
+ RANGE_LIST = [
38
+ [0.5, 0.4, 0.4, 0.4, 0.35, 0.3], # Style Transfer Candidate Small Motion
39
+ [0.5, 0.4, 0.4, 0.4, 0.35, 0.35, 0.3, 0.25, 0.2], # Style Transfer Moderate Motion
40
+ [0.5, 0.2], # Style Transfer Large Motion
41
+ ]
42
+
43
+
44
+ def zero_rank_print(s):
45
+ if (not dist.is_initialized()) or (dist.is_initialized() and dist.get_rank() == 0): print("### " + s)
46
+
47
+ def save_videos_mp4(video: torch.Tensor, path: str, fps: int=8):
48
+ video = rearrange(video, "b c t h w -> t b c h w")
49
+ num_frames, batch_size, channels, height, width = video.shape
50
+ assert batch_size == 1,\
51
+ 'Only support batch size == 1'
52
+ video = video.squeeze(1)
53
+ video = rearrange(video, "t c h w -> t h w c")
54
+ def make_frame(t):
55
+ frame_tensor = video[int(t * fps)]
56
+ frame_np = (frame_tensor * 255).numpy().astype('uint8')
57
+ return frame_np
58
+ clip = mpy.VideoClip(make_frame, duration=num_frames / fps)
59
+ clip.write_videofile(path, fps=fps, codec='libx264')
60
+
61
+ def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=6, fps=8):
62
+ videos = rearrange(videos, "b c t h w -> t b c h w")
63
+ outputs = []
64
+ for x in videos:
65
+ x = torchvision.utils.make_grid(x, nrow=n_rows)
66
+ x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)
67
+ if rescale:
68
+ x = (x + 1.0) / 2.0 # -1,1 -> 0,1
69
+ x = torch.clamp((x * 255), 0, 255).numpy().astype(np.uint8)
70
+ outputs.append(x)
71
+
72
+ os.makedirs(os.path.dirname(path), exist_ok=True)
73
+ imageio.mimsave(path, outputs, fps=fps)
74
+
75
+
76
+ # DDIM Inversion
77
+ @torch.no_grad()
78
+ def init_prompt(prompt, pipeline):
79
+ uncond_input = pipeline.tokenizer(
80
+ [""], padding="max_length", max_length=pipeline.tokenizer.model_max_length,
81
+ return_tensors="pt"
82
+ )
83
+ uncond_embeddings = pipeline.text_encoder(uncond_input.input_ids.to(pipeline.device))[0]
84
+ text_input = pipeline.tokenizer(
85
+ [prompt],
86
+ padding="max_length",
87
+ max_length=pipeline.tokenizer.model_max_length,
88
+ truncation=True,
89
+ return_tensors="pt",
90
+ )
91
+ text_embeddings = pipeline.text_encoder(text_input.input_ids.to(pipeline.device))[0]
92
+ context = torch.cat([uncond_embeddings, text_embeddings])
93
+
94
+ return context
95
+
96
+
97
+ def next_step(model_output: Union[torch.FloatTensor, np.ndarray], timestep: int,
98
+ sample: Union[torch.FloatTensor, np.ndarray], ddim_scheduler):
99
+ timestep, next_timestep = min(
100
+ timestep - ddim_scheduler.config.num_train_timesteps // ddim_scheduler.num_inference_steps, 999), timestep
101
+ alpha_prod_t = ddim_scheduler.alphas_cumprod[timestep] if timestep >= 0 else ddim_scheduler.final_alpha_cumprod
102
+ alpha_prod_t_next = ddim_scheduler.alphas_cumprod[next_timestep]
103
+ beta_prod_t = 1 - alpha_prod_t
104
+ next_original_sample = (sample - beta_prod_t ** 0.5 * model_output) / alpha_prod_t ** 0.5
105
+ next_sample_direction = (1 - alpha_prod_t_next) ** 0.5 * model_output
106
+ next_sample = alpha_prod_t_next ** 0.5 * next_original_sample + next_sample_direction
107
+ return next_sample
108
+
109
+
110
+ def get_noise_pred_single(latents, t, context, unet):
111
+ noise_pred = unet(latents, t, encoder_hidden_states=context)["sample"]
112
+ return noise_pred
113
+
114
+
115
+ @torch.no_grad()
116
+ def ddim_loop(pipeline, ddim_scheduler, latent, num_inv_steps, prompt):
117
+ context = init_prompt(prompt, pipeline)
118
+ uncond_embeddings, cond_embeddings = context.chunk(2)
119
+ all_latent = [latent]
120
+ latent = latent.clone().detach()
121
+ for i in tqdm(range(num_inv_steps)):
122
+ t = ddim_scheduler.timesteps[len(ddim_scheduler.timesteps) - i - 1]
123
+ noise_pred = get_noise_pred_single(latent, t, cond_embeddings, pipeline.unet)
124
+ latent = next_step(noise_pred, t, latent, ddim_scheduler)
125
+ all_latent.append(latent)
126
+ return all_latent
127
+
128
+
129
+ @torch.no_grad()
130
+ def ddim_inversion(pipeline, ddim_scheduler, video_latent, num_inv_steps, prompt=""):
131
+ ddim_latents = ddim_loop(pipeline, ddim_scheduler, video_latent, num_inv_steps, prompt)
132
+ return ddim_latents
133
+
134
+ def prepare_mask_coef(video_length:int, cond_frame:int, sim_range:list=[0.2, 1.0]):
135
+
136
+ assert len(sim_range) == 2, \
137
+ 'sim_range should has the length of 2, including the min and max similarity'
138
+
139
+ assert video_length > 1, \
140
+ 'video_length should be greater than 1'
141
+
142
+ assert video_length > cond_frame,\
143
+ 'video_length should be greater than cond_frame'
144
+
145
+ diff = abs(sim_range[0] - sim_range[1]) / (video_length - 1)
146
+ coef = [1.0] * video_length
147
+ for f in range(video_length):
148
+ f_diff = diff * abs(cond_frame - f)
149
+ f_diff = 1 - f_diff
150
+ coef[f] *= f_diff
151
+
152
+ return coef
153
+
154
+ def prepare_mask_coef_by_score(video_shape: list, cond_frame_idx: list, sim_range: list = [0.2, 1.0],
155
+ statistic: list = [1, 100], coef_max: int = 0.98, score: Optional[torch.Tensor] = None):
156
+ '''
157
+ the shape of video_data is (b f c h w)
158
+ cond_frame_idx is a list, with length of batch_size
159
+ the shape of statistic is (f 2)
160
+ the shape of score is (b f)
161
+ the shape of coef is (b f)
162
+ '''
163
+ assert len(video_shape) == 2, \
164
+ f'the shape of video_shape should be (b f c h w), but now get {len(video_shape.shape)} channels'
165
+
166
+ batch_size, frame_num = video_shape[0], video_shape[1]
167
+
168
+ score = score.permute(0, 2, 1).squeeze(0)
169
+
170
+ # list -> b 1
171
+ cond_fram_mat = torch.tensor(cond_frame_idx).unsqueeze(-1)
172
+
173
+ statistic = torch.tensor(statistic)
174
+ # (f 2) -> (b f 2)
175
+ statistic = statistic.repeat(batch_size, 1, 1)
176
+
177
+ # shape of order (b f), shape of cond_mat (b f)
178
+ order = torch.arange(0, frame_num, 1)
179
+ order = order.repeat(batch_size, 1)
180
+ cond_mat = torch.ones((batch_size, frame_num)) * cond_fram_mat
181
+ order = abs(order - cond_mat)
182
+
183
+ statistic = statistic[:,order.to(torch.long)][0,:,:,:]
184
+
185
+ # score (b f) max_s (b f 1)
186
+ max_stats = torch.max(statistic, dim=2).values.to(dtype=score.dtype)
187
+ min_stats = torch.min(statistic, dim=2).values.to(dtype=score.dtype)
188
+
189
+ score[score > max_stats] = max_stats[score > max_stats] * 0.95
190
+ score[score < min_stats] = min_stats[score < min_stats]
191
+
192
+ eps = 1e-10
193
+ coef = 1 - abs((score / (max_stats + eps)) * (max(sim_range) - min(sim_range)))
194
+
195
+ indices = torch.arange(coef.shape[0]).unsqueeze(1)
196
+ coef[indices, cond_fram_mat] = 1.0
197
+
198
+ return coef
199
+
200
+
201
+ def prepare_mask_coef_by_statistics(video_length: int, cond_frame: int, sim_range: int,
202
+ coef: Optional[list] = None):
203
+ """
204
+ coef: User-defined coef. If passed, the `sim_range` index will be ignored. This is useful
205
+ for defining a custom style-transfer coef for different models.
206
+ """
207
+ assert video_length > 1, \
208
+ 'video_length should be greater than 1'
209
+
210
+ assert video_length > cond_frame,\
211
+ 'video_length should be greater than cond_frame'
212
+
213
+ # Recommend index: 13
214
+
215
+ # range_list = [
216
+ # # [0.8, 0.8, 0.7, 0.6],
217
+ # [1.0, 0.8, 0.7, 0.6],
218
+ # [1.0, 0.8, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.6, 0.5, 0.5],
219
+ # [1.0, 0.9, 0.85, 0.85, 0.85, 0.8], # 0
220
+ # [1.0, 0.9, 0.8, 0.7],
221
+ # [1.0, 0.8, 0.7, 0.6, 0.7, 0.6],
222
+ # [1.0, 0.9, 0.85],
223
+ # # [1.0, 0.9, 0.7, 0.5, 0.3, 0.2],
224
+ # # [1.0, 0.8, 0.6, 0.4],
225
+ # # [1.0, 0.65, 0.6], # 1
226
+ # [1.0, 0.6, 0.4], # 2
227
+ # [1.0, 0.2, 0.2],
228
+ # # [1.0, 0.8, 0.6, 0.6, 0.5, 0.5, 0.4],
229
+ # # [1.0, 0.9, 0.9, 0.9, 0.9, 0.8],
230
+ # # [1.0, 0.65, 0.6, 0.6, 0.5, 0.5, 0.4],
231
+ # # [1.0, 0.9, 0.9, 0.9, 0.7, 0.7, 0.6, 0.5, 0.4],
232
+ # [1.0, 0.8, 0.8, 0.8, 0.79, 0.78, 0.75], # 4 style_transfer
233
+ # [1.0, 0.9, 0.9],
234
+ # [0.8, 0.7, 0.6],
235
+ # [0.8, 0.8, 0.8, 0.8, 0.7],
236
+ # [0.9, 0.6, 0.6, 0.6, 0.5, 0.4, 0.2],
237
+ # # [1.0, 0.91, 0.9, 0.89, 0.88, 0.87],
238
+ # # [1.0, 0.7, 0.65, 0.65, 0.65, 0.65, 0.6],
239
+ # # [1.0, 0.85, 0.9, 0.85, 0.9, 0.85],
240
+ # # [1.0, 0.8, 0.82, 0.84, 0.86, 0.88, 0.78, 0.82, 0.84],
241
+ # # [1.0],
242
+ # ]
243
+ range_list = RANGE_LIST
244
+
245
+ assert sim_range < len(range_list),\
246
+ f'sim_range type{sim_range} not implemented'
247
+
248
+ if coef is None:
249
+ coef = range_list[sim_range]
250
+ coef = coef + ([coef[-1]] * (video_length - len(coef)))
251
+
252
+ order = [abs(i - cond_frame) for i in range(video_length)]
253
+ coef = [coef[order[i]] for i in range(video_length)]
254
+
255
+ return coef
app-counterfeit-only.py ADDED
@@ -0,0 +1,441 @@
1
+ import json
2
+ import os
3
+ import os.path as osp
4
+ import random
5
+ from argparse import ArgumentParser
6
+ from datetime import datetime
7
+
8
+ import gradio as gr
9
+ import numpy as np
10
+ import openxlab
11
+ import torch
12
+ from diffusers import DDIMScheduler, EulerDiscreteScheduler, PNDMScheduler
13
+ from omegaconf import OmegaConf
14
+ from openxlab.model import download
15
+ from PIL import Image
16
+
17
+ from animatediff.pipelines import I2VPipeline
18
+ from animatediff.utils.util import RANGE_LIST, save_videos_grid
19
+
20
+ sample_idx = 0
21
+ scheduler_dict = {
22
+ "DDIM": DDIMScheduler,
23
+ "Euler": EulerDiscreteScheduler,
24
+ "PNDM": PNDMScheduler,
25
+ }
26
+
27
+ css = """
28
+ .toolbutton {
29
+ margin-bottom: 0em;
30
+ max-width: 2.5em;
31
+ min-width: 2.5em !important;
32
+ height: 2.5em;
33
+ }
34
+ """
35
+
36
+ parser = ArgumentParser()
37
+ parser.add_argument('--config', type=str, default='example/config/base.yaml')
38
+ parser.add_argument('--server-name', type=str, default='0.0.0.0')
39
+ parser.add_argument('--port', type=int, default=7860)
40
+ parser.add_argument('--share', action='store_true')
41
+ parser.add_argument('--local-debug', action='store_true')
42
+
43
+ parser.add_argument('--save-path', default='samples')
44
+
45
+ args = parser.parse_args()
46
+ LOCAL_DEBUG = args.local_debug
47
+
48
+
49
+ BASE_CONFIG = 'example/config/base.yaml'
50
+ STYLE_CONFIG_LIST = {
51
+ 'anime': './example/openxlab/2-animation.yaml',
52
+ }
53
+
54
+
55
+ # download models
56
+ PIA_PATH = './models/PIA'
57
+ VAE_PATH = './models/VAE'
58
+ DreamBooth_LoRA_PATH = './models/DreamBooth_LoRA'
59
+
60
+
61
+ if not LOCAL_DEBUG:
62
+ CACHE_PATH = '/home/xlab-app-center/.cache/model'
63
+
64
+ PIA_PATH = osp.join(CACHE_PATH, 'PIA')
65
+ VAE_PATH = osp.join(CACHE_PATH, 'VAE')
66
+ DreamBooth_LoRA_PATH = osp.join(CACHE_PATH, 'DreamBooth_LoRA')
67
+ STABLE_DIFFUSION_PATH = osp.join(CACHE_PATH, 'StableDiffusion')
68
+
69
+ IP_ADAPTER_PATH = osp.join(CACHE_PATH, 'IP_Adapter')
70
+
71
+ os.makedirs(PIA_PATH, exist_ok=True)
72
+ os.makedirs(VAE_PATH, exist_ok=True)
73
+ os.makedirs(DreamBooth_LoRA_PATH, exist_ok=True)
74
+ os.makedirs(STABLE_DIFFUSION_PATH, exist_ok=True)
75
+
76
+ openxlab.login(os.environ['OPENXLAB_AK'], os.environ['OPENXLAB_SK'])
77
+ download(model_repo='zhangyiming/PIA-pruned', model_name='PIA', output=PIA_PATH)
78
+ download(model_repo='zhangyiming/Counterfeit-V3.0',
79
+ model_name='Counterfeit-V3.0_fp32_pruned', output=DreamBooth_LoRA_PATH)
80
+ download(model_repo='zhangyiming/kl-f8-anime2_VAE',
81
+ model_name='kl-f8-anime2', output=VAE_PATH)
82
+
83
+ # ip_adapter
84
+ download(model_repo='zhangyiming/IP-Adapter',
85
+ model_name='clip_encoder', output=osp.join(IP_ADAPTER_PATH, 'image_encoder'))
86
+ download(model_repo='zhangyiming/IP-Adapter',
87
+ model_name='config', output=osp.join(IP_ADAPTER_PATH, 'image_encoder'))
88
+ download(model_repo='zhangyiming/IP-Adapter',
89
+ model_name='ip_adapter_sd15', output=IP_ADAPTER_PATH)
90
+
91
+ # unet
92
+ download(model_repo='zhangyiming/runwayml_stable-diffusion-v1-5_Unet',
93
+ model_name='unet', output=osp.join(STABLE_DIFFUSION_PATH, 'unet'))
94
+ download(model_repo='zhangyiming/runwayml_stable-diffusion-v1-5_Unet',
95
+ model_name='config', output=osp.join(STABLE_DIFFUSION_PATH, 'unet'))
96
+
97
+ # vae
98
+ download(model_repo='zhangyiming/runwayml_stable-diffusion-v1-5_VAE',
99
+ model_name='vae', output=osp.join(STABLE_DIFFUSION_PATH, 'vae'))
100
+ download(model_repo='zhangyiming/runwayml_stable-diffusion-v1-5_VAE',
101
+ model_name='config', output=osp.join(STABLE_DIFFUSION_PATH, 'vae'))
102
+
103
+ # text encoder
104
+ download(model_repo='zhangyiming/runwayml_stable-diffusion-v1-5_TextEncod',
105
+ model_name='text_encoder', output=osp.join(STABLE_DIFFUSION_PATH, 'text_encoder'))
106
+ download(model_repo='zhangyiming/runwayml_stable-diffusion-v1-5_TextEncod',
107
+ model_name='config', output=osp.join(STABLE_DIFFUSION_PATH, 'text_encoder'))
108
+
109
+ # tokenizer
110
+ download(model_repo='zhangyiming/runwayml_stable-diffusion-v1-5_Tokenizer',
111
+ model_name='merge', output=osp.join(STABLE_DIFFUSION_PATH, 'tokenizer'))
112
+ download(model_repo='zhangyiming/runwayml_stable-diffusion-v1-5_Tokenizer',
113
+ model_name='special_tokens_map', output=osp.join(STABLE_DIFFUSION_PATH, 'tokenizer'))
114
+ download(model_repo='zhangyiming/runwayml_stable-diffusion-v1-5_Tokenizer',
115
+ model_name='tokenizer_config', output=osp.join(STABLE_DIFFUSION_PATH, 'tokenizer'))
116
+ download(model_repo='zhangyiming/runwayml_stable-diffusion-v1-5_Tokenizer',
117
+ model_name='vocab', output=osp.join(STABLE_DIFFUSION_PATH, 'tokenizer'))
118
+
119
+ # scheduler
120
+ scheduler_dict = {
121
+ "_class_name": "PNDMScheduler",
122
+ "_diffusers_version": "0.6.0",
123
+ "beta_end": 0.012,
124
+ "beta_schedule": "scaled_linear",
125
+ "beta_start": 0.00085,
126
+ "num_train_timesteps": 1000,
127
+ "set_alpha_to_one": False,
128
+ "skip_prk_steps": True,
129
+ "steps_offset": 1,
130
+ "trained_betas": None,
131
+ "clip_sample": False
132
+ }
133
+ os.makedirs(osp.join(STABLE_DIFFUSION_PATH, 'scheduler'), exist_ok=True)
134
+ with open(osp.join(STABLE_DIFFUSION_PATH, 'scheduler', 'scheduler_config.json'), 'w') as file:
135
+ json.dump(scheduler_dict, file)
136
+
137
+ # model index
138
+ model_index_dict = {
139
+ "_class_name": "StableDiffusionPipeline",
140
+ "_diffusers_version": "0.6.0",
141
+ "feature_extractor": [
142
+ "transformers",
143
+ "CLIPImageProcessor"
144
+ ],
145
+ "safety_checker": [
146
+ "stable_diffusion",
147
+ "StableDiffusionSafetyChecker"
148
+ ],
149
+ "scheduler": [
150
+ "diffusers",
151
+ "PNDMScheduler"
152
+ ],
153
+ "text_encoder": [
154
+ "transformers",
155
+ "CLIPTextModel"
156
+ ],
157
+ "tokenizer": [
158
+ "transformers",
159
+ "CLIPTokenizer"
160
+ ],
161
+ "unet": [
162
+ "diffusers",
163
+ "UNet2DConditionModel"
164
+ ],
165
+ "vae": [
166
+ "diffusers",
167
+ "AutoencoderKL"
168
+ ]
169
+ }
170
+ with open(osp.join(STABLE_DIFFUSION_PATH, 'model_index.json'), 'w') as file:
171
+ json.dump(model_index_dict, file)
172
+
173
+ else:
174
+ PIA_PATH = './models/PIA'
175
+ VAE_PATH = './models/VAE'
176
+ DreamBooth_LoRA_PATH = './models/DreamBooth_LoRA'
177
+ STABLE_DIFFUSION_PATH = './models/StableDiffusion/sd15'
178
+
179
+
180
+ def preprocess_img(img_np, max_size: int = 512):
181
+
182
+ ori_image = Image.fromarray(img_np).convert('RGB')
183
+
184
+ width, height = ori_image.size
185
+
186
+ long_edge = max(width, height)
187
+ if long_edge > max_size:
188
+ scale_factor = max_size / long_edge
189
+ else:
190
+ scale_factor = 1
191
+ width = int(width * scale_factor)
192
+ height = int(height * scale_factor)
193
+ ori_image = ori_image.resize((width, height))
194
+
195
+ if (width % 8 != 0) or (height % 8 != 0):
196
+ in_width = (width // 8) * 8
197
+ in_height = (height // 8) * 8
198
+ else:
199
+ in_width = width
200
+ in_height = height
201
+ in_image = ori_image
202
+
203
+ in_image = ori_image.resize((in_width, in_height))
204
+ in_image_np = np.array(in_image)
205
+ return in_image_np, in_height, in_width
206
+
207
+
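`preprocess_img` scales the longer side down to at most 512 pixels and snaps both sides to multiples of 8, as required by the SD VAE/UNet. For example, with a dummy 1280x720 input:

    import numpy as np

    img = np.zeros((720, 1280, 3), dtype=np.uint8)   # H x W x C
    img_np, h, w = preprocess_img(img)
    # h == 288, w == 512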
208
+ class AnimateController:
209
+ def __init__(self):
210
+
211
+ # config dirs
212
+ self.basedir = os.getcwd()
213
+ self.savedir = os.path.join(
214
+ self.basedir, args.save_path, datetime.now().strftime("Gradio-%Y-%m-%dT%H-%M-%S"))
215
+ self.savedir_sample = os.path.join(self.savedir, "sample")
216
+ os.makedirs(self.savedir, exist_ok=True)
217
+
218
+ self.inference_config = OmegaConf.load(args.config)
219
+ self.style_configs = {k: OmegaConf.load(
220
+ v) for k, v in STYLE_CONFIG_LIST.items()}
221
+
222
+ self.pipeline_dict = self.load_model_list()
223
+
224
+ def load_model_list(self):
225
+ pipeline_dict = dict()
226
+ for style, cfg in self.style_configs.items():
227
+ dreambooth_path = cfg.get('dreambooth', 'none')
228
+ if dreambooth_path and dreambooth_path.upper() != 'NONE':
229
+ dreambooth_path = osp.join(
230
+ DreamBooth_LoRA_PATH, dreambooth_path)
231
+ lora_path = cfg.get('lora', None)
232
+ if lora_path is not None:
233
+ lora_path = osp.join(DreamBooth_LoRA_PATH, lora_path)
234
+ lora_alpha = cfg.get('lora_alpha', 0.0)
235
+ vae_path = cfg.get('vae', None)
236
+ if vae_path is not None:
237
+ vae_path = osp.join(VAE_PATH, vae_path)
238
+
239
+ pipeline_dict[style] = I2VPipeline.build_pipeline(
240
+ self.inference_config,
241
+ STABLE_DIFFUSION_PATH,
242
+ unet_path=osp.join(PIA_PATH, 'pia.ckpt'),
243
+ dreambooth_path=dreambooth_path,
244
+ lora_path=lora_path,
245
+ lora_alpha=lora_alpha,
246
+ vae_path=vae_path,
247
+ ip_adapter_path='h94/IP-Adapter',
248
+ ip_adapter_scale=0.1)
249
+ return pipeline_dict
250
+
251
+ def fetch_default_n_prompt(self, style: str):
252
+ cfg = self.style_configs[style]
253
+ n_prompt = cfg.get('n_prompt', '')
254
+ ip_adapter_scale = cfg.get('real_ip_adapter_scale', 0)
255
+
256
+ gr.Info('Set default negative prompt and ip_adapter_scale.')
257
+ print('Set default negative prompt and ip_adapter_scale.')
258
+
259
+ return n_prompt, ip_adapter_scale
260
+
261
+ def animate(
262
+ self,
263
+ init_img,
264
+ motion_scale,
265
+ prompt_textbox,
266
+ negative_prompt_textbox,
267
+ sampler_dropdown,
268
+ sample_step_slider,
269
+ cfg_scale_slider,
270
+ seed_textbox,
271
+ ip_adapter_scale,
272
+ style,
273
+ progress=gr.Progress(),
274
+ ):
275
+
276
+ if seed_textbox != -1 and seed_textbox != "":
277
+ torch.manual_seed(int(seed_textbox))
278
+ else:
279
+ torch.seed()
280
+ seed = torch.initial_seed()
281
+
282
+ pipeline = self.pipeline_dict[style]
283
+ init_img, h, w = preprocess_img(init_img)
284
+
285
+ sample = pipeline(
286
+ image=init_img,
287
+ prompt=prompt_textbox,
288
+ negative_prompt=negative_prompt_textbox,
289
+ num_inference_steps=sample_step_slider,
290
+ guidance_scale=cfg_scale_slider,
291
+ width=w,
292
+ height=h,
293
+ video_length=16,
294
+ mask_sim_template_idx=motion_scale - 1,
295
+ ip_adapter_scale=ip_adapter_scale,
296
+ progress_fn=progress,
297
+ ).videos
298
+
299
+ save_sample_path = os.path.join(
300
+ self.savedir_sample, f"{sample_idx}.mp4")
301
+ save_videos_grid(sample, save_sample_path)
302
+
303
+ sample_config = {
304
+ "prompt": prompt_textbox,
305
+ "n_prompt": negative_prompt_textbox,
306
+ "sampler": sampler_dropdown,
307
+ "num_inference_steps": sample_step_slider,
308
+ "guidance_scale": cfg_scale_slider,
309
+ "width": w,
310
+ "height": h,
311
+ "seed": seed,
312
+ "motion": motion_scale,
313
+ }
314
+ json_str = json.dumps(sample_config, indent=4)
315
+ with open(os.path.join(self.savedir, "logs.json"), "a") as f:
316
+ f.write(json_str)
317
+ f.write("\n\n")
318
+
319
+ return save_sample_path
320
+
321
+
322
+ controller = AnimateController()
323
+
324
+
325
+ def ui():
326
+ with gr.Blocks(css=css) as demo:
327
+
328
+ gr.HTML(
329
+ "<div align='center'><font size='7'> <img src=\"file/pia.png\" style=\"height: 72px;\"/ > Your Personalized Image Animator</font></div>"
330
+ "<div align='center'><font size='7'>via Plug-and-Play Modules in Text-to-Image Models </font></div>"
331
+ )
332
+ with gr.Row():
333
+ gr.Markdown(
334
+ "<div align='center'><font size='5'><a href='https://pi-animator.github.io/'>Project Page</a> &ensp;" # noqa
335
+ "<a href='https://arxiv.org/abs/2312.13964/'>Paper</a> &ensp;"
336
+ "<a href='https://github.com/open-mmlab/PIA'>Code</a> &ensp;" # noqa
337
+ # "Try More Style: <a href='https://openxlab.org.cn/apps/detail/zhangyiming/PiaPia'>Click Here!</a> </font></div>" # noqa
338
+ "Try More Style: <a href='https://openxlab.org.cn/apps/detail/zhangyiming/PiaPia'>Click here! </a></font></div>" # noqa
339
+ )
340
+
341
+ with gr.Row(equal_height=False):
342
+ with gr.Column():
343
+ with gr.Row():
344
+ init_img = gr.Image(label='Input Image')
345
+
346
+ style_dropdown = gr.Dropdown(label='Style', choices=list(
347
+ STYLE_CONFIG_LIST.keys()), value=list(STYLE_CONFIG_LIST.keys())[0])
348
+
349
+ with gr.Row():
350
+ prompt_textbox = gr.Textbox(label="Prompt", lines=1)
351
+ gift_button = gr.Button(
352
+ value='🎁', elem_classes='toolbutton'
353
+ )
354
+
355
+ def append_gift(prompt):
356
+ rand = random.randint(0, 2)
357
+ if rand == 1:
358
+ prompt = prompt + 'wearing santa hats'
359
+ elif rand == 2:
360
+ prompt = prompt + 'lift a Christmas gift'
361
+ else:
362
+ prompt = prompt + 'in Christmas suit, lift a Christmas gift'
363
+ gr.Info('Merry Christmas! Add magic to your prompt!')
364
+ return prompt
365
+
366
+ gift_button.click(
367
+ fn=append_gift,
368
+ inputs=[prompt_textbox],
369
+ outputs=[prompt_textbox],
370
+ )
371
+
372
373
+
374
+ motion_scale_silder = gr.Slider(
375
+ label='Motion Scale (Larger value means larger motion but less identity consistency)', value=2, step=1, minimum=1, maximum=len(RANGE_LIST))
376
+ ip_adapter_scale = gr.Slider(
377
+ label='IP-Adapter Scale', value=controller.fetch_default_n_prompt(
378
+ list(STYLE_CONFIG_LIST.keys())[0])[1], minimum=0, maximum=1)
379
+
380
+ with gr.Accordion('Advanced Options', open=False):
381
+ negative_prompt_textbox = gr.Textbox(
382
+ value=controller.fetch_default_n_prompt(
383
+ list(STYLE_CONFIG_LIST.keys())[0])[0],
384
+ label="Negative prompt", lines=2)
385
+
386
+ with gr.Row():
387
+ sampler_dropdown = gr.Dropdown(label="Sampling method", choices=list(
388
+ scheduler_dict.keys()), value=list(scheduler_dict.keys())[0])
389
+ sample_step_slider = gr.Slider(
390
+ label="Sampling steps", value=20, minimum=10, maximum=100, step=1)
391
+
392
+ cfg_scale_slider = gr.Slider(
393
+ label="CFG Scale", value=7.5, minimum=0, maximum=20)
394
+
395
+ with gr.Row():
396
+ seed_textbox = gr.Textbox(label="Seed", value=-1)
397
+ seed_button = gr.Button(
398
+ value="\U0001F3B2", elem_classes="toolbutton")
399
+ seed_button.click(
400
+ fn=lambda *args: random.randint(1, int(1e8)),
401
+ outputs=[seed_textbox],
402
+ queue=False
403
+ )
404
+
405
+ generate_button = gr.Button(
406
+ value="Generate", variant='primary')
407
+
408
+ result_video = gr.Video(
409
+ label="Generated Animation", interactive=False)
410
+
411
+ style_dropdown.change(fn=controller.fetch_default_n_prompt,
412
+ inputs=[style_dropdown],
413
+ outputs=[negative_prompt_textbox, ip_adapter_scale], queue=False)
414
+
415
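+ # wire the Generate button to AnimateController.animate with every UI control as input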
+ generate_button.click(
416
+ fn=controller.animate,
417
+ inputs=[
418
+ init_img,
419
+ motion_scale_silder,
420
+ prompt_textbox,
421
+ negative_prompt_textbox,
422
+ sampler_dropdown,
423
+ sample_step_slider,
424
+ cfg_scale_slider,
425
+ seed_textbox,
426
+ ip_adapter_scale,
427
+ style_dropdown,
428
+ ],
429
+ outputs=[result_video]
430
+ )
431
+
432
+ return demo
433
+
434
+
435
+ if __name__ == "__main__":
436
+ demo = ui()
437
+ demo.queue(max_size=10)
438
+ demo.launch(server_name=args.server_name,
439
+ server_port=args.port, share=args.share,
440
+ max_threads=10,
441
+ allowed_paths=['pia.png'])
app-huggingface.py ADDED
@@ -0,0 +1,525 @@
1
+ import json
2
+ import os
3
+ import os.path as osp
4
+ import random
5
+ from argparse import ArgumentParser
6
+ from datetime import datetime
7
+
8
+ import gradio as gr
9
+ import numpy as np
10
+ import torch
11
+ from huggingface_hub import hf_hub_download
12
+ from omegaconf import OmegaConf
13
+ from PIL import Image
14
+
15
+ from animatediff.pipelines import I2VPipeline
16
+ from animatediff.utils.util import RANGE_LIST, save_videos_grid
17
+
18
+ sample_idx = 0
19
+
20
+ css = """
21
+ .toolbutton {
22
+ margin-bottom: 0em;
23
+ max-width: 2.5em;
24
+ min-width: 2.5em !important;
25
+ height: 2.5em;
26
+ }
27
+ """
28
+
29
+ parser = ArgumentParser()
30
+ parser.add_argument('--config', type=str, default='example/config/base.yaml')
31
+ parser.add_argument('--server-name', type=str, default='0.0.0.0')
32
+ parser.add_argument('--port', type=int, default=7860)
33
+ parser.add_argument('--share', action='store_true')
34
+ parser.add_argument('--local-debug', action='store_true')
35
+
36
+ parser.add_argument('--save-path', default='samples')
37
+
38
+ args = parser.parse_args()
39
+ LOCAL_DEBUG = args.local_debug
40
+
41
+
42
+ BASE_CONFIG = 'example/config/base.yaml'
43
+ STYLE_CONFIG_LIST = {
44
+ '3d_cartoon': './example/openxlab/3-3d.yaml',
45
+ 'realistic': './example/openxlab/1-realistic.yaml',
46
+ }
47
+
48
+
49
+ # download models
50
+ PIA_PATH = './models/PIA'
51
+ VAE_PATH = './models/VAE'
52
+ DreamBooth_LoRA_PATH = './models/DreamBooth_LoRA'
53
+
54
+
55
+ if not LOCAL_DEBUG:
56
+ CACHE_PATH = './models'
57
+
58
+ PIA_PATH = osp.join(CACHE_PATH, 'PIA')
59
+ VAE_PATH = osp.join(CACHE_PATH, 'VAE')
60
+ DreamBooth_LoRA_PATH = osp.join(CACHE_PATH, 'DreamBooth_LoRA')
61
+ STABLE_DIFFUSION_PATH = osp.join(CACHE_PATH, 'StableDiffusion')
62
+
63
+ os.makedirs(PIA_PATH, exist_ok=True)
64
+ os.makedirs(VAE_PATH, exist_ok=True)
65
+ os.makedirs(DreamBooth_LoRA_PATH, exist_ok=True)
66
+ os.makedirs(STABLE_DIFFUSION_PATH, exist_ok=True)
67
+
68
+ # `hf_hub_download` takes `repo_id`/`filename`/`local_dir`; the checkpoint name
+ # below is an assumption inferred from the `pia.ckpt` path used when building pipelines.
+ hf_hub_download(repo_id='leoxing/PIA-pruned',
69
+ filename='pia.ckpt', local_dir=PIA_PATH)
70
+ os.system('bash download_bashscripts/1-RealisticVision.sh')
71
+ os.system('bash download_bashscripts/2-RcnzCartoon.sh')
72
+ print(os.listdir(DreamBooth_LoRA_PATH))
73
+
74
+ # unet
75
+ unet_full_path = hf_hub_download(repo_id='runwayml/stable-diffusion-v1-5',
76
+ subfolder='unet', filename='diffusion_pytorch_model.bin',
77
+ cache_dir='models/StableDiffusion')
78
+ STABLE_DIFFUSION_PATH = '/'.join(unet_full_path.split('/')[:-2])
79
+ hf_hub_download(repo_id='runwayml/stable-diffusion-v1-5',
80
+ subfolder='unet', filename='config.json',
81
+ cache_dir='models/StableDiffusion')
82
+
83
+ # vae
84
+ hf_hub_download(repo_id='runwayml/stable-diffusion-v1-5',
85
+ subfolder='vae', filename='config.json',
86
+ cache_dir='models/StableDiffusion')
87
+ hf_hub_download(repo_id='runwayml/stable-diffusion-v1-5',
88
+ subfolder='vae', filename='diffusion_pytorch_model.bin',
89
+ cache_dir='models/StableDiffusion')
90
+
91
+ # text encoder
92
+ hf_hub_download(repo_id='runwayml/stable-diffusion-v1-5',
93
+ subfolder='text_encoder', filename='config.json',
94
+ cache_dir='models/StableDiffusion')
95
+ hf_hub_download(repo_id='runwayml/stable-diffusion-v1-5',
96
+ subfolder='text_encoder', filename='pytorch_model.bin',
97
+ cache_dir='models/StableDiffusion')
98
+
99
+ # tokenizer
100
+ hf_hub_download(repo_id='runwayml/stable-diffusion-v1-5',
101
+ subfolder='tokenizer', filename='merges.txt',
102
+ cache_dir='models/StableDiffusion')
103
+ hf_hub_download(repo_id='runwayml/stable-diffusion-v1-5',
104
+ subfolder='tokenizer', filename='special_tokens_map.json',
105
+ cache_dir='models/StableDiffusion')
106
+ hf_hub_download(repo_id='runwayml/stable-diffusion-v1-5',
107
+ subfolder='tokenizer', filename='tokenizer_config.json',
108
+ cache_dir='models/StableDiffusion')
109
+ hf_hub_download(repo_id='runwayml/stable-diffusion-v1-5',
110
+ subfolder='tokenizer', filename='vocab.json',
111
+ cache_dir='models/StableDiffusion')
112
+
113
+ # scheduler
114
+ hf_hub_download(repo_id='runwayml/stable-diffusion-v1-5',
115
+ subfolder='scheduler', filename='scheduler_config.json',
116
+ cache_dir='models/StableDiffusion')
117
+
118
+ # model index
119
+ hf_hub_download(repo_id='runwayml/stable-diffusion-v1-5', filename='model_index.json',
120
+ cache_dir='models/StableDiffusion')
121
+
122
+ else:
123
+ PIA_PATH = './models/PIA'
124
+ VAE_PATH = './models/VAE'
125
+ DreamBooth_LoRA_PATH = './models/DreamBooth_LoRA'
126
+ STABLE_DIFFUSION_PATH = './models/StableDiffusion/sd15'
127
+
128
+
129
+ def preprocess_img(img_np, max_size: int = 512):
130
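+ """Resize so the longer side is at most `max_size` and both sides are multiples of 8 (the resolution step the UNet/VAE expect)."""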
+
131
+ ori_image = Image.fromarray(img_np).convert('RGB')
132
+
133
+ width, height = ori_image.size
134
+
135
+ long_edge = max(width, height)
136
+ if long_edge > max_size:
137
+ scale_factor = max_size / long_edge
138
+ else:
139
+ scale_factor = 1
140
+ width = int(width * scale_factor)
141
+ height = int(height * scale_factor)
142
+ ori_image = ori_image.resize((width, height))
143
+
144
+ if (width % 8 != 0) or (height % 8 != 0):
145
+ in_width = (width // 8) * 8
146
+ in_height = (height // 8) * 8
147
+ else:
148
+ in_width = width
149
+ in_height = height
150
+ in_image = ori_image
151
+
152
+ in_image = ori_image.resize((in_width, in_height))
153
+ in_image_np = np.array(in_image)
154
+ return in_image_np, in_height, in_width
155
+
156
+
157
+ class AnimateController:
158
+ def __init__(self):
159
+
160
+ # config dirs
161
+ self.basedir = os.getcwd()
162
+ self.savedir = os.path.join(
163
+ self.basedir, args.save_path, datetime.now().strftime("Gradio-%Y-%m-%dT%H-%M-%S"))
164
+ self.savedir_sample = os.path.join(self.savedir, "sample")
165
+ os.makedirs(self.savedir, exist_ok=True)
166
+
167
+ self.inference_config = OmegaConf.load(args.config)
168
+ self.style_configs = {k: OmegaConf.load(
169
+ v) for k, v in STYLE_CONFIG_LIST.items()}
170
+
171
+ self.pipeline_dict = self.load_model_list()
172
+
173
+ def load_model_list(self):
174
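+ # build one I2VPipeline per style config, resolving optional DreamBooth, LoRA and VAE weights under the local model folders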
+ pipeline_dict = dict()
175
+ for style, cfg in self.style_configs.items():
176
+ dreambooth_path = cfg.get('dreambooth', 'none')
177
+ if dreambooth_path and dreambooth_path.upper() != 'NONE':
178
+ dreambooth_path = osp.join(
179
+ DreamBooth_LoRA_PATH, dreambooth_path)
180
+ lora_path = cfg.get('lora', None)
181
+ if lora_path is not None:
182
+ lora_path = osp.join(DreamBooth_LoRA_PATH, lora_path)
183
+ lora_alpha = cfg.get('lora_alpha', 0.0)
184
+ vae_path = cfg.get('vae', None)
185
+ if vae_path is not None:
186
+ vae_path = osp.join(VAE_PATH, vae_path)
187
+
188
+ pipeline_dict[style] = I2VPipeline.build_pipeline(
189
+ self.inference_config,
190
+ STABLE_DIFFUSION_PATH,
191
+ unet_path=osp.join(PIA_PATH, 'pia.ckpt'),
192
+ dreambooth_path=dreambooth_path,
193
+ lora_path=lora_path,
194
+ lora_alpha=lora_alpha,
195
+ vae_path=vae_path,
196
+ ip_adapter_path='h94/IP-Adapter',
197
+ ip_adapter_scale=0.1)
198
+ return pipeline_dict
199
+
200
+ def fetch_default_n_prompt(self, style: str):
201
+ cfg = self.style_configs[style]
202
+ n_prompt = cfg.get('n_prompt', '')
203
+ ip_adapter_scale = cfg.get('real_ip_adapter_scale', 0)
204
+
205
+ gr.Info('Set default negative prompt and ip_adapter_scale.')
206
+ print('Set default negative prompt and ip_adapter_scale.')
207
+
208
+ return n_prompt, ip_adapter_scale
209
+
210
+ def animate(
211
+ self,
212
+ init_img,
213
+ motion_scale,
214
+ prompt_textbox,
215
+ negative_prompt_textbox,
216
+ sample_step_slider,
217
+ cfg_scale_slider,
218
+ seed_textbox,
219
+ ip_adapter_scale,
220
+ style,
221
+ progress=gr.Progress(),
222
+ ):
223
+
224
+ if seed_textbox != -1 and seed_textbox != "":
225
+ torch.manual_seed(int(seed_textbox))
226
+ else:
227
+ torch.seed()
228
+ seed = torch.initial_seed()
229
+
230
+ pipeline = self.pipeline_dict[style]
231
+ init_img, h, w = preprocess_img(init_img)
232
+ sample = pipeline(
233
+ image=init_img,
234
+ prompt=prompt_textbox,
235
+ negative_prompt=negative_prompt_textbox,
236
+ num_inference_steps=sample_step_slider,
237
+ guidance_scale=cfg_scale_slider,
238
+ width=w,
239
+ height=h,
240
+ video_length=16,
241
+ mask_sim_template_idx=motion_scale - 1,
242
+ ip_adapter_scale=ip_adapter_scale,
243
+ progress_fn=progress,
244
+ ).videos
245
+
246
+ save_sample_path = os.path.join(
247
+ self.savedir_sample, f"{sample_idx}.mp4")
248
+ save_videos_grid(sample, save_sample_path)
249
+
250
+ sample_config = {
251
+ "prompt": prompt_textbox,
252
+ "n_prompt": negative_prompt_textbox,
253
+ "num_inference_steps": sample_step_slider,
254
+ "guidance_scale": cfg_scale_slider,
255
+ "width": w,
256
+ "height": h,
257
+ "seed": seed,
258
+ "motion": motion_scale,
259
+ }
260
+ json_str = json.dumps(sample_config, indent=4)
261
+ with open(os.path.join(self.savedir, "logs.json"), "a") as f:
262
+ f.write(json_str)
263
+ f.write("\n\n")
264
+
265
+ return save_sample_path
266
+
267
+
268
+ controller = AnimateController()
269
+
270
+
271
+ def ui():
272
+ with gr.Blocks(css=css) as demo:
273
+
274
+ gr.HTML(
275
+ "<div align='center'><font size='7'> <img src=\"file/pia.png\" style=\"height: 72px;\"/ > Your Personalized Image Animator</font></div>"
276
+ "<div align='center'><font size='7'>via Plug-and-Play Modules in Text-to-Image Models </font></div>"
277
+ )
278
+ with gr.Row():
279
+ gr.Markdown(
280
+ "<div align='center'><font size='5'><a href='https://pi-animator.github.io/'>Project Page</a> &ensp;" # noqa
281
+ "<a href='https://arxiv.org/abs/2312.13964/'>Paper</a> &ensp;"
282
+ "<a href='https://github.com/open-mmlab/PIA'>Code</a> &ensp;" # noqa
283
+ "Try More Style: <a href='https://openxlab.org.cn/apps/detail/zhangyiming/PiaPia-AnimationStyle'>Click here! </a></font></div>" # noqa
284
+ )
285
+
286
+ with gr.Row(equal_height=False):
287
+ with gr.Column():
288
+ with gr.Row():
289
+ init_img = gr.Image(label='Input Image')
290
+
291
+ style_dropdown = gr.Dropdown(label='Style', choices=list(
292
+ STYLE_CONFIG_LIST.keys()), value=list(STYLE_CONFIG_LIST.keys())[0])
293
+
294
+ with gr.Row():
295
+ prompt_textbox = gr.Textbox(label="Prompt", lines=1)
296
+ gift_button = gr.Button(
297
+ value='🎁', elem_classes='toolbutton'
298
+ )
299
+
300
+ def append_gift(prompt):
301
+ rand = random.randint(0, 2)
302
+ if rand == 1:
303
+ prompt = prompt + 'wearing santa hats'
304
+ elif rand == 2:
305
+ prompt = prompt + 'lift a Christmas gift'
306
+ else:
307
+ prompt = prompt + 'in Christmas suit, lift a Christmas gift'
308
+ gr.Info('Merry Christmas! Add magic to your prompt!')
309
+ return prompt
310
+
311
+ gift_button.click(
312
+ fn=append_gift,
313
+ inputs=[prompt_textbox],
314
+ outputs=[prompt_textbox],
315
+ )
316
+
317
+ motion_scale_silder = gr.Slider(
318
+ label='Motion Scale (Larger value means larger motion but less identity consistency)',
319
+ value=1, step=1, minimum=1, maximum=len(RANGE_LIST))
320
+ ip_adapter_scale = gr.Slider(
321
+ label='IP-Adapter Scale', value=controller.fetch_default_n_prompt(
322
+ list(STYLE_CONFIG_LIST.keys())[0])[1], minimum=0, maximum=1)
323
+
324
+ with gr.Accordion('Advanced Options', open=False):
325
+ negative_prompt_textbox = gr.Textbox(
326
+ value=controller.fetch_default_n_prompt(
327
+ list(STYLE_CONFIG_LIST.keys())[0])[0],
328
+ label="Negative prompt", lines=2)
329
+
330
+ sample_step_slider = gr.Slider(
331
+ label="Sampling steps", value=20, minimum=10, maximum=100, step=1)
332
+
333
+ cfg_scale_slider = gr.Slider(
334
+ label="CFG Scale", value=7.5, minimum=0, maximum=20)
335
+
336
+ with gr.Row():
337
+ seed_textbox = gr.Textbox(label="Seed", value=-1)
338
+ seed_button = gr.Button(
339
+ value="\U0001F3B2", elem_classes="toolbutton")
340
+ seed_button.click(
341
+ fn=lambda *args: random.randint(1, int(1e8)),
342
+ outputs=[seed_textbox],
343
+ queue=False
344
+ )
345
+
346
+ generate_button = gr.Button(
347
+ value="Generate", variant='primary')
348
+
349
+ result_video = gr.Video(
350
+ label="Generated Animation", interactive=False)
351
+
352
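+ # switching style reloads that style's default negative prompt and IP-Adapter scale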
+ style_dropdown.change(fn=controller.fetch_default_n_prompt,
353
+ inputs=[style_dropdown],
354
+ outputs=[negative_prompt_textbox,
355
+ ip_adapter_scale],
356
+ queue=False)
357
+
358
+ generate_button.click(
359
+ fn=controller.animate,
360
+ inputs=[
361
+ init_img,
362
+ motion_scale_silder,
363
+ prompt_textbox,
364
+ negative_prompt_textbox,
365
+ sample_step_slider,
366
+ cfg_scale_slider,
367
+ seed_textbox,
368
+ ip_adapter_scale,
369
+ style_dropdown,
370
+ ],
371
+ outputs=[result_video]
372
+ )
373
+
374
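+ # helper that registers a gr.Examples gallery whose rows fill the image, video, prompt and motion widgets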
+ def create_example(input_list):
375
+ return gr.Examples(
376
+ examples=input_list,
377
+ inputs=[
378
+ init_img,
379
+ result_video,
380
+ prompt_textbox,
381
+ negative_prompt_textbox,
382
+ style_dropdown,
383
+ motion_scale_silder,
384
+ ],
385
+ )
386
+
387
+ gr.Markdown(
388
+ '### Merry Christmas!'
389
+ )
390
+ create_example(
391
+ [
392
+ [
393
+ '__assets__/image_animation/yiming/yiming.jpeg',
394
+ '__assets__/image_animation/yiming/yiming.mp4',
395
+ '1boy in Christmas suit, lift a Christmas gift',
396
+ 'wrong white balance, dark, sketches,worst quality,low quality, deformed, distorted, disfigured, bad eyes, wrong lips,weird mouth, bad teeth, mutated hands and fingers, bad anatomy,wrong anatomy, amputation, extra limb, missing limb, floating,limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg',
397
+ '3d_cartoon',
398
+ 2,
399
+ ],
400
+ [
401
+ '__assets__/image_animation/yanhong/yanhong.png',
402
+ '__assets__/image_animation/yanhong/yanhong.mp4',
403
+ '1girl lift a Christmas gift',
404
+ 'wrong white balance, dark, sketches,worst quality,low quality, deformed, distorted, disfigured, bad eyes, wrong lips,weird mouth, bad teeth, mutated hands and fingers, bad anatomy,wrong anatomy, amputation, extra limb, missing limb, floating,limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg',
405
+ '3d_cartoon',
406
+ 2,
407
+ ],
408
+ ],
409
+
410
+ )
411
+
412
+ with gr.Accordion('More Examples for Style Transfer', open=False):
413
+ create_example([
414
+ [
415
+
416
+ '__assets__/image_animation/style_transfer/anya/anya.jpg',
417
+ '__assets__/image_animation/style_transfer/anya/2.mp4',
418
+ '1girl open mouth ',
419
+ 'wrong white balance, dark, sketches,worst quality,low quality, deformed, distorted, disfigured, bad eyes, wrong lips,weird mouth, bad teeth, mutated hands and fingers, bad anatomy,wrong anatomy, amputation, extra limb, missing limb, floating,limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg',
420
+ '3d_cartoon',
421
+ 3,
422
+ ],
423
+ [
424
+ '__assets__/image_animation/magnitude/genshin/genshin.jpg',
425
+ '__assets__/image_animation/magnitude/genshin/3.mp4',
426
+ 'cherry blossoms in the wind, raidenshogundef, yaemikodef, best quality, 4k',
427
+ 'wrong white balance, dark, sketches,worst quality,low quality, deformed, distorted, disfigured, bad eyes, wrong lips,weird mouth, bad teeth, mutated hands and fingers, bad anatomy,wrong anatomy, amputation, extra limb, missing limb, floating,limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg',
428
+ '3d_cartoon',
429
+ 3,
430
+ ],
431
+
432
+ ])
433
+
434
+ with gr.Accordion('More Examples for Prompt Changing', open=False):
435
+ create_example(
436
+ [
437
+ [
438
+ '__assets__/image_animation/real/lighthouse.jpg',
439
+ '__assets__/image_animation/real/1.mp4',
440
+ 'lightning, lighthouse',
441
+ 'wrong white balance, dark, sketches,worst quality,low quality, deformed, distorted, disfigured, bad eyes, wrong lips,weird mouth, bad teeth, mutated hands and fingers, bad anatomy,wrong anatomy, amputation, extra limb, missing limb, floating,limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg',
442
+ 'realistic',
443
+ 1,
444
+ ],
445
+ [
446
+ '__assets__/image_animation/real/lighthouse.jpg',
447
+ '__assets__/image_animation/real/2.mp4',
448
+ 'sun rising, lighthouse',
449
+ 'wrong white balance, dark, sketches,worst quality,low quality, deformed, distorted, disfigured, bad eyes, wrong lips,weird mouth, bad teeth, mutated hands and fingers, bad anatomy,wrong anatomy, amputation, extra limb, missing limb, floating,limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg',
450
+ 'realistic',
451
+ 1,
452
+ ],
453
+ [
454
+ '__assets__/image_animation/real/lighthouse.jpg',
455
+ '__assets__/image_animation/real/3.mp4',
456
+ 'fireworks, lighthouse',
457
+ 'wrong white balance, dark, sketches,worst quality,low quality, deformed, distorted, disfigured, bad eyes, wrong lips,weird mouth, bad teeth, mutated hands and fingers, bad anatomy,wrong anatomy, amputation, extra limb, missing limb, floating,limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg',
458
+ 'realistic',
459
+ 1,
460
+ ],
461
+ [
462
+ '__assets__/image_animation/rcnz/harry.png',
463
+ '__assets__/image_animation/rcnz/1.mp4',
464
+ '1boy smiling',
465
+ 'wrong white balance, dark, sketches,worst quality,low quality, deformed, distorted, disfigured, bad eyes, wrong lips,weird mouth, bad teeth, mutated hands and fingers, bad anatomy,wrong anatomy, amputation, extra limb, missing limb, floating,limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg',
466
+ '3d_cartoon',
467
+ 2
468
+ ],
469
+ [
470
+ '__assets__/image_animation/rcnz/harry.png',
471
+ '__assets__/image_animation/rcnz/2.mp4',
472
+ '1boy playing magic fire',
473
+ 'wrong white balance, dark, sketches,worst quality,low quality, deformed, distorted, disfigured, bad eyes, wrong lips,weird mouth, bad teeth, mutated hands and fingers, bad anatomy,wrong anatomy, amputation, extra limb, missing limb, floating,limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg',
474
+ '3d_cartoon',
475
+ 2
476
+ ],
477
+ [
478
+ '__assets__/image_animation/rcnz/harry.png',
479
+ '__assets__/image_animation/rcnz/3.mp4',
480
+ '1boy is waving hands',
481
+ 'wrong white balance, dark, sketches,worst quality,low quality, deformed, distorted, disfigured, bad eyes, wrong lips,weird mouth, bad teeth, mutated hands and fingers, bad anatomy,wrong anatomy, amputation, extra limb, missing limb, floating,limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg',
482
+ '3d_cartoon',
483
+ 2
484
+ ]
485
+ ])
486
+
487
+ with gr.Accordion('Examples for Motion Magnitude', open=False):
488
+ create_example(
489
+ [
490
+ [
491
+ '__assets__/image_animation/magnitude/labrador.png',
492
+ '__assets__/image_animation/magnitude/1.mp4',
493
+ 'cherry blossoms in the wind, raidenshogundef, yaemikodef, best quality, 4k',
494
+ 'wrong white balance, dark, sketches,worst quality,low quality, deformed, distorted, disfigured, bad eyes, wrong lips,weird mouth, bad teeth, mutated hands and fingers, bad anatomy,wrong anatomy, amputation, extra limb, missing limb, floating,limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg',
495
+ '3d_cartoon',
496
+ 1,
497
+ ],
498
+ [
499
+ '__assets__/image_animation/magnitude/labrador.png',
500
+ '__assets__/image_animation/magnitude/2.mp4',
501
+ 'cherry blossoms in the wind, raidenshogundef, yaemikodef, best quality, 4k',
502
+ 'wrong white balance, dark, sketches,worst quality,low quality, deformed, distorted, disfigured, bad eyes, wrong lips,weird mouth, bad teeth, mutated hands and fingers, bad anatomy,wrong anatomy, amputation, extra limb, missing limb, floating,limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg',
503
+ '3d_cartoon',
504
+ 2,
505
+ ],
506
+ [
507
+ '__assets__/image_animation/magnitude/labrador.png',
508
+ '__assets__/image_animation/magnitude/3.mp4',
509
+ 'cherry blossoms in the wind, raidenshogundef, yaemikodef, best quality, 4k',
510
+ 'wrong white balance, dark, sketches,worst quality,low quality, deformed, distorted, disfigured, bad eyes, wrong lips,weird mouth, bad teeth, mutated hands and fingers, bad anatomy,wrong anatomy, amputation, extra limb, missing limb, floating,limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg',
511
+ '3d_cartoon',
512
+ 3,
513
+ ]
514
+ ])
515
+
516
+ return demo
517
+
518
+
519
+ if __name__ == "__main__":
520
+ demo = ui()
521
+ demo.queue(max_size=10)
522
+ demo.launch(server_name=args.server_name,
523
+ server_port=args.port, share=args.share,
524
+ max_threads=40,
525
+ allowed_paths=['pia.png'])
app.py ADDED
@@ -0,0 +1,567 @@
1
+ import json
2
+ import os
3
+ import os.path as osp
4
+ import random
5
+ from argparse import ArgumentParser
6
+ from datetime import datetime
7
+
8
+ import gradio as gr
9
+ import numpy as np
10
+ import openxlab
11
+ import torch
12
+ from diffusers import DDIMScheduler, EulerDiscreteScheduler, PNDMScheduler
13
+ from omegaconf import OmegaConf
14
+ from openxlab.model import download
15
+ from PIL import Image
16
+
17
+ from animatediff.pipelines import I2VPipeline
18
+ from animatediff.utils.util import RANGE_LIST, save_videos_grid
19
+
20
+ sample_idx = 0
21
+ # scheduler_dict = {
22
+ # "DDIM": DDIMScheduler,
23
+ # "Euler": EulerDiscreteScheduler,
24
+ # "PNDM": PNDMScheduler,
25
+ # }
26
+
27
+ css = """
28
+ .toolbutton {
29
+ margin-bottom: 0em;
30
+ max-width: 2.5em;
31
+ min-width: 2.5em !important;
32
+ height: 2.5em;
33
+ }
34
+ """
35
+
36
+ parser = ArgumentParser()
37
+ parser.add_argument('--config', type=str, default='example/config/base.yaml')
38
+ parser.add_argument('--server-name', type=str, default='0.0.0.0')
39
+ parser.add_argument('--port', type=int, default=7860)
40
+ parser.add_argument('--share', action='store_true')
41
+ parser.add_argument('--local-debug', action='store_true')
42
+
43
+ parser.add_argument('--save-path', default='samples')
44
+
45
+ args = parser.parse_args()
46
+ LOCAL_DEBUG = args.local_debug
47
+
48
+
49
+ BASE_CONFIG = 'example/config/base.yaml'
50
+ STYLE_CONFIG_LIST = {
51
+ '3d_cartoon': './example/openxlab/3-3d.yaml',
52
+ 'realistic': './example/openxlab/1-realistic.yaml',
53
+ }
54
+
55
+
56
+ # download models
57
+ PIA_PATH = './models/PIA'
58
+ VAE_PATH = './models/VAE'
59
+ DreamBooth_LoRA_PATH = './models/DreamBooth_LoRA'
60
+
61
+
62
+ if not LOCAL_DEBUG:
63
+ CACHE_PATH = '/home/xlab-app-center/.cache/model'
64
+
65
+ PIA_PATH = osp.join(CACHE_PATH, 'PIA')
66
+ VAE_PATH = osp.join(CACHE_PATH, 'VAE')
67
+ DreamBooth_LoRA_PATH = osp.join(CACHE_PATH, 'DreamBooth_LoRA')
68
+ STABLE_DIFFUSION_PATH = osp.join(CACHE_PATH, 'StableDiffusion')
69
+
70
+ os.makedirs(PIA_PATH, exist_ok=True)
71
+ os.makedirs(VAE_PATH, exist_ok=True)
72
+ os.makedirs(DreamBooth_LoRA_PATH, exist_ok=True)
73
+ os.makedirs(STABLE_DIFFUSION_PATH, exist_ok=True)
74
+
75
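+ # log in to OpenXLab with credentials from the environment, then fetch the PIA, DreamBooth and SD 1.5 component weights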
+ openxlab.login(os.environ['OPENXLAB_AK'], os.environ['OPENXLAB_SK'])
76
+ download(model_repo='zhangyiming/PIA-pruned', model_name='PIA', output=PIA_PATH)
77
+ download(model_repo='zhangyiming/RCNZ_Cartoon_3d',
78
+ model_name='rcnz-cartoon-3d', output=DreamBooth_LoRA_PATH)
79
+ download(model_repo='zhangyiming/realisticVisionV51_v51VAE',
80
+ model_name='realisticVisionV51_v51VAE', output=DreamBooth_LoRA_PATH)
81
+ print(os.listdir(DreamBooth_LoRA_PATH))
82
+ # unet
83
+ download(model_repo='zhangyiming/runwayml_stable-diffusion-v1-5_Unet',
84
+ model_name='unet', output=osp.join(STABLE_DIFFUSION_PATH, 'unet'))
85
+ download(model_repo='zhangyiming/runwayml_stable-diffusion-v1-5_Unet',
86
+ model_name='config', output=osp.join(STABLE_DIFFUSION_PATH, 'unet'))
87
+
88
+ # vae
89
+ download(model_repo='zhangyiming/runwayml_stable-diffusion-v1-5_VAE',
90
+ model_name='vae', output=osp.join(STABLE_DIFFUSION_PATH, 'vae'))
91
+ download(model_repo='zhangyiming/runwayml_stable-diffusion-v1-5_VAE',
92
+ model_name='config', output=osp.join(STABLE_DIFFUSION_PATH, 'vae'))
93
+
94
+ # text encoder
95
+ download(model_repo='zhangyiming/runwayml_stable-diffusion-v1-5_TextEncod',
96
+ model_name='text_encoder', output=osp.join(STABLE_DIFFUSION_PATH, 'text_encoder'))
97
+ download(model_repo='zhangyiming/runwayml_stable-diffusion-v1-5_TextEncod',
98
+ model_name='config', output=osp.join(STABLE_DIFFUSION_PATH, 'text_encoder'))
99
+
100
+ # tokenizer
101
+ download(model_repo='zhangyiming/runwayml_stable-diffusion-v1-5_Tokenizer',
102
+ model_name='merge', output=osp.join(STABLE_DIFFUSION_PATH, 'tokenizer'))
103
+ download(model_repo='zhangyiming/runwayml_stable-diffusion-v1-5_Tokenizer',
104
+ model_name='special_tokens_map', output=osp.join(STABLE_DIFFUSION_PATH, 'tokenizer'))
105
+ download(model_repo='zhangyiming/runwayml_stable-diffusion-v1-5_Tokenizer',
106
+ model_name='tokenizer_config', output=osp.join(STABLE_DIFFUSION_PATH, 'tokenizer'))
107
+ download(model_repo='zhangyiming/runwayml_stable-diffusion-v1-5_Tokenizer',
108
+ model_name='vocab', output=osp.join(STABLE_DIFFUSION_PATH, 'tokenizer'))
109
+
110
+ # scheduler
111
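+ # the scheduler config and model_index.json are not downloaded, so minimal SD 1.5-style configs are written locally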
+ scheduler_dict = {
112
+ "_class_name": "PNDMScheduler",
113
+ "_diffusers_version": "0.6.0",
114
+ "beta_end": 0.012,
115
+ "beta_schedule": "scaled_linear",
116
+ "beta_start": 0.00085,
117
+ "num_train_timesteps": 1000,
118
+ "set_alpha_to_one": False,
119
+ "skip_prk_steps": True,
120
+ "steps_offset": 1,
121
+ "trained_betas": None,
122
+ "clip_sample": False
123
+ }
124
+ os.makedirs(osp.join(STABLE_DIFFUSION_PATH, 'scheduler'), exist_ok=True)
125
+ with open(osp.join(STABLE_DIFFUSION_PATH, 'scheduler', 'scheduler_config.json'), 'w') as file:
126
+ json.dump(scheduler_dict, file)
127
+
128
+ # model index
129
+ model_index_dict = {
130
+ "_class_name": "StableDiffusionPipeline",
131
+ "_diffusers_version": "0.6.0",
132
+ "feature_extractor": [
133
+ "transformers",
134
+ "CLIPImageProcessor"
135
+ ],
136
+ "safety_checker": [
137
+ "stable_diffusion",
138
+ "StableDiffusionSafetyChecker"
139
+ ],
140
+ "scheduler": [
141
+ "diffusers",
142
+ "PNDMScheduler"
143
+ ],
144
+ "text_encoder": [
145
+ "transformers",
146
+ "CLIPTextModel"
147
+ ],
148
+ "tokenizer": [
149
+ "transformers",
150
+ "CLIPTokenizer"
151
+ ],
152
+ "unet": [
153
+ "diffusers",
154
+ "UNet2DConditionModel"
155
+ ],
156
+ "vae": [
157
+ "diffusers",
158
+ "AutoencoderKL"
159
+ ]
160
+ }
161
+ with open(osp.join(STABLE_DIFFUSION_PATH, 'model_index.json'), 'w') as file:
162
+ json.dump(model_index_dict, file)
163
+
164
+ else:
165
+ PIA_PATH = './models/PIA'
166
+ VAE_PATH = './models/VAE'
167
+ DreamBooth_LoRA_PATH = './models/DreamBooth_LoRA'
168
+ STABLE_DIFFUSION_PATH = './models/StableDiffusion/sd15'
169
+
170
+
171
+ def preprocess_img(img_np, max_size: int = 512):
172
+
173
+ ori_image = Image.fromarray(img_np).convert('RGB')
174
+
175
+ width, height = ori_image.size
176
+
177
+ long_edge = max(width, height)
178
+ if long_edge > max_size:
179
+ scale_factor = max_size / long_edge
180
+ else:
181
+ scale_factor = 1
182
+ width = int(width * scale_factor)
183
+ height = int(height * scale_factor)
184
+ ori_image = ori_image.resize((width, height))
185
+
186
+ if (width % 8 != 0) or (height % 8 != 0):
187
+ in_width = (width // 8) * 8
188
+ in_height = (height // 8) * 8
189
+ else:
190
+ in_width = width
191
+ in_height = height
192
+ in_image = ori_image
193
+
194
+ in_image = ori_image.resize((in_width, in_height))
195
+ in_image_np = np.array(in_image)
196
+ return in_image_np, in_height, in_width
197
+
198
+
199
+ class AnimateController:
200
+ def __init__(self):
201
+
202
+ # config dirs
203
+ self.basedir = os.getcwd()
204
+ self.savedir = os.path.join(
205
+ self.basedir, args.save_path, datetime.now().strftime("Gradio-%Y-%m-%dT%H-%M-%S"))
206
+ self.savedir_sample = os.path.join(self.savedir, "sample")
207
+ os.makedirs(self.savedir, exist_ok=True)
208
+
209
+ self.inference_config = OmegaConf.load(args.config)
210
+ self.style_configs = {k: OmegaConf.load(
211
+ v) for k, v in STYLE_CONFIG_LIST.items()}
212
+
213
+ self.pipeline_dict = self.load_model_list()
214
+
215
+ def load_model_list(self):
216
+ pipeline_dict = dict()
217
+ for style, cfg in self.style_configs.items():
218
+ dreambooth_path = cfg.get('dreambooth', 'none')
219
+ if dreambooth_path and dreambooth_path.upper() != 'NONE':
220
+ dreambooth_path = osp.join(
221
+ DreamBooth_LoRA_PATH, dreambooth_path)
222
+ lora_path = cfg.get('lora', None)
223
+ if lora_path is not None:
224
+ lora_path = osp.join(DreamBooth_LoRA_PATH, lora_path)
225
+ lora_alpha = cfg.get('lora_alpha', 0.0)
226
+ vae_path = cfg.get('vae', None)
227
+ if vae_path is not None:
228
+ vae_path = osp.join(VAE_PATH, vae_path)
229
+
230
+ pipeline_dict[style] = I2VPipeline.build_pipeline(
231
+ self.inference_config,
232
+ STABLE_DIFFUSION_PATH,
233
+ unet_path=osp.join(PIA_PATH, 'pia.ckpt'),
234
+ dreambooth_path=dreambooth_path,
235
+ lora_path=lora_path,
236
+ lora_alpha=lora_alpha,
237
+ vae_path=vae_path,
238
+ ip_adapter_path='h94/IP-Adapter',
239
+ ip_adapter_scale=0.1)
240
+ return pipeline_dict
241
+
242
+ def fetch_default_n_prompt(self, style: str):
243
+ cfg = self.style_configs[style]
244
+ n_prompt = cfg.get('n_prompt', '')
245
+ ip_adapter_scale = cfg.get('real_ip_adapter_scale', 0)
246
+
247
+ gr.Info('Set default negative prompt and ip_adapter_scale.')
248
+ print('Set default negative prompt and ip_adapter_scale.')
249
+
250
+ return n_prompt, ip_adapter_scale
251
+
252
+ def animate(
253
+ self,
254
+ init_img,
255
+ motion_scale,
256
+ prompt_textbox,
257
+ negative_prompt_textbox,
258
+ sample_step_slider,
259
+ cfg_scale_slider,
260
+ seed_textbox,
261
+ ip_adapter_scale,
262
+ style,
263
+ progress=gr.Progress(),
264
+ ):
265
+
266
+ if seed_textbox != -1 and seed_textbox != "":
267
+ torch.manual_seed(int(seed_textbox))
268
+ else:
269
+ torch.seed()
270
+ seed = torch.initial_seed()
271
+
272
+ pipeline = self.pipeline_dict[style]
273
+ init_img, h, w = preprocess_img(init_img)
274
+ sample = pipeline(
275
+ image=init_img,
276
+ prompt=prompt_textbox,
277
+ negative_prompt=negative_prompt_textbox,
278
+ num_inference_steps=sample_step_slider,
279
+ guidance_scale=cfg_scale_slider,
280
+ width=w,
281
+ height=h,
282
+ video_length=16,
283
+ mask_sim_template_idx=motion_scale - 1,
284
+ ip_adapter_scale=ip_adapter_scale,
285
+ progress_fn=progress,
286
+ ).videos
287
+
288
+ save_sample_path = os.path.join(
289
+ self.savedir_sample, f"{sample_idx}.mp4")
290
+ save_videos_grid(sample, save_sample_path)
291
+
292
+ sample_config = {
293
+ "prompt": prompt_textbox,
294
+ "n_prompt": negative_prompt_textbox,
295
+ "num_inference_steps": sample_step_slider,
296
+ "guidance_scale": cfg_scale_slider,
297
+ "width": w,
298
+ "height": h,
299
+ "seed": seed,
300
+ "motion": motion_scale,
301
+ }
302
+ json_str = json.dumps(sample_config, indent=4)
303
+ with open(os.path.join(self.savedir, "logs.json"), "a") as f:
304
+ f.write(json_str)
305
+ f.write("\n\n")
306
+
307
+ return save_sample_path
308
+
309
+
310
+ controller = AnimateController()
311
+
312
+
313
+ def ui():
314
+ with gr.Blocks(css=css) as demo:
315
+
316
+ gr.HTML(
317
+ "<div align='center'><font size='7'> <img src=\"file/pia.png\" style=\"height: 72px;\"/ > Your Personalized Image Animator</font></div>"
318
+ "<div align='center'><font size='7'>via Plug-and-Play Modules in Text-to-Image Models </font></div>"
319
+ )
320
+ with gr.Row():
321
+ gr.Markdown(
322
+ "<div align='center'><font size='5'><a href='https://pi-animator.github.io/'>Project Page</a> &ensp;" # noqa
323
+ "<a href='https://arxiv.org/abs/2312.13964/'>Paper</a> &ensp;"
324
+ "<a href='https://github.com/open-mmlab/PIA'>Code</a> &ensp;" # noqa
325
+ "Try More Style: <a href='https://openxlab.org.cn/apps/detail/zhangyiming/PiaPia-AnimationStyle'>Click here! </a></font></div>" # noqa
326
+ )
327
+
328
+ with gr.Row(equal_height=False):
329
+ with gr.Column():
330
+ with gr.Row():
331
+ init_img = gr.Image(label='Input Image')
332
+
333
+ style_dropdown = gr.Dropdown(label='Style', choices=list(
334
+ STYLE_CONFIG_LIST.keys()), value=list(STYLE_CONFIG_LIST.keys())[0])
335
+
336
+ with gr.Row():
337
+ prompt_textbox = gr.Textbox(label="Prompt", lines=1)
338
+ gift_button = gr.Button(
339
+ value='🎁', elem_classes='toolbutton'
340
+ )
341
+
342
+ def append_gift(prompt):
343
+ rand = random.randint(0, 2)
344
+ if rand == 1:
345
+ prompt = prompt + 'wearing santa hats'
346
+ elif rand == 2:
347
+ prompt = prompt + 'lift a Christmas gift'
348
+ else:
349
+ prompt = prompt + 'in Christmas suit, lift a Christmas gift'
350
+ gr.Info('Merry Christmas! Add magic to your prompt!')
351
+ return prompt
352
+
353
+ gift_button.click(
354
+ fn=append_gift,
355
+ inputs=[prompt_textbox],
356
+ outputs=[prompt_textbox],
357
+ )
358
+
359
+ motion_scale_silder = gr.Slider(
360
+ label='Motion Scale (Larger value means larger motion but less identity consistency)',
361
+ value=1, step=1, minimum=1, maximum=len(RANGE_LIST))
362
+ ip_adapter_scale = gr.Slider(
363
+ label='IP-Adapter Scale', value=controller.fetch_default_n_prompt(
364
+ list(STYLE_CONFIG_LIST.keys())[0])[1], minimum=0, maximum=1)
365
+
366
+ with gr.Accordion('Advanced Options', open=False):
367
+ negative_prompt_textbox = gr.Textbox(
368
+ value=controller.fetch_default_n_prompt(
369
+ list(STYLE_CONFIG_LIST.keys())[0])[0],
370
+ label="Negative prompt", lines=2)
371
+
372
+ sample_step_slider = gr.Slider(
373
+ label="Sampling steps", value=20, minimum=10, maximum=100, step=1)
374
+
375
+ cfg_scale_slider = gr.Slider(
376
+ label="CFG Scale", value=7.5, minimum=0, maximum=20)
377
+
378
+ with gr.Row():
379
+ seed_textbox = gr.Textbox(label="Seed", value=-1)
380
+ seed_button = gr.Button(
381
+ value="\U0001F3B2", elem_classes="toolbutton")
382
+ seed_button.click(
383
+ fn=lambda *args: random.randint(1, int(1e8)),
384
+ outputs=[seed_textbox],
385
+ queue=False
386
+ )
387
+
388
+ generate_button = gr.Button(
389
+ value="Generate", variant='primary')
390
+
391
+ result_video = gr.Video(
392
+ label="Generated Animation", interactive=False)
393
+
394
+ style_dropdown.change(fn=controller.fetch_default_n_prompt,
395
+ inputs=[style_dropdown],
396
+ outputs=[negative_prompt_textbox,
397
+ ip_adapter_scale],
398
+ queue=False)
399
+
400
+ generate_button.click(
401
+ fn=controller.animate,
402
+ inputs=[
403
+ init_img,
404
+ motion_scale_silder,
405
+ prompt_textbox,
406
+ negative_prompt_textbox,
407
+ sample_step_slider,
408
+ cfg_scale_slider,
409
+ seed_textbox,
410
+ ip_adapter_scale,
411
+ style_dropdown,
412
+ ],
413
+ outputs=[result_video]
414
+ )
415
+
416
+ def create_example(input_list):
417
+ return gr.Examples(
418
+ examples=input_list,
419
+ inputs=[
420
+ init_img,
421
+ result_video,
422
+ prompt_textbox,
423
+ negative_prompt_textbox,
424
+ style_dropdown,
425
+ motion_scale_silder,
426
+ ],
427
+ )
428
+
429
+ gr.Markdown(
430
+ '### Merry Christmas!'
431
+ )
432
+ create_example(
433
+ [
434
+ [
435
+ '__assets__/image_animation/yiming/yiming.jpeg',
436
+ '__assets__/image_animation/yiming/yiming.mp4',
437
+ '1boy in Christmas suit, lift a Christmas gift',
438
+ 'wrong white balance, dark, sketches,worst quality,low quality, deformed, distorted, disfigured, bad eyes, wrong lips,weird mouth, bad teeth, mutated hands and fingers, bad anatomy,wrong anatomy, amputation, extra limb, missing limb, floating,limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg',
439
+ '3d_cartoon',
440
+ 2,
441
+ ],
442
+ [
443
+ '__assets__/image_animation/yanhong/yanhong.png',
444
+ '__assets__/image_animation/yanhong/yanhong.mp4',
445
+ '1girl lift a Christmas gift',
446
+ 'wrong white balance, dark, sketches,worst quality,low quality, deformed, distorted, disfigured, bad eyes, wrong lips,weird mouth, bad teeth, mutated hands and fingers, bad anatomy,wrong anatomy, amputation, extra limb, missing limb, floating,limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg',
447
+ '3d_cartoon',
448
+ 2,
449
+ ],
450
+ ],
451
+
452
+ )
453
+
454
+ with gr.Accordion('More Examples for Style Transfer', open=False):
455
+ create_example([
456
+ [
457
+
458
+ '__assets__/image_animation/style_transfer/anya/anya.jpg',
459
+ '__assets__/image_animation/style_transfer/anya/2.mp4',
460
+ '1girl open mouth ',
461
+ 'wrong white balance, dark, sketches,worst quality,low quality, deformed, distorted, disfigured, bad eyes, wrong lips,weird mouth, bad teeth, mutated hands and fingers, bad anatomy,wrong anatomy, amputation, extra limb, missing limb, floating,limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg',
462
+ '3d_cartoon',
463
+ 3,
464
+ ],
465
+ [
466
+ '__assets__/image_animation/magnitude/genshin/genshin.jpg',
467
+ '__assets__/image_animation/magnitude/genshin/3.mp4',
468
+ 'cherry blossoms in the wind, raidenshogundef, yaemikodef, best quality, 4k',
469
+ 'wrong white balance, dark, sketches,worst quality,low quality, deformed, distorted, disfigured, bad eyes, wrong lips,weird mouth, bad teeth, mutated hands and fingers, bad anatomy,wrong anatomy, amputation, extra limb, missing limb, floating,limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg',
470
+ '3d_cartoon',
471
+ 3,
472
+ ],
473
+
474
+ ])
475
+
476
+ with gr.Accordion('More Examples for Prompt Changing', open=False):
477
+ create_example(
478
+ [
479
+ [
480
+ '__assets__/image_animation/real/lighthouse.jpg',
481
+ '__assets__/image_animation/real/1.mp4',
482
+ 'lightning, lighthouse',
483
+ 'wrong white balance, dark, sketches,worst quality,low quality, deformed, distorted, disfigured, bad eyes, wrong lips,weird mouth, bad teeth, mutated hands and fingers, bad anatomy,wrong anatomy, amputation, extra limb, missing limb, floating,limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg',
484
+ 'realistic',
485
+ 1,
486
+ ],
487
+ [
488
+ '__assets__/image_animation/real/lighthouse.jpg',
489
+ '__assets__/image_animation/real/2.mp4',
490
+ 'sun rising, lighthouse',
491
+ 'wrong white balance, dark, sketches,worst quality,low quality, deformed, distorted, disfigured, bad eyes, wrong lips,weird mouth, bad teeth, mutated hands and fingers, bad anatomy,wrong anatomy, amputation, extra limb, missing limb, floating,limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg',
492
+ 'realistic',
493
+ 1,
494
+ ],
495
+ [
496
+ '__assets__/image_animation/real/lighthouse.jpg',
497
+ '__assets__/image_animation/real/3.mp4',
498
+ 'fireworks, lighthouse',
499
+ 'wrong white balance, dark, sketches,worst quality,low quality, deformed, distorted, disfigured, bad eyes, wrong lips,weird mouth, bad teeth, mutated hands and fingers, bad anatomy,wrong anatomy, amputation, extra limb, missing limb, floating,limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg',
500
+ 'realistic',
501
+ 1,
502
+ ],
503
+ [
504
+ '__assets__/image_animation/rcnz/harry.png',
505
+ '__assets__/image_animation/rcnz/1.mp4',
506
+ '1boy smiling',
507
+ 'wrong white balance, dark, sketches,worst quality,low quality, deformed, distorted, disfigured, bad eyes, wrong lips,weird mouth, bad teeth, mutated hands and fingers, bad anatomy,wrong anatomy, amputation, extra limb, missing limb, floating,limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg',
508
+ '3d_cartoon',
509
+ 2
510
+ ],
511
+ [
512
+ '__assets__/image_animation/rcnz/harry.png',
513
+ '__assets__/image_animation/rcnz/2.mp4',
514
+ '1boy playing magic fire',
515
+ 'wrong white balance, dark, sketches,worst quality,low quality, deformed, distorted, disfigured, bad eyes, wrong lips,weird mouth, bad teeth, mutated hands and fingers, bad anatomy,wrong anatomy, amputation, extra limb, missing limb, floating,limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg',
516
+ '3d_cartoon',
517
+ 2
518
+ ],
519
+ [
520
+ '__assets__/image_animation/rcnz/harry.png',
521
+ '__assets__/image_animation/rcnz/3.mp4',
522
+ '1boy is waving hands',
523
+ 'wrong white balance, dark, sketches,worst quality,low quality, deformed, distorted, disfigured, bad eyes, wrong lips,weird mouth, bad teeth, mutated hands and fingers, bad anatomy,wrong anatomy, amputation, extra limb, missing limb, floating,limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg',
524
+ '3d_cartoon',
525
+ 2
526
+ ]
527
+ ])
528
+
529
+ with gr.Accordion('Examples for Motion Magnitude', open=False):
530
+ create_example(
531
+ [
532
+ [
533
+ '__assets__/image_animation/magnitude/labrador.png',
534
+ '__assets__/image_animation/magnitude/1.mp4',
535
+ 'cherry blossoms in the wind, raidenshogundef, yaemikodef, best quality, 4k',
536
+ 'wrong white balance, dark, sketches,worst quality,low quality, deformed, distorted, disfigured, bad eyes, wrong lips,weird mouth, bad teeth, mutated hands and fingers, bad anatomy,wrong anatomy, amputation, extra limb, missing limb, floating,limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg',
537
+ '3d_cartoon',
538
+ 1,
539
+ ],
540
+ [
541
+ '__assets__/image_animation/magnitude/labrador.png',
542
+ '__assets__/image_animation/magnitude/2.mp4',
543
+ 'cherry blossoms in the wind, raidenshogundef, yaemikodef, best quality, 4k',
544
+ 'wrong white balance, dark, sketches,worst quality,low quality, deformed, distorted, disfigured, bad eyes, wrong lips,weird mouth, bad teeth, mutated hands and fingers, bad anatomy,wrong anatomy, amputation, extra limb, missing limb, floating,limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg',
545
+ '3d_cartoon',
546
+ 2,
547
+ ],
548
+ [
549
+ '__assets__/image_animation/magnitude/labrador.png',
550
+ '__assets__/image_animation/magnitude/3.mp4',
551
+ 'cherry blossoms in the wind, raidenshogundef, yaemikodef, best quality, 4k',
552
+ 'wrong white balance, dark, sketches,worst quality,low quality, deformed, distorted, disfigured, bad eyes, wrong lips,weird mouth, bad teeth, mutated hands and fingers, bad anatomy,wrong anatomy, amputation, extra limb, missing limb, floating,limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg',
553
+ '3d_cartoon',
554
+ 3,
555
+ ]
556
+ ])
557
+
558
+ return demo
559
+
560
+
561
+ if __name__ == "__main__":
562
+ demo = ui()
563
+ demo.queue(max_size=10)
564
+ demo.launch(server_name=args.server_name,
565
+ server_port=args.port, share=args.share,
566
+ max_threads=40,
567
+ allowed_paths=['pia.png'])
benchmark.py ADDED
@@ -0,0 +1,47 @@
1
+ import os
2
+ import argparse
3
+ from time import sleep
4
+ import subprocess
5
+ from concurrent.futures import ThreadPoolExecutor
6
+
7
+ """
8
+ Examples:
9
+ - Test AD-benchmark:
10
+ python benchmark.py --script=inference_ad.py --yaml_dir=configs/ad/
11
+
12
+ - Test Indomain:
13
+ python benchmark.py --script=inference.py --yaml_dir=configs/indomain/
14
+
15
+ - Test:
16
+ python benchmark.py --script=inference.py --yaml_dir=configs/indomain/myprompt --spot=True
17
+
18
+ - Test AnimateBench:
19
+ python benchmark.py --script=inference_new.py --yaml_dir=AnimateBench/config/
20
+
21
+ """
22
+
23
+ parser = argparse.ArgumentParser()
24
+ parser.add_argument('--yaml_dir', type=str, default='configs/indomain/myprompt_simple/')
25
+ parser.add_argument('--node', type=str, default=None)
26
+ parser.add_argument('--script', type=str, default='inference.py')
27
+ parser.add_argument('--dreambooth', nargs='+', default=['toon', 'maj', 'real', 'rc', 'ly'])
28
+ # parser.add_argument('--dreambooth', nargs='+', default=['toon'])
29
+ # note: argparse's type=bool treats any non-empty string as True, so only pass --spot=True when the spot quota is wanted
+ parser.add_argument('--spot', type=bool, default=False)
30
+ args = parser.parse_args()
31
+
32
+ def run_srun_command(command):
33
+ subprocess.run(command, shell=True)
34
+
35
+ executor = ThreadPoolExecutor()
36
+
37
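+ # submit one srun job per DreamBooth model; each job runs the chosen script with the matching YAML config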
+ for db in args.dreambooth:
38
+ if not args.spot:
39
+ command = f"srun -p mm_lol --gres=gpu:1 "
40
+ else:
41
+ command = f"srun -p mm_lol --gres=gpu:1 --quota=spot "
42
+ if args.node is not None:
43
+ command = command + f'-w {args.node} '
44
+ command = command + f"python {args.script} --config={os.path.join(args.yaml_dir, db + '.yaml')}"
45
+
46
+ executor.submit(run_srun_command, command)
47
+ sleep(1)
configs/indomain/base.yaml ADDED
@@ -0,0 +1,14 @@
1
+ generate:
2
+ model_path: "outputs/training32-2023-11-01T09-50-05/checkpoints/checkpoint81000.ckpt"
3
+
4
+ validation_data:
5
+ mask_sim_range: 0
6
+ cond_frame: 0
7
+
8
+ noise_scheduler_kwargs:
9
+ num_train_timesteps: 1000
10
+ beta_start: 0.00085
11
+ beta_end: 0.012
12
+ beta_schedule: "linear"
13
+ steps_offset: 1
14
+ clip_sample: false
configs/indomain/real.yaml ADDED
@@ -0,0 +1,45 @@
1
+ base: 'configs/indomain/base.yaml'
2
+ prompts:
3
+ - ['A city street filled with neon shining lights and fog.', 'A young woman in dark clothes is smiling.']
4
+ - ['A dramatic black sky with overcast moving clouds.', 'The wind and waves are lapping at the lighthouse on the cliff']
5
+
6
+
7
+ n_prompt:
8
+ - 'wrong white balance, dark, sketches,worst quality,low quality, deformed, distorted, disfigured, bad eyes, wrong lips,weird mouth, bad teeth, mutated hands and fingers, bad anatomy,wrong anatomy, amputation, extra limb, missing limb, floating,limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg'
9
+ validation_data:
10
+ input_name: 'init_image'
11
+ validation_input_path: 'benchmark_prompt/real/'
12
+ save_path: 'benchmark_prompt_result/real/'
13
+ num_inference_steps: 25
14
+ guidance_scale: 7.5
15
+ img_mask: ''
16
+
17
+ pretrained_model_path: "/mnt/petrelfs/zhangyiming/project/Image2Video-AnimateDiff/models/StableDiffusion/"
18
+ unet_additional_kwargs:
19
+ use_motion_module : true
20
+ motion_module_resolutions : [ 1,2,4,8 ]
21
+ unet_use_cross_frame_attention : false
22
+ unet_use_temporal_attention : false
23
+
24
+ motion_module_type: Vanilla
25
+ motion_module_kwargs:
26
+ num_attention_heads : 8
27
+ num_transformer_block : 1
28
+ attention_block_types : [ "Temporal_Self", "Temporal_Self" ]
29
+ temporal_position_encoding : true
30
+ temporal_position_encoding_max_len : 32
31
+ temporal_attention_dim_div : 1
32
+ zero_initialize : true
33
+
34
+
35
+ generate:
36
+ use_image: true
37
+ use_video: false
38
+ sample_size: 512
39
+ video_length: 16
40
+ global_seed: [2022, 2023]
41
+ use_lora: false
42
+ use_db: true
43
+ lora_path: "models/DreamBooth_LoRA/ink_lora.safetensors"
44
+ db_path: "models/DreamBooth_LoRA/real.safetensors"
45
+ lora_alpha: 0.8
configs/inference/inference.yaml ADDED
@@ -0,0 +1,26 @@
1
+ unet_additional_kwargs:
2
+ unet_use_cross_frame_attention: false
3
+ unet_use_temporal_attention: false
4
+ use_motion_module: true
5
+ motion_module_resolutions:
6
+ - 1
7
+ - 2
8
+ - 4
9
+ - 8
10
+ motion_module_mid_block: false
11
+ motion_module_decoder_only: false
12
+ motion_module_type: Vanilla
13
+ motion_module_kwargs:
14
+ num_attention_heads: 8
15
+ num_transformer_block: 1
16
+ attention_block_types:
17
+ - Temporal_Self
18
+ - Temporal_Self
19
+ temporal_position_encoding: true
20
+ temporal_position_encoding_max_len: 32
21
+ temporal_attention_dim_div: 1
22
+
23
+ noise_scheduler_kwargs:
24
+ beta_start: 0.00085
25
+ beta_end: 0.012
26
+ beta_schedule: "linear"
configs/prompts/1-ToonYou.yaml ADDED
@@ -0,0 +1,22 @@
1
+ ToonYou:
2
+ base: ""
3
+ path: "models/DreamBooth_LoRA/toonyou_beta3.safetensors"
4
+ motion_module:
5
+ - "models/Motion_Module/mm_sd_v14.ckpt"
6
+ - "models/Motion_Module/mm_sd_v15.ckpt"
7
+
8
+ seed: [10788741199826055526, 6520604954829636163, 6519455744612555650, 16372571278361863751]
9
+ steps: 25
10
+ guidance_scale: 7.5
11
+
12
+ prompt:
13
+ - "best quality, masterpiece, 1girl, looking at viewer, blurry background, upper body, contemporary, dress"
14
+ - "masterpiece, best quality, 1girl, solo, cherry blossoms, hanami, pink flower, white flower, spring season, wisteria, petals, flower, plum blossoms, outdoors, falling petals, white hair, black eyes,"
15
+ - "best quality, masterpiece, 1boy, formal, abstract, looking at viewer, masculine, marble pattern"
16
+ - "best quality, masterpiece, 1girl, cloudy sky, dandelion, contrapposto, alternate hairstyle,"
17
+
18
+ n_prompt:
19
+ - ""
20
+ - "badhandv4,easynegative,ng_deepnegative_v1_75t,verybadimagenegative_v1.3, bad-artist, bad_prompt_version2-neg, teeth"
21
+ - ""
22
+ - ""
configs/prompts/1.yaml ADDED
@@ -0,0 +1,20 @@
1
+ FilmVelvia:
2
+ base: "models/DreamBooth_LoRA/majicmixRealistic_v5.safetensors"
3
+ path: "models/DreamBooth_LoRA/FilmVelvia2.safetensors"
4
+ motion_module:
5
+ - "models/Motion_Module/mm_sd_v14.ckpt"
6
+ - "models/Motion_Module/mm_sd_v15.ckpt"
7
+
8
+ seed: [358675358833372813, 3519455280971923743, 11684545350557985081, 8696855302100399877]
9
+ steps: 25
10
+ guidance_scale: 7.5
11
+ lora_alpha: 0.6
12
+
13
+ prompt:
14
+ -
15
+
16
+ n_prompt:
17
+ - "cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg"
18
+ - "cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg"
19
+ - "wrong white balance, dark, cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg"
20
+ - "wrong white balance, dark, cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg"
configs/prompts/2-Lyriel.yaml ADDED
1
+ Lyriel:
2
+ base: ""
3
+ path: "models/DreamBooth_LoRA/lyriel_v16.safetensors"
4
+ motion_module:
5
+ - "models/Motion_Module/mm_sd_v14.ckpt"
6
+ - "models/Motion_Module/mm_sd_v15.ckpt"
7
+
8
+ seed: [10917152860782582783, 6399018107401806238, 15875751942533906793, 6653196880059936551]
9
+ steps: 25
10
+ guidance_scale: 7.5
11
+
12
+ prompt:
13
+ - "dark shot, epic realistic, portrait of halo, sunglasses, blue eyes, tartan scarf, white hair by atey ghailan, by greg rutkowski, by greg tocchini, by james gilleard, by joe fenton, by kaethe butcher, gradient yellow, black, brown and magenta color scheme, grunge aesthetic!!! graffiti tag wall background, art by greg rutkowski and artgerm, soft cinematic light, adobe lightroom, photolab, hdr, intricate, highly detailed, depth of field, faded, neutral colors, hdr, muted colors, hyperdetailed, artstation, cinematic, warm lights, dramatic light, intricate details, complex background, rutkowski, teal and orange"
14
+ - "A forbidden castle high up in the mountains, pixel art, intricate details2, hdr, intricate details, hyperdetailed5, natural skin texture, hyperrealism, soft light, sharp, game art, key visual, surreal"
15
+ - "dark theme, medieval portrait of a man sharp features, grim, cold stare, dark colors, Volumetric lighting, baroque oil painting by Greg Rutkowski, Artgerm, WLOP, Alphonse Mucha dynamic lighting hyperdetailed intricately detailed, hdr, muted colors, complex background, hyperrealism, hyperdetailed, amandine van ray"
16
+ - "As I have gone alone in there and with my treasures bold, I can keep my secret where and hint of riches new and old. Begin it where warm waters halt and take it in a canyon down, not far but too far to walk, put in below the home of brown."
17
+
18
+ n_prompt:
19
+ - "3d, cartoon, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry, artist name, young, loli, elf, 3d, illustration"
20
+ - "3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, girl, loli, young, large breasts, red eyes, muscular"
21
+ - "dof, grayscale, black and white, bw, 3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, girl, loli, young, large breasts, red eyes, muscular,badhandsv5-neg, By bad artist -neg 1, monochrome"
22
+ - "holding an item, cowboy, hat, cartoon, 3d, disfigured, bad art, deformed,extra limbs,close up,b&w, wierd colors, blurry, duplicate, morbid, mutilated, [out of frame], extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, ugly, blurry, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, out of frame, ugly, extra limbs, bad anatomy, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, mutated hands, fused fingers, too many fingers, long neck, Photoshop, video game, ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, mutation, mutated, extra limbs, extra legs, extra arms, disfigured, deformed, cross-eye, body out of frame, blurry, bad art, bad anatomy, 3d render"
configs/prompts/3-RcnzCartoon.yaml ADDED
@@ -0,0 +1,22 @@
+ RcnzCartoon:
+   base: ""
+   path: "models/DreamBooth_LoRA/rcnzCartoon3d_v10.safetensors"
+   motion_module:
+     - "models/Motion_Module/mm_sd_v14.ckpt"
+     - "models/Motion_Module/mm_sd_v15.ckpt"
+
+   seed: [16931037867122267877, 2094308009433392066, 4292543217695451092, 15572665120852309890]
+   steps: 25
+   guidance_scale: 7.5
+
+   prompt:
+     - "Jane Eyre with headphones, natural skin texture,4mm,k textures, soft cinematic light, adobe lightroom, photolab, hdr, intricate, elegant, highly detailed, sharp focus, cinematic look, soothing tones, insane details, intricate details, hyperdetailed, low contrast, soft cinematic light, dim colors, exposure blend, hdr, faded"
+     - "close up Portrait photo of muscular bearded guy in a worn mech suit, light bokeh, intricate, steel metal [rust], elegant, sharp focus, photo by greg rutkowski, soft lighting, vibrant colors, masterpiece, streets, detailed face"
+     - "absurdres, photorealistic, masterpiece, a 30 year old man with gold framed, aviator reading glasses and a black hooded jacket and a beard, professional photo, a character portrait, altermodern, detailed eyes, detailed lips, detailed face, grey eyes"
+     - "a golden labrador, warm vibrant colours, natural lighting, dappled lighting, diffused lighting, absurdres, highres,k, uhd, hdr, rtx, unreal, octane render, RAW photo, photorealistic, global illumination, subsurface scattering"
+
+   n_prompt:
+     - "deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, mutated hands and fingers, disconnected limbs, mutation, mutated, ugly, disgusting, blurry, amputation"
+     - "nude, cross eyed, tongue, open mouth, inside, 3d, cartoon, anime, sketches, worst quality, low quality, normal quality, lowres, normal quality, monochrome, grayscale, skin spots, acnes, skin blemishes, bad anatomy, red eyes, muscular"
+     - "easynegative, cartoon, anime, sketches, necklace, earrings worst quality, low quality, normal quality, bad anatomy, bad hands, shiny skin, error, missing fingers, extra digit, fewer digits, jpeg artifacts, signature, watermark, username, blurry, chubby, anorectic, bad eyes, old, wrinkled skin, red skin, photograph By bad artist -neg, big eyes, muscular face,"
+     - "beard, EasyNegative, lowres, chromatic aberration, depth of field, motion blur, blurry, bokeh, bad quality, worst quality, multiple arms, badhand"
configs/prompts/4-MajicMix.yaml ADDED
@@ -0,0 +1,22 @@
+ MajicMix:
+   base: ""
+   path: "models/DreamBooth_LoRA/majicmixRealistic_v5Preview.safetensors"
+   motion_module:
+     - "models/Motion_Module/mm_sd_v14.ckpt"
+     - "models/Motion_Module/mm_sd_v15.ckpt"
+
+   seed: [1572448948722921032, 1099474677988590681, 6488833139725635347, 18339859844376517918]
+   steps: 25
+   guidance_scale: 7.5
+
+   prompt:
+     - "1girl, offshoulder, light smile, shiny skin best quality, masterpiece, photorealistic"
+     - "best quality, masterpiece, photorealistic, 1boy, 50 years old beard, dramatic lighting"
+     - "best quality, masterpiece, photorealistic, 1girl, light smile, shirt with collars, waist up, dramatic lighting, from below"
+     - "male, man, beard, bodybuilder, skinhead,cold face, tough guy, cowboyshot, tattoo, french windows, luxury hotel masterpiece, best quality, photorealistic"
+
+   n_prompt:
+     - "ng_deepnegative_v1_75t, badhandv4, worst quality, low quality, normal quality, lowres, bad anatomy, bad hands, watermark, moles"
+     - "nsfw, ng_deepnegative_v1_75t,badhandv4, worst quality, low quality, normal quality, lowres,watermark, monochrome"
+     - "nsfw, ng_deepnegative_v1_75t,badhandv4, worst quality, low quality, normal quality, lowres,watermark, monochrome"
+     - "nude, nsfw, ng_deepnegative_v1_75t, badhandv4, worst quality, low quality, normal quality, lowres, bad anatomy, bad hands, monochrome, grayscale watermark, moles, people"
configs/prompts/5-RealisticVision.yaml ADDED
@@ -0,0 +1,22 @@
+ RealisticVision:
+   base: ""
+   path: "models/DreamBooth_LoRA/realisticVisionV20_v20.safetensors"
+   motion_module:
+     - "models/Motion_Module/mm_sd_v14.ckpt"
+     - "models/Motion_Module/mm_sd_v15.ckpt"
+
+   seed: [5658137986800322009, 12099779162349365895, 10499524853910852697, 16768009035333711932]
+   steps: 25
+   guidance_scale: 7.5
+
+   prompt:
+     - "b&w photo of 42 y.o man in black clothes, bald, face, half body, body, high detailed skin, skin pores, coastline, overcast weather, wind, waves, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3"
+     - "close up photo of a rabbit, forest, haze, halation, bloom, dramatic atmosphere, centred, rule of thirds, 200mm 1.4f macro shot"
+     - "photo of coastline, rocks, storm weather, wind, waves, lightning, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3"
+     - "night, b&w photo of old house, post apocalypse, forest, storm weather, wind, rocks, 8k uhd, dslr, soft lighting, high quality, film grain"
+
+   n_prompt:
+     - "semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck"
+     - "semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck"
+     - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation"
+     - "blur, haze, deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, art, mutated hands and fingers, deformed, distorted, disfigured, poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation"
configs/prompts/6-Tusun.yaml ADDED
@@ -0,0 +1,20 @@
+ Tusun:
+   base: "models/DreamBooth_LoRA/moonfilm_reality20.safetensors"
+   path: "models/DreamBooth_LoRA/TUSUN.safetensors"
+   motion_module:
+     - "models/Motion_Module/mm_sd_v14.ckpt"
+     - "models/Motion_Module/mm_sd_v15.ckpt"
+
+   seed: [10154078483724687116, 2664393535095473805, 4231566096207622938, 1713349740448094493]
+   steps: 25
+   guidance_scale: 7.5
+   lora_alpha: 0.6
+
+   prompt:
+     - "tusuncub with its mouth open, blurry, open mouth, fangs, photo background, looking at viewer, tongue, full body, solo, cute and lovely, Beautiful and realistic eye details, perfect anatomy, Nonsense, pure background, Centered-Shot, realistic photo, photograph, 4k, hyper detailed, DSLR, 24 Megapixels, 8mm Lens, Full Frame, film grain, Global Illumination, studio Lighting, Award Winning Photography, diffuse reflection, ray tracing"
+     - "cute tusun with a blurry background, black background, simple background, signature, face, solo, cute and lovely, Beautiful and realistic eye details, perfect anatomy, Nonsense, pure background, Centered-Shot, realistic photo, photograph, 4k, hyper detailed, DSLR, 24 Megapixels, 8mm Lens, Full Frame, film grain, Global Illumination, studio Lighting, Award Winning Photography, diffuse reflection, ray tracing"
+     - "cut tusuncub walking in the snow, blurry, looking at viewer, depth of field, blurry background, full body, solo, cute and lovely, Beautiful and realistic eye details, perfect anatomy, Nonsense, pure background, Centered-Shot, realistic photo, photograph, 4k, hyper detailed, DSLR, 24 Megapixels, 8mm Lens, Full Frame, film grain, Global Illumination, studio Lighting, Award Winning Photography, diffuse reflection, ray tracing"
+     - "character design, cyberpunk tusun kitten wearing astronaut suit, sci-fic, realistic eye color and details, fluffy, big head, science fiction, communist ideology, Cyborg, fantasy, intense angle, soft lighting, photograph, 4k, hyper detailed, portrait wallpaper, realistic, photo-realistic, DSLR, 24 Megapixels, Full Frame, vibrant details, octane render, finely detail, best quality, incredibly absurdres, robotic parts, rim light, vibrant details, luxurious cyberpunk, hyperrealistic, cable electric wires, microchip, full body"
+
+   n_prompt:
+     - "worst quality, low quality, deformed, distorted, disfigured, bad eyes, bad anatomy, disconnected limbs, wrong body proportions, low quality, worst quality, text, watermark, signatre, logo, illustration, painting, cartoons, ugly, easy_negative"
configs/prompts/7-FilmVelvia.yaml ADDED
@@ -0,0 +1,23 @@
+ FilmVelvia:
+   base: "models/DreamBooth_LoRA/majicmixRealistic_v5.safetensors"
+   path: "models/DreamBooth_LoRA/FilmVelvia2.safetensors"
+   motion_module:
+     - "models/Motion_Module/mm_sd_v14.ckpt"
+     - "models/Motion_Module/mm_sd_v15.ckpt"
+
+   seed: [358675358833372813, 3519455280971923743, 11684545350557985081, 8696855302100399877]
+   steps: 25
+   guidance_scale: 7.5
+   lora_alpha: 0.6
+
+   prompt:
+     - "a woman standing on the side of a road at night,girl, long hair, motor vehicle, car, looking at viewer, ground vehicle, night, hands in pockets, blurry background, coat, black hair, parted lips, bokeh, jacket, brown hair, outdoors, red lips, upper body, artist name"
+     - ", dark shot,0mm, portrait quality of a arab man worker,boy, wasteland that stands out vividly against the background of the desert, barren landscape, closeup, moles skin, soft light, sharp, exposure blend, medium shot, bokeh, hdr, high contrast, cinematic, teal and orange5, muted colors, dim colors, soothing tones, low saturation, hyperdetailed, noir"
+     - "fashion photography portrait of 1girl, offshoulder, fluffy short hair, soft light, rim light, beautiful shadow, low key, photorealistic, raw photo, natural skin texture, realistic eye and face details, hyperrealism, ultra high res, 4K, Best quality, masterpiece, necklace, cleavage, in the dark"
+     - "In this lighthearted portrait, a woman is dressed as a fierce warrior, armed with an arsenal of paintbrushes and palette knives. Her war paint is composed of thick, vibrant strokes of color, and her armor is made of paint tubes and paint-splattered canvases. She stands victoriously atop a mountain of conquered blank canvases, with a beautiful, colorful landscape behind her, symbolizing the power of art and creativity. bust Portrait, close-up, Bright and transparent scene lighting, "
+
+   n_prompt:
+     - "cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg"
+     - "cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg"
+     - "wrong white balance, dark, cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg"
+     - "wrong white balance, dark, cartoon, anime, sketches,worst quality, low quality, deformed, distorted, disfigured, bad eyes, wrong lips, weird mouth, bad teeth, mutated hands and fingers, bad anatomy, wrong anatomy, amputation, extra limb, missing limb, floating limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg"
configs/prompts/8-GhibliBackground.yaml ADDED
@@ -0,0 +1,20 @@
+ GhibliBackground:
+   base: "models/DreamBooth_LoRA/CounterfeitV30_25.safetensors"
+   path: "models/DreamBooth_LoRA/lora_Ghibli_n3.safetensors"
+   motion_module:
+     - "models/Motion_Module/mm_sd_v14.ckpt"
+     - "models/Motion_Module/mm_sd_v15.ckpt"
+
+   seed: [8775748474469046618, 5893874876080607656, 11911465742147695752, 12437784838692000640]
+   steps: 25
+   guidance_scale: 7.5
+   lora_alpha: 1.0
+
+   prompt:
+     - "best quality,single build,architecture, blue_sky, building,cloudy_sky, day, fantasy, fence, field, house, build,architecture,landscape, moss, outdoors, overgrown, path, river, road, rock, scenery, sky, sword, tower, tree, waterfall"
+     - "black_border, building, city, day, fantasy, ice, landscape, letterboxed, mountain, ocean, outdoors, planet, scenery, ship, snow, snowing, water, watercraft, waterfall, winter"
+     - ",mysterious sea area, fantasy,build,concept"
+     - "Tomb Raider,Scenography,Old building"
+
+   n_prompt:
+     - "easynegative,bad_construction,bad_structure,bad_wail,bad_windows,blurry,cloned_window,cropped,deformed,disfigured,error,extra_windows,extra_chimney,extra_door,extra_structure,extra_frame,fewer_digits,fused_structure,gross_proportions,jpeg_artifacts,long_roof,low_quality,structure_limbs,missing_windows,missing_doors,missing_roofs,mutated_structure,mutation,normal_quality,out_of_frame,owres,poorly_drawn_structure,poorly_drawn_house,signature,text,too_many_windows,ugly,username,uta,watermark,worst_quality"
configs/training/image_finetune.yaml ADDED
@@ -0,0 +1,48 @@
+ image_finetune: true
+
+ output_dir: "outputs"
+ pretrained_model_path: "models/StableDiffusion/stable-diffusion-v1-5"
+
+ noise_scheduler_kwargs:
+   num_train_timesteps: 1000
+   beta_start: 0.00085
+   beta_end: 0.012
+   beta_schedule: "scaled_linear"
+   steps_offset: 1
+   clip_sample: false
+
+ train_data:
+   csv_path: "/mnt/petrelfs/guoyuwei/projects/datasets/webvid/results_2M_val.csv"
+   video_folder: "/mnt/petrelfs/guoyuwei/projects/datasets/webvid/2M_val"
+   sample_size: 256
+
+ validation_data:
+   prompts:
+     - "Snow rocky mountains peaks canyon. Snow blanketed rocky mountains surround and shadow deep canyons."
+     - "A drone view of celebration with Christma tree and fireworks, starry sky - background."
+     - "Robot dancing in times square."
+     - "Pacific coast, carmel by the sea ocean and waves."
+   num_inference_steps: 25
+   guidance_scale: 8.
+
+ trainable_modules:
+   - "."
+
+ unet_checkpoint_path: ""
+
+ learning_rate: 1.e-5
+ train_batch_size: 50
+
+ max_train_epoch: -1
+ max_train_steps: 100
+ checkpointing_epochs: -1
+ checkpointing_steps: 60
+
+ validation_steps: 5000
+ validation_steps_tuple: [2, 50]
+
+ global_seed: 42
+ mixed_precision_training: true
+ enable_xformers_memory_efficient_attention: True
+
+ is_debug: False
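The `noise_scheduler_kwargs` block mirrors the constructor arguments of a diffusers scheduler. As a hedged illustration, outside the committed files, the snippet below loads this config with OmegaConf and builds a DDIMScheduler from those kwargs; whether the repo's trainer actually uses DDIM or another scheduler class is a detail of the training code, not of this config.

```python
# Minimal sketch (not part of this commit): consuming the training config
# above.  Assumes `omegaconf` and `diffusers` are installed; DDIMScheduler is
# shown because it accepts every key in `noise_scheduler_kwargs`, but the
# repo's trainer may construct a different scheduler class.
from omegaconf import OmegaConf
from diffusers import DDIMScheduler

config = OmegaConf.load("configs/training/image_finetune.yaml")

noise_scheduler = DDIMScheduler(
    **OmegaConf.to_container(config.noise_scheduler_kwargs)
)
print(noise_scheduler.config.beta_schedule)   # "scaled_linear"
print(config.output_dir, config.train_batch_size)
```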