update with instantmesh pipeline

Files changed (4) hide show

model_index.json +2 -2
pipeline.py +547 -0
unet/config.json +0 -73
unet/diffusion_pytorch_model.safetensors +0 -3

model_index.json CHANGED Viewed

@@ -106,8 +106,8 @@
     "CLIPTokenizer"
   ],
   "unet": [
-    "diffusers",
-    "UNet2DConditionModel"
   ],
   "vae": [
     "diffusers",

     "CLIPTokenizer"
   ],
   "unet": [
+    null,
+    null
   ],
   "vae": [
     "diffusers",

pipeline.py ADDED Viewed

	@@ -0,0 +1,547 @@

+from typing import Any, Dict, Optional
+from diffusers.schedulers import KarrasDiffusionSchedulers
+import numpy
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+import torch.distributed
+import transformers
+from collections import OrderedDict
+from PIL import Image
+from torchvision import transforms
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+from diffusers.utils import BaseOutput
+import rembg
+from torchvision.transforms import v2
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    DiffusionPipeline,
+    EulerAncestralDiscreteScheduler,
+    UNet2DConditionModel,
+)
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.models.attention_processor import (
+    Attention,
+    AttnProcessor,
+    XFormersAttnProcessor,
+    AttnProcessor2_0,
+)
+from diffusers.utils.import_utils import is_xformers_available
+def to_rgb_image(maybe_rgba: Image.Image):
+    if maybe_rgba.mode == "RGB":
+        return maybe_rgba
+    elif maybe_rgba.mode == "RGBA":
+        rgba = maybe_rgba
+        img = numpy.random.randint(
+            255, 256, size=[rgba.size[1], rgba.size[0], 3], dtype=numpy.uint8
+        )
+        img = Image.fromarray(img, "RGB")
+        img.paste(rgba, mask=rgba.getchannel("A"))
+        return img
+    else:
+        raise ValueError("Unsupported image type.", maybe_rgba.mode)
+class ReferenceOnlyAttnProc(torch.nn.Module):
+    def __init__(self, chained_proc, enabled=False, name=None) -> None:
+        super().__init__()
+        self.enabled = enabled
+        self.chained_proc = chained_proc
+        self.name = name
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        mode="w",
+        ref_dict: dict = None,
+        is_cfg_guidance=False,
+    ) -> Any:
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        if self.enabled and is_cfg_guidance:
+            res0 = self.chained_proc(
+                attn, hidden_states[:1], encoder_hidden_states[:1], attention_mask
+            )
+            hidden_states = hidden_states[1:]
+            encoder_hidden_states = encoder_hidden_states[1:]
+        if self.enabled:
+            if mode == "w":
+                ref_dict[self.name] = encoder_hidden_states
+            elif mode == "r":
+                encoder_hidden_states = torch.cat(
+                    [encoder_hidden_states, ref_dict.pop(self.name)], dim=1
+                )
+            elif mode == "m":
+                encoder_hidden_states = torch.cat(
+                    [encoder_hidden_states, ref_dict[self.name]], dim=1
+                )
+            else:
+                assert False, mode
+        res = self.chained_proc(
+            attn, hidden_states, encoder_hidden_states, attention_mask
+        )
+        if self.enabled and is_cfg_guidance:
+            res = torch.cat([res0, res])
+        return res
+class RefOnlyNoisedUNet(torch.nn.Module):
+    def __init__(
+        self,
+        unet: UNet2DConditionModel,
+        train_sched: DDPMScheduler,
+        val_sched: EulerAncestralDiscreteScheduler,
+    ) -> None:
+        super().__init__()
+        self.unet = unet
+        self.train_sched = train_sched
+        self.val_sched = val_sched
+        unet_lora_attn_procs = dict()
+        for name, _ in unet.attn_processors.items():
+            if torch.__version__ >= "2.0":
+                default_attn_proc = AttnProcessor2_0()
+            elif is_xformers_available():
+                default_attn_proc = XFormersAttnProcessor()
+            else:
+                default_attn_proc = AttnProcessor()
+            unet_lora_attn_procs[name] = ReferenceOnlyAttnProc(
+                default_attn_proc, enabled=name.endswith("attn1.processor"), name=name
+            )
+        unet.set_attn_processor(unet_lora_attn_procs)
+    def __getattr__(self, name: str):
+        try:
+            return super().__getattr__(name)
+        except AttributeError:
+            return getattr(self.unet, name)
+    def forward_cond(
+        self,
+        noisy_cond_lat,
+        timestep,
+        encoder_hidden_states,
+        class_labels,
+        ref_dict,
+        is_cfg_guidance,
+        **kwargs,
+    ):
+        if is_cfg_guidance:
+            encoder_hidden_states = encoder_hidden_states[1:]
+            class_labels = class_labels[1:]
+        self.unet(
+            noisy_cond_lat,
+            timestep,
+            encoder_hidden_states=encoder_hidden_states,
+            class_labels=class_labels,
+            cross_attention_kwargs=dict(mode="w", ref_dict=ref_dict),
+            **kwargs,
+        )
+    def forward(
+        self,
+        sample,
+        timestep,
+        encoder_hidden_states,
+        class_labels=None,
+        *args,
+        cross_attention_kwargs,
+        down_block_res_samples=None,
+        mid_block_res_sample=None,
+        **kwargs,
+    ):
+        cond_lat = cross_attention_kwargs["cond_lat"]
+        is_cfg_guidance = cross_attention_kwargs.get("is_cfg_guidance", False)
+        noise = torch.randn_like(cond_lat)
+        if self.training:
+            noisy_cond_lat = self.train_sched.add_noise(cond_lat, noise, timestep)
+            noisy_cond_lat = self.train_sched.scale_model_input(
+                noisy_cond_lat, timestep
+            )
+        else:
+            noisy_cond_lat = self.val_sched.add_noise(
+                cond_lat, noise, timestep.reshape(-1)
+            )
+            noisy_cond_lat = self.val_sched.scale_model_input(
+                noisy_cond_lat, timestep.reshape(-1)
+            )
+        ref_dict = {}
+        self.forward_cond(
+            noisy_cond_lat,
+            timestep,
+            encoder_hidden_states,
+            class_labels,
+            ref_dict,
+            is_cfg_guidance,
+            **kwargs,
+        )
+        weight_dtype = self.unet.dtype
+        return self.unet(
+            sample,
+            timestep,
+            encoder_hidden_states,
+            *args,
+            class_labels=class_labels,
+            cross_attention_kwargs=dict(
+                mode="r", ref_dict=ref_dict, is_cfg_guidance=is_cfg_guidance
+            ),
+            down_block_additional_residuals=(
+                [sample.to(dtype=weight_dtype) for sample in down_block_res_samples]
+                if down_block_res_samples is not None
+                else None
+            ),
+            mid_block_additional_residual=(
+                mid_block_res_sample.to(dtype=weight_dtype)
+                if mid_block_res_sample is not None
+                else None
+            ),
+            **kwargs,
+        )
+def scale_latents(latents):
+    latents = (latents - 0.22) * 0.75
+    return latents
+def unscale_latents(latents):
+    latents = latents / 0.75 + 0.22
+    return latents
+def scale_image(image):
+    image = image * 0.5 / 0.8
+    return image
+def unscale_image(image):
+    image = image / 0.5 * 0.8
+    return image
+class DepthControlUNet(torch.nn.Module):
+    def __init__(
+        self,
+        unet: RefOnlyNoisedUNet,
+        controlnet: Optional[diffusers.ControlNetModel] = None,
+        conditioning_scale=1.0,
+    ) -> None:
+        super().__init__()
+        self.unet = unet
+        if controlnet is None:
+            self.controlnet = diffusers.ControlNetModel.from_unet(unet.unet)
+        else:
+            self.controlnet = controlnet
+        DefaultAttnProc = AttnProcessor2_0
+        if is_xformers_available():
+            DefaultAttnProc = XFormersAttnProcessor
+        self.controlnet.set_attn_processor(DefaultAttnProc())
+        self.conditioning_scale = conditioning_scale
+    def __getattr__(self, name: str):
+        try:
+            return super().__getattr__(name)
+        except AttributeError:
+            return getattr(self.unet, name)
+    def forward(
+        self,
+        sample,
+        timestep,
+        encoder_hidden_states,
+        class_labels=None,
+        *args,
+        cross_attention_kwargs: dict,
+        **kwargs,
+    ):
+        cross_attention_kwargs = dict(cross_attention_kwargs)
+        control_depth = cross_attention_kwargs.pop("control_depth")
+        down_block_res_samples, mid_block_res_sample = self.controlnet(
+            sample,
+            timestep,
+            encoder_hidden_states=encoder_hidden_states,
+            controlnet_cond=control_depth,
+            conditioning_scale=self.conditioning_scale,
+            return_dict=False,
+        )
+        return self.unet(
+            sample,
+            timestep,
+            encoder_hidden_states=encoder_hidden_states,
+            down_block_res_samples=down_block_res_samples,
+            mid_block_res_sample=mid_block_res_sample,
+            cross_attention_kwargs=cross_attention_kwargs,
+        )
+class ModuleListDict(torch.nn.Module):
+    def __init__(self, procs: dict) -> None:
+        super().__init__()
+        self.keys = sorted(procs.keys())
+        self.values = torch.nn.ModuleList(procs[k] for k in self.keys)
+    def __getitem__(self, key):
+        return self.values[self.keys.index(key)]
+class SuperNet(torch.nn.Module):
+    """Container that exposes named sub-modules under their original names.
+    The entries of ``state_dict`` are sorted by key and stored in
+    ``self.layers`` (a ``ModuleList``), so internally they are addressed as
+    ``layers.<index>``. Two hooks translate between that index-based naming
+    and the original keys when the module's state is exported or loaded.
+    NOTE(review): the parameter is annotated as a dict of tensors, but its
+    values are placed in a ``ModuleList`` -- presumably they are modules;
+    confirm against callers.
+    """
+    def __init__(self, state_dict: Dict[str, torch.Tensor]):
+        super().__init__()
+        # Sort by key so the index assigned to each entry is deterministic.
+        state_dict = OrderedDict((k, state_dict[k]) for k in sorted(state_dict.keys()))
+        self.layers = torch.nn.ModuleList(state_dict.values())
+        # index -> original key, and original key -> index
+        self.mapping = dict(enumerate(state_dict.keys()))
+        self.rev_mapping = {v: k for k, v in enumerate(state_dict.keys())}
+        # .processor for unet, .self_attn for text encoder
+        self.split_keys = [".processor", ".self_attn"]
+        # we add a hook to state_dict() and load_state_dict() so that the
+        # naming fits with `unet.attn_processors`
+        def map_to(module, state_dict, *args, **kwargs):
+            # Build a new dict in which every "layers.<num>" prefix is
+            # replaced by the original key recorded in `module.mapping`.
+            new_state_dict = {}
+            for key, value in state_dict.items():
+                num = int(key.split(".")[1])  # 0 is always "layers"
+                new_key = key.replace(f"layers.{num}", module.mapping[num])
+                new_state_dict[new_key] = value
+            return new_state_dict
+        def remap_key(key, state_dict):
+            # Return the part of `key` that names the sub-module: everything
+            # up to and including the first matching split key, or else the
+            # first dotted component. `state_dict` is not used here.
+            for k in self.split_keys:
+                if k in key:
+                    return key.split(k)[0] + k
+            return key.split(".")[0]
+        def map_from(module, state_dict, *args, **kwargs):
+            # Rewrite the incoming dict in place: each original-name prefix
+            # becomes "layers.<index>". The key list is snapshotted first
+            # because entries are added and deleted during the loop.
+            all_keys = list(state_dict.keys())
+            for key in all_keys:
+                replace_key = remap_key(key, state_dict)
+                new_key = key.replace(
+                    replace_key, f"layers.{module.rev_mapping[replace_key]}"
+                )
+                state_dict[new_key] = state_dict[key]
+                del state_dict[key]
+        # Both registrations use underscore-prefixed (private) torch APIs.
+        self._register_state_dict_hook(map_to)
+        self._register_load_state_dict_pre_hook(map_from, with_module=True)
+class Zero123PlusPipelineOutput(BaseOutput):
+    """Result object returned by ``Zero123PlusPipeline.__call__``."""
+    # Generated views produced by the pipeline call.
+    images: torch.Tensor
+class Zero123PlusPipeline(diffusers.StableDiffusionPipeline):
+    tokenizer: transformers.CLIPTokenizer
+    text_encoder: transformers.CLIPTextModel
+    vision_encoder: transformers.CLIPVisionModelWithProjection
+    feature_extractor_clip: transformers.CLIPImageProcessor
+    unet: UNet2DConditionModel
+    scheduler: diffusers.schedulers.KarrasDiffusionSchedulers
+    vae: AutoencoderKL
+    ramping: nn.Linear
+    feature_extractor_vae: transformers.CLIPImageProcessor
+    depth_transforms_multi = transforms.Compose(
+        [transforms.ToTensor(), transforms.Normalize([0.5], [0.5])]
+    )
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        vision_encoder: transformers.CLIPVisionModelWithProjection,
+        feature_extractor_clip: CLIPImageProcessor,
+        feature_extractor_vae: CLIPImageProcessor,
+        ramping_coefficients: Optional[list] = None,
+        safety_checker=None,
+    ):
+        DiffusionPipeline.__init__(self)
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=None,
+            vision_encoder=vision_encoder,
+            feature_extractor_clip=feature_extractor_clip,
+            feature_extractor_vae=feature_extractor_vae,
+        )
+        self.register_to_config(ramping_coefficients=ramping_coefficients)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+    def prepare(self):
+        train_sched = DDPMScheduler.from_config(self.scheduler.config)
+        if isinstance(self.unet, UNet2DConditionModel):
+            self.unet = RefOnlyNoisedUNet(self.unet, train_sched, self.scheduler).eval()
+    def add_controlnet(
+        self,
+        controlnet: Optional[diffusers.ControlNetModel] = None,
+        conditioning_scale=1.0,
+    ):
+        self.prepare()
+        self.unet = DepthControlUNet(self.unet, controlnet, conditioning_scale)
+        return SuperNet(OrderedDict([("controlnet", self.unet.controlnet)]))
+    def encode_condition_image(self, image: torch.Tensor):
+        image = self.vae.encode(image).latent_dist.sample()
+        return image
+    @torch.no_grad()
+    def __call__(
+        self,
+        image: Image.Image = None,
+        prompt="",
+        *args,
+        num_images_per_prompt: Optional[int] = 1,
+        guidance_scale=4.0,
+        depth_image: Image.Image = None,
+        output_type: Optional[str] = "pil",
+        width=640,
+        height=960,
+        num_inference_steps=28,
+        return_dict=True,
+        **kwargs,
+    ):
+        self.prepare()
+        if image is None:
+            raise ValueError(
+                "Inputting embeddings not supported for this pipeline. Please pass an image."
+            )
+        assert not isinstance(image, torch.Tensor)
+        image = rembg.remove(image)
+        image = numpy.array(image)
+        alpha = numpy.where(image[..., 3] > 0)
+        y1, y2, x1, x2 = (
+            alpha[0].min(),
+            alpha[0].max(),
+            alpha[1].min(),
+            alpha[1].max(),
+        )
+        fg = image[y1:y2, x1:x2]
+        size = max(fg.shape[0], fg.shape[1])
+        ph0, pw0 = (size - fg.shape[0]) // 2, (size - fg.shape[1]) // 2
+        ph1, pw1 = size - fg.shape[0] - ph0, size - fg.shape[1] - pw0
+        image = numpy.pad(
+            fg,
+            ((ph0, ph1), (pw0, pw1), (0, 0)),
+            mode="constant",
+            constant_values=((0, 0), (0, 0), (0, 0)),
+        )
+        new_size = int(image.shape[0] / 0.85)
+        ph0, pw0 = (new_size - size) // 2, (new_size - size) // 2
+        ph1, pw1 = new_size - size - ph0, new_size - size - pw0
+        image = numpy.pad(
+            image,
+            ((ph0, ph1), (pw0, pw1), (0, 0)),
+            mode="constant",
+            constant_values=((0, 0), (0, 0), (0, 0)),
+        )
+        image = Image.fromarray(image)
+        # images = mv_pipeline(image).images[0]
+        image = to_rgb_image(image)
+        image_1 = self.feature_extractor_vae(
+            images=image, return_tensors="pt"
+        ).pixel_values
+        image_2 = self.feature_extractor_clip(
+            images=image, return_tensors="pt"
+        ).pixel_values
+        if depth_image is not None and hasattr(self.unet, "controlnet"):
+            depth_image = to_rgb_image(depth_image)
+            depth_image = self.depth_transforms_multi(depth_image).to(
+                device=self.unet.controlnet.device, dtype=self.unet.controlnet.dtype
+            )
+        image = image_1.to(device=self.vae.device, dtype=self.vae.dtype)
+        image_2 = image_2.to(device=self.vae.device, dtype=self.vae.dtype)
+        cond_lat = self.encode_condition_image(image)
+        if guidance_scale > 1:
+            negative_lat = self.encode_condition_image(torch.zeros_like(image))
+            cond_lat = torch.cat([negative_lat, cond_lat])
+        encoded = self.vision_encoder(image_2, output_hidden_states=False)
+        global_embeds = encoded.image_embeds
+        global_embeds = global_embeds.unsqueeze(-2)
+        if hasattr(self, "encode_prompt"):
+            encoder_hidden_states = self.encode_prompt(
+                prompt, self.device, num_images_per_prompt, False
+            )[0]
+        else:
+            encoder_hidden_states = self._encode_prompt(
+                prompt, self.device, num_images_per_prompt, False
+            )
+        ramp = global_embeds.new_tensor(self.config.ramping_coefficients).unsqueeze(-1)
+        encoder_hidden_states = encoder_hidden_states + global_embeds * ramp
+        cak = dict(cond_lat=cond_lat)
+        if hasattr(self.unet, "controlnet"):
+            cak["control_depth"] = depth_image
+        latents: torch.Tensor = (
+            super()
+            .__call__(
+                None,
+                *args,
+                cross_attention_kwargs=cak,
+                guidance_scale=guidance_scale,
+                num_images_per_prompt=num_images_per_prompt,
+                prompt_embeds=encoder_hidden_states,
+                num_inference_steps=num_inference_steps,
+                output_type="latent",
+                width=width,
+                height=height,
+                **kwargs,
+            )
+            .images
+        )
+        latents = unscale_latents(latents)
+        if not output_type == "latent":
+            image = unscale_image(
+                self.vae.decode(
+                    latents / self.vae.config.scaling_factor, return_dict=False
+                )[0]
+            )
+        else:
+            image = latents
+        image = self.image_processor.postprocess(image, output_type=output_type)
+        if not return_dict:
+            return (image,)
+        images = numpy.asarray(image[0], dtype=numpy.float32) / 255.0
+        images = torch.from_numpy(images).permute(2, 0, 1).contiguous().float()
+        n, m = 3, 2
+        c, h, w = images.shape
+        images = (
+            images.view(c, n, h // n, m, w // m).permute(1, 3, 0, 2, 4).contiguous()
+        )
+        images = images.view(n * m, c, h // n, w // m)
+        images = images.unsqueeze(0)
+        images = v2.functional.resize(
+            images, 320, interpolation=3, antialias=True
+        ).clamp(0, 1)
+        return Zero123PlusPipelineOutput(images=images)

unet/config.json DELETED Viewed

@@ -1,73 +0,0 @@
-{
-  "_class_name": "UNet2DConditionModel",
-  "_diffusers_version": "0.30.3",
-  "_name_or_path": "/home/dylan/.cache/huggingface/hub/models--sudo-ai--zero123plus-v1.2/snapshots/2da07e89919e1a130c9b5add1584c70c7aa065fd/unet",
-  "act_fn": "silu",
-  "addition_embed_type": null,
-  "addition_embed_type_num_heads": 64,
-  "addition_time_embed_dim": null,
-  "attention_head_dim": [
-    5,
-    10,
-    20,
-    20
-  ],
-  "attention_type": "default",
-  "block_out_channels": [
-    320,
-    640,
-    1280,
-    1280
-  ],
-  "center_input_sample": false,
-  "class_embed_type": null,
-  "class_embeddings_concat": false,
-  "conv_in_kernel": 3,
-  "conv_out_kernel": 3,
-  "cross_attention_dim": 1024,
-  "cross_attention_norm": null,
-  "down_block_types": [
-    "CrossAttnDownBlock2D",
-    "CrossAttnDownBlock2D",
-    "CrossAttnDownBlock2D",
-    "DownBlock2D"
-  ],
-  "downsample_padding": 1,
-  "dropout": 0.0,
-  "dual_cross_attention": false,
-  "encoder_hid_dim": null,
-  "encoder_hid_dim_type": null,
-  "flip_sin_to_cos": true,
-  "freq_shift": 0,
-  "in_channels": 4,
-  "layers_per_block": 2,
-  "mid_block_only_cross_attention": null,
-  "mid_block_scale_factor": 1,
-  "mid_block_type": "UNetMidBlock2DCrossAttn",
-  "norm_eps": 1e-05,
-  "norm_num_groups": 32,
-  "num_attention_heads": null,
-  "num_class_embeds": null,
-  "only_cross_attention": false,
-  "out_channels": 4,
-  "projection_class_embeddings_input_dim": null,
-  "resnet_out_scale_factor": 1.0,
-  "resnet_skip_time_act": false,
-  "resnet_time_scale_shift": "default",
-  "reverse_transformer_layers_per_block": null,
-  "sample_size": 96,
-  "time_cond_proj_dim": null,
-  "time_embedding_act_fn": null,
-  "time_embedding_dim": null,
-  "time_embedding_type": "positional",
-  "timestep_post_act": null,
-  "transformer_layers_per_block": 1,
-  "up_block_types": [
-    "UpBlock2D",
-    "CrossAttnUpBlock2D",
-    "CrossAttnUpBlock2D",
-    "CrossAttnUpBlock2D"
-  ],
-  "upcast_attention": false,
-  "use_linear_projection": true
-}

unet/diffusion_pytorch_model.safetensors DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c4cba18336cfeb369d18dca0b1af3b9268302d828d7eee871d22074d08b91b33
-size 1731904736