Update pipeline_imagedream.py

pipeline_imagedream.py  CHANGED  (+154 -3)
@@ -1,5 +1,6 @@
 from typing import Any, Callable, Dict, List, Optional, Union
 
+import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -17,6 +18,8 @@ except:
 
 from diffusers.image_processor import PipelineImageInput
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.models.attention import Attention
+from diffusers.models.attention_processor import AttnProcessor2_0
 from diffusers.pipelines.stable_diffusion.pipeline_output import (
     StableDiffusionPipelineOutput,
 )
@@ -37,9 +40,6 @@ from transformers import (
     CLIPVisionModel,
 )
 
-from attention_processor import add_imagedream_attn_processor
-from camera_utils import get_camera
-
 
 class ImageDreamPipeline(StableDiffusionPipeline):
     def __init__(
@@ -417,3 +417,154 @@ class ImageDreamPipeline(StableDiffusionPipeline):
         return StableDiffusionPipelineOutput(
             images=image, nsfw_content_detected=has_nsfw_concept
         )
+
+
+# fmt: off
+# Copied from ImageDream
+# https://github.com/bytedance/ImageDream/blob/main/extern/ImageDream/imagedream/camera_utils.py
+
+
+def create_camera_to_world_matrix(elevation, azimuth):
+    elevation = np.radians(elevation)
+    azimuth = np.radians(azimuth)
+    # Convert elevation and azimuth angles to Cartesian coordinates on a unit sphere
+    x = np.cos(elevation) * np.sin(azimuth)
+    y = np.sin(elevation)
+    z = np.cos(elevation) * np.cos(azimuth)
+
+    # Calculate camera position, target, and up vectors
+    camera_pos = np.array([x, y, z])
+    target = np.array([0, 0, 0])
+    up = np.array([0, 1, 0])
+
+    # Construct view matrix
+    forward = target - camera_pos
+    forward /= np.linalg.norm(forward)
+    right = np.cross(forward, up)
+    right /= np.linalg.norm(right)
+    new_up = np.cross(right, forward)
+    new_up /= np.linalg.norm(new_up)
+    cam2world = np.eye(4)
+    cam2world[:3, :3] = np.array([right, new_up, -forward]).T
+    cam2world[:3, 3] = camera_pos
+    return cam2world
+
+
+def convert_opengl_to_blender(camera_matrix):
+    if isinstance(camera_matrix, np.ndarray):
+        # Construct transformation matrix to convert from OpenGL space to Blender space
+        flip_yz = np.array([[1, 0, 0, 0], [0, 0, -1, 0], [0, 1, 0, 0], [0, 0, 0, 1]])
+        camera_matrix_blender = np.dot(flip_yz, camera_matrix)
+    else:
+        # Construct transformation matrix to convert from OpenGL space to Blender space
+        flip_yz = torch.tensor(
+            [[1, 0, 0, 0], [0, 0, -1, 0], [0, 1, 0, 0], [0, 0, 0, 1]]
+        )
+        if camera_matrix.ndim == 3:
+            flip_yz = flip_yz.unsqueeze(0)
+        camera_matrix_blender = torch.matmul(flip_yz.to(camera_matrix), camera_matrix)
+    return camera_matrix_blender
+
+
+def normalize_camera(camera_matrix):
+    """normalize the camera location onto a unit-sphere"""
+    if isinstance(camera_matrix, np.ndarray):
+        camera_matrix = camera_matrix.reshape(-1, 4, 4)
+        translation = camera_matrix[:, :3, 3]
+        translation = translation / (
+            np.linalg.norm(translation, axis=1, keepdims=True) + 1e-8
+        )
+        camera_matrix[:, :3, 3] = translation
+    else:
+        camera_matrix = camera_matrix.reshape(-1, 4, 4)
+        translation = camera_matrix[:, :3, 3]
+        translation = translation / (
+            torch.norm(translation, dim=1, keepdim=True) + 1e-8
+        )
+        camera_matrix[:, :3, 3] = translation
+    return camera_matrix.reshape(-1, 16)
+
+
+def get_camera(
+    num_frames,
+    elevation=15,
+    azimuth_start=0,
+    azimuth_span=360,
+    blender_coord=True,
+    extra_view=False,
+):
+    angle_gap = azimuth_span / num_frames
+    cameras = []
+    for azimuth in np.arange(azimuth_start, azimuth_span + azimuth_start, angle_gap):
+        camera_matrix = create_camera_to_world_matrix(elevation, azimuth)
+        if blender_coord:
+            camera_matrix = convert_opengl_to_blender(camera_matrix)
+        cameras.append(camera_matrix.flatten())
+
+    if extra_view:
+        dim = len(cameras[0])
+        cameras.append(np.zeros(dim))
+    return torch.tensor(np.stack(cameras, 0)).float()
+# fmt: on
+
+
+def add_imagedream_attn_processor(unet: UNet2DConditionModel) -> nn.Module:
+    attn_procs = {}
+    for key, attn_processor in unet.attn_processors.items():
+        if "attn1" in key:
+            attn_procs[key] = ImageDreamAttnProcessor2_0()
+        else:
+            attn_procs[key] = attn_processor
+    unet.set_attn_processor(attn_procs)
+    return unet
+
+
+class ImageDreamAttnProcessor2_0(AttnProcessor2_0):
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None,
+        num_views: int = 1,
+        *args,
+        **kwargs,
+    ):
+        if num_views == 1:
+            return super().__call__(
+                attn=attn,
+                hidden_states=hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=attention_mask,
+                temb=temb,
+                *args,
+                **kwargs,
+            )
+
+        input_ndim = hidden_states.ndim
+        B = hidden_states.size(0)
+        if B % num_views:
+            raise ValueError(
+                f"`batch_size`(got {B}) must be a multiple of `num_views`(got {num_views})."
+            )
+        real_B = B // num_views
+        if input_ndim == 4:
+            H, W = hidden_states.shape[2:]
+            hidden_states = hidden_states.reshape(real_B, -1, H, W).transpose(1, 2)
+        else:
+            hidden_states = hidden_states.reshape(real_B, -1, hidden_states.size(-1))
+        hidden_states = super().__call__(
+            attn=attn,
+            hidden_states=hidden_states,
+            encoder_hidden_states=encoder_hidden_states,
+            attention_mask=attention_mask,
+            temb=temb,
+            *args,
+            **kwargs,
+        )
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(B, -1, H, W)
+        else:
+            hidden_states = hidden_states.reshape(B, -1, hidden_states.size(-1))
+        return hidden_states
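
For reference (not part of the commit): a minimal sketch of what the vendored get_camera helper returns, assuming this file is importable as pipeline_imagedream; the argument values below are illustrative, not taken from the diff.

import torch

from pipeline_imagedream import get_camera  # hypothetical import path for this file

# Four views spaced 90 degrees apart at zero elevation, plus one zero-filled
# "extra" view appended because extra_view=True.
cameras = get_camera(num_frames=4, elevation=0, extra_view=True)

assert cameras.shape == torch.Size([5, 16])  # one flattened 4x4 camera-to-world matrix per row
assert cameras.dtype == torch.float32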
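Likewise, a sketch of wiring the new processors into a UNet; the checkpoint path is a placeholder, and forwarding num_views through cross_attention_kwargs is an assumption about the surrounding pipeline, not something this diff shows.

from diffusers import UNet2DConditionModel

from pipeline_imagedream import add_imagedream_attn_processor  # hypothetical import path

# "path/to/imagedream-checkpoint" is a placeholder, not a real repository id.
unet = UNet2DConditionModel.from_pretrained("path/to/imagedream-checkpoint", subfolder="unet")
unet = add_imagedream_attn_processor(unet)

# Only self-attention ("attn1") processors are replaced with ImageDreamAttnProcessor2_0;
# cross-attention processors are left untouched. At denoising time, num_views would
# presumably reach the processors via cross_attention_kwargs, e.g.:
#   unet(latents, t, encoder_hidden_states=embeds, cross_attention_kwargs={"num_views": 5})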