Spaces: Running on Zero
Commits:
- add header link to self for embedding
- add badge for diffusers tutorial
- bump to the latest diffusers

Files changed:
- app.py +27 -44
- marigold_iid_appearance.py +0 -561
- marigold_iid_lighting.py +0 -576
- requirements.txt +1 -1
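
The last commit replaces the two bundled single-file pipelines with the Marigold intrinsics support that ships in current diffusers, so the dependency change in requirements.txt is presumably just the diffusers pin (the exact version string is not visible in this diff). A quick environment check, as a sketch:

```python
# Sanity check (sketch): the updated app.py imports MarigoldIntrinsicsPipeline
# directly from diffusers, so the installed release must expose it. The exact
# version pinned in requirements.txt is not shown in this excerpt.
import diffusers

print("diffusers", diffusers.__version__)
assert hasattr(diffusers, "MarigoldIntrinsicsPipeline"), (
    "Installed diffusers is too old for this Space; bump to the latest release."
)
```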
app.py
CHANGED
@@ -30,22 +30,16 @@
 # --------------------------------------------------------------------------
 
 import os
-
-import numpy as np
-
 os.system("pip freeze")
 import spaces
 
 import gradio as gr
 import torch as torch
-from diffusers import DDIMScheduler
+from diffusers import MarigoldIntrinsicsPipeline, DDIMScheduler
 from gradio_dualvision import DualVisionApp
 from huggingface_hub import login
 from PIL import Image
 
-from marigold_iid_appearance import MarigoldIIDAppearancePipeline
-from marigold_iid_lighting import MarigoldIIDLightingPipeline
-
 CHECKPOINT_APPEARANCE = "prs-eth/marigold-iid-appearance-v1-1"
 CHECKPOINT_LIGHTING = "prs-eth/marigold-iid-lighting-v1-1"
 
@@ -55,19 +49,11 @@ if "HF_TOKEN_LOGIN" in os.environ:
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
 
-pipe_appearance = MarigoldIIDAppearancePipeline.from_pretrained(
-    CHECKPOINT_APPEARANCE
-)
-pipe_appearance.scheduler = DDIMScheduler.from_config(
-    pipe_appearance.scheduler.config, timestep_spacing="trailing"
-)
+pipe_appearance = MarigoldIntrinsicsPipeline.from_pretrained(CHECKPOINT_APPEARANCE)
+pipe_appearance.scheduler = DDIMScheduler.from_config(pipe_appearance.scheduler.config, timestep_spacing="trailing")
 pipe_appearance = pipe_appearance.to(device=device, dtype=dtype)
-pipe_lighting = MarigoldIIDLightingPipeline.from_pretrained(
-    CHECKPOINT_LIGHTING
-)
-pipe_lighting.scheduler = DDIMScheduler.from_config(
-    pipe_lighting.scheduler.config, timestep_spacing="trailing"
-)
+pipe_lighting = MarigoldIntrinsicsPipeline.from_pretrained(CHECKPOINT_LIGHTING)
+pipe_lighting.scheduler = DDIMScheduler.from_config(pipe_lighting.scheduler.config, timestep_spacing="trailing")
 pipe_lighting = pipe_lighting.to(device=device, dtype=dtype)
 try:
     import xformers
@@ -87,7 +73,7 @@ class MarigoldIIDApp(DualVisionApp):
     def make_header(self):
         gr.Markdown(
             """
-            ## Marigold Intrinsic Image Decomposition
+            ## [Marigold Intrinsic Image Decomposition](https://huggingface.co/spaces/prs-eth/marigold-intrinsics)
             """
         )
         with gr.Row(elem_classes="remove-elements"):
@@ -97,6 +83,9 @@ class MarigoldIIDApp(DualVisionApp):
                 <a title="Website" href="https://marigoldmonodepth.github.io/" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
                     <img src="https://img.shields.io/badge/%E2%99%A5%20Project%20-Website-blue">
                 </a>
+                <a title="diffusers" href="https://huggingface.co/docs/diffusers/using-diffusers/marigold_usage" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+                    <img src="https://img.shields.io/badge/%F0%9F%A7%A8%20Read_diffusers-tutorial-yellow?labelColor=green">
+                </a>
                 <a title="arXiv" href="https://arxiv.org/abs/2312.02145" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
                     <img src="https://img.shields.io/badge/%F0%9F%93%84%20Read%20-Paper-AF3436">
                 </a>
@@ -156,7 +145,7 @@ class MarigoldIIDApp(DualVisionApp):
         ensemble_size = kwargs.get("ensemble_size", self.DEFAULT_ENSEMBLE_SIZE)
         denoise_steps = kwargs.get("denoise_steps", self.DEFAULT_DENOISE_STEPS)
         processing_res = kwargs.get("processing_res", self.DEFAULT_PROCESSING_RES)
-
+        generator = torch.Generator(device=device).manual_seed(self.DEFAULT_SEED)
 
         pipe_out_appearance = pipe_appearance(
             image_in,
@@ -165,19 +154,12 @@ class MarigoldIIDApp(DualVisionApp):
             processing_resolution=processing_res,
             batch_size=1 if processing_res == 0 else 2,
             output_uncertainty=ensemble_size >= 3,
-
-            seed=self.DEFAULT_SEED,
+            generator=generator,
         )
 
-
-
-
-        roughness = Image.fromarray(roughness, mode="I;16")
-
-        metallicity = pipe_out_appearance.material[1].clip(-1, 1)
-        metallicity = (metallicity + 1.0) * 0.5
-        metallicity = (metallicity * 65535).astype(np.uint16)
-        metallicity = Image.fromarray(metallicity, mode="I;16")
+        iid_appearance_vis = pipe_appearance.image_processor.visualize_intrinsics(
+            pipe_out_appearance.prediction, pipe_appearance.target_properties
+        )
 
         pipe_out_lighting = pipe_lighting(
             image_in,
@@ -186,22 +168,23 @@ class MarigoldIIDApp(DualVisionApp):
             processing_resolution=processing_res,
             batch_size=1 if processing_res == 0 else 2,
             output_uncertainty=ensemble_size >= 3,
-
-
+            generator=generator,
+        )
+
+        iid_lighting_vis = pipe_lighting.image_processor.visualize_intrinsics(
+            pipe_out_lighting.prediction, pipe_lighting.target_properties
         )
 
         out_modalities = {
-            "Albedo":
-            "Materials":
-            "Roughness": roughness,
-            "Metallicity": metallicity,
-            "Albedo (HyperSim)":
-            "Shading (HyperSim)":
-            "Residual (HyperSim)":
+            "Albedo": iid_appearance_vis[0]["albedo"],
+            "Materials": iid_appearance_vis[0]["material"],
+            "Roughness": iid_appearance_vis[0]["roughness"],
+            "Metallicity": iid_appearance_vis[0]["metallicity"],
+            "Albedo (HyperSim)": iid_lighting_vis[0]["albedo"],
+            "Shading (HyperSim)": iid_lighting_vis[0]["shading"],
+            "Residual (HyperSim)": iid_lighting_vis[0]["residual"],
        }
-        #
-        # uncertainty = pipe.image_processor.visualize_uncertainty(pipe_out.uncertainty)[0]
-        # out_modalities["Uncertainty"] = uncertainty
+        # Additionally, uncertainty can be computed on any of the output modalities; we skip it to keep the demo light
 
         out_settings = {
             "ensemble_size": ensemble_size,
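
For context, the block below is a condensed, standalone sketch of the inference path the updated app.py relies on. It assumes a recent diffusers release that ships MarigoldIntrinsicsPipeline and the visualize_intrinsics helper referenced in the diff; the sample image URL and the argument values (num_inference_steps, ensemble_size, processing_resolution) are illustrative placeholders rather than the Space's defaults.

```python
# Standalone sketch of the diffusers-based inference path used by app.py above.
# Assumes a recent diffusers with MarigoldIntrinsicsPipeline; argument values and
# the example image are illustrative, not the Space's defaults.
import torch
from diffusers import DDIMScheduler, MarigoldIntrinsicsPipeline
from diffusers.utils import load_image

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

pipe = MarigoldIntrinsicsPipeline.from_pretrained("prs-eth/marigold-iid-appearance-v1-1")
# Same override as app.py: few-step DDIM sampling with "trailing" timestep spacing.
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")
pipe = pipe.to(device=device, dtype=dtype)

image = load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")  # hypothetical example input
generator = torch.Generator(device=device).manual_seed(2024)

out = pipe(
    image,
    num_inference_steps=4,
    ensemble_size=1,
    processing_resolution=768,
    generator=generator,
)

# visualize_intrinsics returns one dict of PIL images per input, keyed by modality
# (e.g. "albedo", "material"), which app.py feeds to the DualVision galleries.
vis = pipe.image_processor.visualize_intrinsics(out.prediction, pipe.target_properties)
vis[0]["albedo"].save("albedo.png")
```

Note that app.py reuses one generator for both pipeline calls, so the demo stays deterministic for a fixed DEFAULT_SEED.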
marigold_iid_appearance.py
DELETED
|
@@ -1,561 +0,0 @@
|
|
| 1 |
-
# Copyright 2024 Anton Obukhov, Bingxin Ke, Bo Li & Kevin Qu, ETH Zurich and The HuggingFace Team. All rights reserved.
|
| 2 |
-
#
|
| 3 |
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
-
# you may not use this file except in compliance with the License.
|
| 5 |
-
# You may obtain a copy of the License at
|
| 6 |
-
#
|
| 7 |
-
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
-
#
|
| 9 |
-
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
-
# See the License for the specific language governing permissions and
|
| 13 |
-
# limitations under the License.
|
| 14 |
-
# --------------------------------------------------------------------------
|
| 15 |
-
# If you find this code useful, we kindly ask you to cite our paper in your work.
|
| 16 |
-
# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
|
| 17 |
-
# More information about the method can be found at https://marigoldcomputervision.github.io
|
| 18 |
-
# --------------------------------------------------------------------------
|
| 19 |
-
import logging
|
| 20 |
-
import math
|
| 21 |
-
from typing import Optional, Tuple, Union, Dict, Any
|
| 22 |
-
|
| 23 |
-
import numpy as np
|
| 24 |
-
import torch
|
| 25 |
-
from diffusers import (
|
| 26 |
-
AutoencoderKL,
|
| 27 |
-
DDIMScheduler,
|
| 28 |
-
DiffusionPipeline,
|
| 29 |
-
UNet2DConditionModel,
|
| 30 |
-
)
|
| 31 |
-
from diffusers.utils import BaseOutput, check_min_version
|
| 32 |
-
from PIL import Image
|
| 33 |
-
from PIL.Image import Resampling
|
| 34 |
-
from torch.utils.data import DataLoader, TensorDataset
|
| 35 |
-
from tqdm.auto import tqdm
|
| 36 |
-
from transformers import CLIPTextModel, CLIPTokenizer
|
| 37 |
-
|
| 38 |
-
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
| 39 |
-
check_min_version("0.27.0.dev0")
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
class MarigoldIIDAppearanceOutput(BaseOutput):
|
| 43 |
-
"""
|
| 44 |
-
Output class for Marigold IID Appearance pipeline.
|
| 45 |
-
|
| 46 |
-
Args:
|
| 47 |
-
albedo (`np.ndarray`):
|
| 48 |
-
Predicted albedo map with the shape of [3, H, W] values in the range of [0, 1].
|
| 49 |
-
albedo_colored (`PIL.Image.Image`):
|
| 50 |
-
Colorized albedo map with the shape of [H, W, 3].
|
| 51 |
-
material (`np.ndarray`):
|
| 52 |
-
Predicted material map with the shape of [3, H, W] and values in [0, 1].
|
| 53 |
-
1st channel (Red) is roughness
|
| 54 |
-
2nd channel (Green) is metallicity
|
| 55 |
-
3rd channel (Blue) is empty (zero)
|
| 56 |
-
material_colored (`PIL.Image.Image`):
|
| 57 |
-
Colorized material map with the shape of [H, W, 3].
|
| 58 |
-
1st channel (Red) is roughness
|
| 59 |
-
2nd channel (Green) is metallicity
|
| 60 |
-
3rd channel (Blue) is empty (zero)
|
| 61 |
-
"""
|
| 62 |
-
|
| 63 |
-
albedo: np.ndarray
|
| 64 |
-
albedo_colored: Image.Image
|
| 65 |
-
material: np.ndarray
|
| 66 |
-
material_colored: Image.Image
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
class MarigoldIIDAppearancePipeline(DiffusionPipeline):
|
| 70 |
-
"""
|
| 71 |
-
Pipeline for Intrinsic Image Decomposition (Albedo and Material) using Marigold: https://marigoldcomputervision.github.io.
|
| 72 |
-
|
| 73 |
-
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
|
| 74 |
-
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
|
| 75 |
-
|
| 76 |
-
Args:
|
| 77 |
-
unet (`UNet2DConditionModel`):
|
| 78 |
-
Conditional U-Net to denoise the normals latent, conditioned on image latent.
|
| 79 |
-
vae (`AutoencoderKL`):
|
| 80 |
-
Variational Auto-Encoder (VAE) Model to encode and decode images and normals maps
|
| 81 |
-
to and from latent representations.
|
| 82 |
-
scheduler (`DDIMScheduler`):
|
| 83 |
-
A scheduler to be used in combination with `unet` to denoise the encoded image latents.
|
| 84 |
-
text_encoder (`CLIPTextModel`):
|
| 85 |
-
Text-encoder, for empty text embedding.
|
| 86 |
-
tokenizer (`CLIPTokenizer`):
|
| 87 |
-
CLIP tokenizer.
|
| 88 |
-
"""
|
| 89 |
-
|
| 90 |
-
latent_scale_factor = 0.18215
|
| 91 |
-
|
| 92 |
-
def __init__(
|
| 93 |
-
self,
|
| 94 |
-
unet: UNet2DConditionModel,
|
| 95 |
-
vae: AutoencoderKL,
|
| 96 |
-
scheduler: DDIMScheduler,
|
| 97 |
-
text_encoder: CLIPTextModel,
|
| 98 |
-
tokenizer: CLIPTokenizer,
|
| 99 |
-
prediction_type: Optional[str] = None,
|
| 100 |
-
target_properties: Optional[Dict[str, Any]] = None,
|
| 101 |
-
default_denoising_steps: Optional[int] = None,
|
| 102 |
-
default_processing_resolution: Optional[int] = None,
|
| 103 |
-
):
|
| 104 |
-
super().__init__()
|
| 105 |
-
|
| 106 |
-
self.register_modules(
|
| 107 |
-
unet=unet,
|
| 108 |
-
vae=vae,
|
| 109 |
-
scheduler=scheduler,
|
| 110 |
-
text_encoder=text_encoder,
|
| 111 |
-
tokenizer=tokenizer,
|
| 112 |
-
)
|
| 113 |
-
self.register_to_config(
|
| 114 |
-
prediction_type=prediction_type,
|
| 115 |
-
target_properties=target_properties,
|
| 116 |
-
default_denoising_steps=default_denoising_steps,
|
| 117 |
-
default_processing_resolution=default_processing_resolution,
|
| 118 |
-
)
|
| 119 |
-
|
| 120 |
-
self.empty_text_embed = None
|
| 121 |
-
|
| 122 |
-
self.n_targets = 2 # Albedo and material
|
| 123 |
-
|
| 124 |
-
@torch.no_grad()
|
| 125 |
-
def __call__(
|
| 126 |
-
self,
|
| 127 |
-
input_image: Image,
|
| 128 |
-
denoising_steps: int = 4,
|
| 129 |
-
ensemble_size: int = 10,
|
| 130 |
-
processing_res: int = 768,
|
| 131 |
-
match_input_res: bool = True,
|
| 132 |
-
resample_method: str = "bilinear",
|
| 133 |
-
batch_size: int = 0,
|
| 134 |
-
save_memory: bool = False,
|
| 135 |
-
seed: Union[int, None] = None,
|
| 136 |
-
color_map: str = "Spectral", # TODO change colorization api based on modality
|
| 137 |
-
show_progress_bar: bool = True,
|
| 138 |
-
**kwargs,
|
| 139 |
-
) -> MarigoldIIDAppearanceOutput:
|
| 140 |
-
"""
|
| 141 |
-
Function invoked when calling the pipeline.
|
| 142 |
-
|
| 143 |
-
Args:
|
| 144 |
-
input_image (`Image`):
|
| 145 |
-
Input RGB (or gray-scale) image.
|
| 146 |
-
denoising_steps (`int`, *optional*, defaults to `10`):
|
| 147 |
-
Number of diffusion denoising steps (DDIM) during inference.
|
| 148 |
-
ensemble_size (`int`, *optional*, defaults to `10`):
|
| 149 |
-
Number of predictions to be ensembled.
|
| 150 |
-
processing_res (`int`, *optional*, defaults to `768`):
|
| 151 |
-
Maximum resolution of processing.
|
| 152 |
-
If set to 0: will not resize at all.
|
| 153 |
-
match_input_res (`bool`, *optional*, defaults to `True`):
|
| 154 |
-
Resize normals prediction to match input resolution.
|
| 155 |
-
Only valid if `limit_input_res` is not None.
|
| 156 |
-
resample_method: (`str`, *optional*, defaults to `bilinear`):
|
| 157 |
-
Resampling method used to resize images and depth predictions. This can be one of `bilinear`, `bicubic` or `nearest`, defaults to: `bilinear`.
|
| 158 |
-
batch_size (`int`, *optional*, defaults to `0`):
|
| 159 |
-
Inference batch size, no bigger than `num_ensemble`.
|
| 160 |
-
If set to 0, the script will automatically decide the proper batch size.
|
| 161 |
-
save_memory (`bool`, defaults to `False`):
|
| 162 |
-
Extra steps to save memory at the cost of perforance.
|
| 163 |
-
seed (`int`, *optional*, defaults to `None`)
|
| 164 |
-
Reproducibility seed.
|
| 165 |
-
color_map (`str`, *optional*, defaults to `"Spectral"`, pass `None` to skip colorized normals map generation):
|
| 166 |
-
Colormap used to colorize the normals map.
|
| 167 |
-
show_progress_bar (`bool`, *optional*, defaults to `True`):
|
| 168 |
-
Display a progress bar of diffusion denoising.
|
| 169 |
-
Returns:
|
| 170 |
-
`MarigoldIIDAppearanceOutput`: Output class for Marigold monocular intrinsic image decomposition (appearance) prediction pipeline, including:
|
| 171 |
-
- **albedo** (`np.ndarray`) Predicted albedo map with the shape of [3, H, W] values in the range of [0, 1]
|
| 172 |
-
- **albedo_colored** (`PIL.Image.Image`) Colorized albedo map with the shape of [3, H, W] values in the range of [0, 1]
|
| 173 |
-
- **material** (`np.ndarray`) Predicted material map with the shape of [3, H, W] and values in [0, 1]
|
| 174 |
-
- **material_colored** (`PIL.Image.Image`) Colorized material map with the shape of [3, H, W] and values in [0, 1]
|
| 175 |
-
"""
|
| 176 |
-
|
| 177 |
-
if not match_input_res:
|
| 178 |
-
assert processing_res is not None
|
| 179 |
-
assert processing_res >= 0
|
| 180 |
-
assert denoising_steps >= 1
|
| 181 |
-
assert ensemble_size >= 1
|
| 182 |
-
|
| 183 |
-
# Check if denoising step is reasonable
|
| 184 |
-
self.check_inference_step(denoising_steps)
|
| 185 |
-
|
| 186 |
-
resample_method: Resampling = self.get_pil_resample_method(resample_method)
|
| 187 |
-
|
| 188 |
-
W, H = input_image.size
|
| 189 |
-
|
| 190 |
-
if processing_res > 0:
|
| 191 |
-
input_image = self.resize_max_res(
|
| 192 |
-
input_image,
|
| 193 |
-
max_edge_resolution=processing_res,
|
| 194 |
-
resample_method=resample_method,
|
| 195 |
-
)
|
| 196 |
-
input_image = input_image.convert("RGB")
|
| 197 |
-
image = np.asarray(input_image)
|
| 198 |
-
|
| 199 |
-
rgb = np.transpose(image, (2, 0, 1)) # [H, W, rgb] -> [rgb, H, W]
|
| 200 |
-
rgb_norm = rgb / 255.0 * 2.0 - 1.0 # [0, 255] -> [-1, 1]
|
| 201 |
-
rgb_norm = torch.from_numpy(rgb_norm).to(self.dtype)
|
| 202 |
-
rgb_norm = rgb_norm.to(self.device)
|
| 203 |
-
assert rgb_norm.min() >= -1.0 and rgb_norm.max() <= 1.0 # TODO remove this
|
| 204 |
-
|
| 205 |
-
def ensemble(
|
| 206 |
-
targets: torch.Tensor,
|
| 207 |
-
return_uncertainty: bool = False,
|
| 208 |
-
reduction="median",
|
| 209 |
-
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
|
| 210 |
-
uncertainty = None
|
| 211 |
-
if reduction == "mean":
|
| 212 |
-
prediction = torch.mean(targets, dim=0, keepdim=True)
|
| 213 |
-
if return_uncertainty:
|
| 214 |
-
uncertainty = torch.std(targets, dim=0, keepdim=True)
|
| 215 |
-
elif reduction == "median":
|
| 216 |
-
prediction = torch.median(targets, dim=0, keepdim=True).values
|
| 217 |
-
if return_uncertainty:
|
| 218 |
-
uncertainty = torch.median(
|
| 219 |
-
torch.abs(targets - prediction), dim=0, keepdim=True
|
| 220 |
-
).values
|
| 221 |
-
else:
|
| 222 |
-
raise ValueError(f"Unrecognized reduction method: {reduction}.")
|
| 223 |
-
return prediction, uncertainty
|
| 224 |
-
|
| 225 |
-
duplicated_rgb = torch.stack([rgb_norm] * ensemble_size)
|
| 226 |
-
single_rgb_dataset = TensorDataset(duplicated_rgb)
|
| 227 |
-
|
| 228 |
-
if batch_size <= 0:
|
| 229 |
-
batch_size = self.find_batch_size(
|
| 230 |
-
ensemble_size=ensemble_size,
|
| 231 |
-
input_res=max(rgb_norm.shape[1:]),
|
| 232 |
-
dtype=self.dtype,
|
| 233 |
-
)
|
| 234 |
-
|
| 235 |
-
single_rgb_loader = DataLoader(
|
| 236 |
-
single_rgb_dataset, batch_size=batch_size, shuffle=False
|
| 237 |
-
)
|
| 238 |
-
|
| 239 |
-
target_pred_ls = []
|
| 240 |
-
iterable = single_rgb_loader
|
| 241 |
-
if show_progress_bar:
|
| 242 |
-
iterable = tqdm(
|
| 243 |
-
single_rgb_loader, desc=" " * 2 + "Inference batches", leave=False
|
| 244 |
-
)
|
| 245 |
-
|
| 246 |
-
for batch in iterable:
|
| 247 |
-
(batched_img,) = batch
|
| 248 |
-
target_pred = self.single_infer(
|
| 249 |
-
rgb_in=batched_img,
|
| 250 |
-
num_inference_steps=denoising_steps,
|
| 251 |
-
seed=seed,
|
| 252 |
-
show_pbar=show_progress_bar,
|
| 253 |
-
)
|
| 254 |
-
target_pred = target_pred.detach()
|
| 255 |
-
if save_memory:
|
| 256 |
-
target_pred = target_pred.cpu()
|
| 257 |
-
target_pred_ls.append(target_pred.detach())
|
| 258 |
-
|
| 259 |
-
target_preds = torch.concat(target_pred_ls, dim=0)
|
| 260 |
-
pred_uncert = None
|
| 261 |
-
|
| 262 |
-
if save_memory:
|
| 263 |
-
torch.cuda.empty_cache()
|
| 264 |
-
|
| 265 |
-
if ensemble_size > 1:
|
| 266 |
-
final_pred, pred_uncert = ensemble(
|
| 267 |
-
target_preds, reduction="median", return_uncertainty=False
|
| 268 |
-
)
|
| 269 |
-
else:
|
| 270 |
-
final_pred = target_preds
|
| 271 |
-
pred_uncert = None
|
| 272 |
-
|
| 273 |
-
if match_input_res:
|
| 274 |
-
final_pred = torch.nn.functional.interpolate(
|
| 275 |
-
final_pred, (H, W), mode="bilinear" # TODO: parameterize this method
|
| 276 |
-
) # [1,3,H,W]
|
| 277 |
-
|
| 278 |
-
if pred_uncert is not None:
|
| 279 |
-
pred_uncert = torch.nn.functional.interpolate(
|
| 280 |
-
pred_uncert.unsqueeze(1), (H, W), mode="bilinear"
|
| 281 |
-
).squeeze(
|
| 282 |
-
1
|
| 283 |
-
) # [1,H,W]
|
| 284 |
-
|
| 285 |
-
# Convert to numpy
|
| 286 |
-
final_pred = final_pred.squeeze()
|
| 287 |
-
final_pred = final_pred.cpu().float().numpy()
|
| 288 |
-
|
| 289 |
-
albedo = final_pred[0:3, :, :]
|
| 290 |
-
material = np.stack(
|
| 291 |
-
(final_pred[3, :, :], final_pred[4, :, :], final_pred[5, :, :]), axis=0
|
| 292 |
-
)
|
| 293 |
-
|
| 294 |
-
albedo_colored = (albedo + 1.0) * 0.5
|
| 295 |
-
albedo_colored = (albedo_colored * 255).astype(np.uint8)
|
| 296 |
-
albedo_colored = self.chw2hwc(albedo_colored)
|
| 297 |
-
albedo_colored_img = Image.fromarray(albedo_colored)
|
| 298 |
-
|
| 299 |
-
material_colored = (material + 1.0) * 0.5
|
| 300 |
-
material_colored = (material_colored * 255).astype(np.uint8)
|
| 301 |
-
material_colored = self.chw2hwc(material_colored)
|
| 302 |
-
material_colored_img = Image.fromarray(material_colored)
|
| 303 |
-
|
| 304 |
-
out = MarigoldIIDAppearanceOutput(
|
| 305 |
-
albedo=albedo,
|
| 306 |
-
albedo_colored=albedo_colored_img,
|
| 307 |
-
material=material,
|
| 308 |
-
material_colored=material_colored_img,
|
| 309 |
-
)
|
| 310 |
-
|
| 311 |
-
return out
|
| 312 |
-
|
| 313 |
-
def check_inference_step(self, n_step: int):
|
| 314 |
-
"""
|
| 315 |
-
Check if denoising step is reasonable
|
| 316 |
-
Args:
|
| 317 |
-
n_step (`int`): denoising steps
|
| 318 |
-
"""
|
| 319 |
-
assert n_step >= 1
|
| 320 |
-
|
| 321 |
-
if isinstance(self.scheduler, DDIMScheduler):
|
| 322 |
-
pass
|
| 323 |
-
else:
|
| 324 |
-
raise RuntimeError(f"Unsupported scheduler type: {type(self.scheduler)}")
|
| 325 |
-
|
| 326 |
-
def encode_empty_text(self):
|
| 327 |
-
"""
|
| 328 |
-
Encode text embedding for empty prompt.
|
| 329 |
-
"""
|
| 330 |
-
prompt = ""
|
| 331 |
-
text_inputs = self.tokenizer(
|
| 332 |
-
prompt,
|
| 333 |
-
padding="do_not_pad",
|
| 334 |
-
max_length=self.tokenizer.model_max_length,
|
| 335 |
-
truncation=True,
|
| 336 |
-
return_tensors="pt",
|
| 337 |
-
)
|
| 338 |
-
text_input_ids = text_inputs.input_ids.to(self.text_encoder.device)
|
| 339 |
-
self.empty_text_embed = self.text_encoder(text_input_ids)[0].to(self.dtype)
|
| 340 |
-
|
| 341 |
-
@torch.no_grad()
|
| 342 |
-
def single_infer(
|
| 343 |
-
self,
|
| 344 |
-
rgb_in: torch.Tensor,
|
| 345 |
-
num_inference_steps: int,
|
| 346 |
-
seed: Union[int, None],
|
| 347 |
-
show_pbar: bool,
|
| 348 |
-
) -> torch.Tensor:
|
| 349 |
-
"""
|
| 350 |
-
Perform an individual iid prediction without ensembling.
|
| 351 |
-
"""
|
| 352 |
-
device = rgb_in.device
|
| 353 |
-
|
| 354 |
-
# Set timesteps
|
| 355 |
-
self.scheduler.set_timesteps(num_inference_steps, device=device)
|
| 356 |
-
timesteps = self.scheduler.timesteps # [T]
|
| 357 |
-
|
| 358 |
-
# Encode image
|
| 359 |
-
rgb_latent = self.encode_rgb(rgb_in)
|
| 360 |
-
|
| 361 |
-
target_latent_shape = list(rgb_latent.shape)
|
| 362 |
-
target_latent_shape[
|
| 363 |
-
1
|
| 364 |
-
] *= 2 # TODO: no hardcoding # self.n_targets # (B, 4*n_targets, h, w)
|
| 365 |
-
|
| 366 |
-
# Initialize prediction latent with noise
|
| 367 |
-
if seed is None:
|
| 368 |
-
rand_num_generator = None
|
| 369 |
-
else:
|
| 370 |
-
rand_num_generator = torch.Generator(device=device)
|
| 371 |
-
rand_num_generator.manual_seed(seed)
|
| 372 |
-
target_latents = torch.randn(
|
| 373 |
-
target_latent_shape,
|
| 374 |
-
device=device,
|
| 375 |
-
dtype=self.dtype,
|
| 376 |
-
generator=rand_num_generator,
|
| 377 |
-
) # [B, 4, h, w]
|
| 378 |
-
|
| 379 |
-
# Batched empty text embedding
|
| 380 |
-
if self.empty_text_embed is None:
|
| 381 |
-
self.encode_empty_text()
|
| 382 |
-
batch_empty_text_embed = self.empty_text_embed.repeat(
|
| 383 |
-
(rgb_latent.shape[0], 1, 1)
|
| 384 |
-
) # [B, 2, 1024]
|
| 385 |
-
|
| 386 |
-
# Denoising loop
|
| 387 |
-
if show_pbar:
|
| 388 |
-
iterable = tqdm(
|
| 389 |
-
enumerate(timesteps),
|
| 390 |
-
total=len(timesteps),
|
| 391 |
-
leave=False,
|
| 392 |
-
desc=" " * 4 + "Diffusion denoising",
|
| 393 |
-
)
|
| 394 |
-
else:
|
| 395 |
-
iterable = enumerate(timesteps)
|
| 396 |
-
|
| 397 |
-
for i, t in iterable:
|
| 398 |
-
unet_input = torch.cat(
|
| 399 |
-
[rgb_latent, target_latents], dim=1
|
| 400 |
-
) # this order is important
|
| 401 |
-
|
| 402 |
-
# predict the noise residual
|
| 403 |
-
noise_pred = self.unet(
|
| 404 |
-
unet_input, t, encoder_hidden_states=batch_empty_text_embed
|
| 405 |
-
).sample # [B, 4, h, w]
|
| 406 |
-
|
| 407 |
-
# compute the previous noisy sample x_t -> x_t-1
|
| 408 |
-
target_latents = self.scheduler.step(
|
| 409 |
-
noise_pred, t, target_latents, generator=rand_num_generator
|
| 410 |
-
).prev_sample
|
| 411 |
-
|
| 412 |
-
# torch.cuda.empty_cache() # TODO is it really needed here, even if memory saving?
|
| 413 |
-
|
| 414 |
-
targets = self.decode_targets(target_latents) # [B, 3, H, W]
|
| 415 |
-
targets = torch.clip(targets, -1.0, 1.0)
|
| 416 |
-
|
| 417 |
-
return targets
|
| 418 |
-
|
| 419 |
-
def encode_rgb(self, rgb_in: torch.Tensor) -> torch.Tensor:
|
| 420 |
-
"""
|
| 421 |
-
Encode RGB image into latent.
|
| 422 |
-
|
| 423 |
-
Args:
|
| 424 |
-
rgb_in (`torch.Tensor`):
|
| 425 |
-
Input RGB image to be encoded.
|
| 426 |
-
|
| 427 |
-
Returns:
|
| 428 |
-
`torch.Tensor`: Image latent.
|
| 429 |
-
"""
|
| 430 |
-
# encode
|
| 431 |
-
h = self.vae.encoder(rgb_in)
|
| 432 |
-
moments = self.vae.quant_conv(h)
|
| 433 |
-
mean, logvar = torch.chunk(moments, 2, dim=1)
|
| 434 |
-
# scale latent
|
| 435 |
-
rgb_latent = mean * self.latent_scale_factor
|
| 436 |
-
return rgb_latent
|
| 437 |
-
|
| 438 |
-
def decode_targets(self, target_latents: torch.Tensor) -> torch.Tensor:
|
| 439 |
-
"""
|
| 440 |
-
Decode target latent into target map.
|
| 441 |
-
|
| 442 |
-
Args:
|
| 443 |
-
target_latents (`torch.Tensor`):
|
| 444 |
-
Target latent to be decoded.
|
| 445 |
-
|
| 446 |
-
Returns:
|
| 447 |
-
`torch.Tensor`: Decoded target map.
|
| 448 |
-
"""
|
| 449 |
-
|
| 450 |
-
assert target_latents.shape[1] == 8 # self.n_targets * 4
|
| 451 |
-
|
| 452 |
-
# scale latent
|
| 453 |
-
target_latents = target_latents / self.latent_scale_factor
|
| 454 |
-
# decode
|
| 455 |
-
targets = []
|
| 456 |
-
for i in range(self.n_targets):
|
| 457 |
-
latent = target_latents[:, i * 4 : (i + 1) * 4, :, :]
|
| 458 |
-
z = self.vae.post_quant_conv(latent)
|
| 459 |
-
stacked = self.vae.decoder(z)
|
| 460 |
-
|
| 461 |
-
targets.append(stacked)
|
| 462 |
-
|
| 463 |
-
return torch.cat(targets, dim=1)
|
| 464 |
-
|
| 465 |
-
@staticmethod
|
| 466 |
-
def get_pil_resample_method(method_str: str) -> Resampling:
|
| 467 |
-
resample_method_dic = {
|
| 468 |
-
"bilinear": Resampling.BILINEAR,
|
| 469 |
-
"bicubic": Resampling.BICUBIC,
|
| 470 |
-
"nearest": Resampling.NEAREST,
|
| 471 |
-
}
|
| 472 |
-
resample_method = resample_method_dic.get(method_str, None)
|
| 473 |
-
if resample_method is None:
|
| 474 |
-
raise ValueError(f"Unknown resampling method: {resample_method}")
|
| 475 |
-
else:
|
| 476 |
-
return resample_method
|
| 477 |
-
|
| 478 |
-
@staticmethod
|
| 479 |
-
def resize_max_res(
|
| 480 |
-
img: Image.Image, max_edge_resolution: int, resample_method=Resampling.BILINEAR
|
| 481 |
-
) -> Image.Image:
|
| 482 |
-
"""
|
| 483 |
-
Resize image to limit maximum edge length while keeping aspect ratio.
|
| 484 |
-
"""
|
| 485 |
-
original_width, original_height = img.size
|
| 486 |
-
downscale_factor = min(
|
| 487 |
-
max_edge_resolution / original_width, max_edge_resolution / original_height
|
| 488 |
-
)
|
| 489 |
-
|
| 490 |
-
new_width = int(original_width * downscale_factor)
|
| 491 |
-
new_height = int(original_height * downscale_factor)
|
| 492 |
-
|
| 493 |
-
resized_img = img.resize((new_width, new_height), resample=resample_method)
|
| 494 |
-
return resized_img
|
| 495 |
-
|
| 496 |
-
@staticmethod
|
| 497 |
-
def chw2hwc(chw):
|
| 498 |
-
assert 3 == len(chw.shape)
|
| 499 |
-
if isinstance(chw, torch.Tensor):
|
| 500 |
-
hwc = torch.permute(chw, (1, 2, 0))
|
| 501 |
-
elif isinstance(chw, np.ndarray):
|
| 502 |
-
hwc = np.moveaxis(chw, 0, -1)
|
| 503 |
-
return hwc
|
| 504 |
-
|
| 505 |
-
@staticmethod
|
| 506 |
-
def find_batch_size(ensemble_size: int, input_res: int, dtype: torch.dtype) -> int:
|
| 507 |
-
"""
|
| 508 |
-
Automatically search for suitable operating batch size.
|
| 509 |
-
|
| 510 |
-
Args:
|
| 511 |
-
ensemble_size (`int`):
|
| 512 |
-
Number of predictions to be ensembled.
|
| 513 |
-
input_res (`int`):
|
| 514 |
-
Operating resolution of the input image.
|
| 515 |
-
|
| 516 |
-
Returns:
|
| 517 |
-
`int`: Operating batch size.
|
| 518 |
-
"""
|
| 519 |
-
# Search table for suggested max. inference batch size
|
| 520 |
-
bs_search_table = [
|
| 521 |
-
# tested on A100-PCIE-80GB
|
| 522 |
-
{"res": 768, "total_vram": 79, "bs": 35, "dtype": torch.float32},
|
| 523 |
-
{"res": 1024, "total_vram": 79, "bs": 20, "dtype": torch.float32},
|
| 524 |
-
# tested on A100-PCIE-40GB
|
| 525 |
-
{"res": 768, "total_vram": 39, "bs": 15, "dtype": torch.float32},
|
| 526 |
-
{"res": 1024, "total_vram": 39, "bs": 8, "dtype": torch.float32},
|
| 527 |
-
{"res": 768, "total_vram": 39, "bs": 30, "dtype": torch.float16},
|
| 528 |
-
{"res": 1024, "total_vram": 39, "bs": 15, "dtype": torch.float16},
|
| 529 |
-
# tested on RTX3090, RTX4090
|
| 530 |
-
{"res": 512, "total_vram": 23, "bs": 20, "dtype": torch.float32},
|
| 531 |
-
{"res": 768, "total_vram": 23, "bs": 7, "dtype": torch.float32},
|
| 532 |
-
{"res": 1024, "total_vram": 23, "bs": 3, "dtype": torch.float32},
|
| 533 |
-
{"res": 512, "total_vram": 23, "bs": 40, "dtype": torch.float16},
|
| 534 |
-
{"res": 768, "total_vram": 23, "bs": 18, "dtype": torch.float16},
|
| 535 |
-
{"res": 1024, "total_vram": 23, "bs": 10, "dtype": torch.float16},
|
| 536 |
-
# tested on GTX1080Ti
|
| 537 |
-
{"res": 512, "total_vram": 10, "bs": 5, "dtype": torch.float32},
|
| 538 |
-
{"res": 768, "total_vram": 10, "bs": 2, "dtype": torch.float32},
|
| 539 |
-
{"res": 512, "total_vram": 10, "bs": 10, "dtype": torch.float16},
|
| 540 |
-
{"res": 768, "total_vram": 10, "bs": 5, "dtype": torch.float16},
|
| 541 |
-
{"res": 1024, "total_vram": 10, "bs": 3, "dtype": torch.float16},
|
| 542 |
-
]
|
| 543 |
-
|
| 544 |
-
if not torch.cuda.is_available():
|
| 545 |
-
return 1
|
| 546 |
-
|
| 547 |
-
total_vram = torch.cuda.mem_get_info()[1] / 1024.0**3
|
| 548 |
-
filtered_bs_search_table = [s for s in bs_search_table if s["dtype"] == dtype]
|
| 549 |
-
for settings in sorted(
|
| 550 |
-
filtered_bs_search_table,
|
| 551 |
-
key=lambda k: (k["res"], -k["total_vram"]),
|
| 552 |
-
):
|
| 553 |
-
if input_res <= settings["res"] and total_vram >= settings["total_vram"]:
|
| 554 |
-
bs = settings["bs"]
|
| 555 |
-
if bs > ensemble_size:
|
| 556 |
-
bs = ensemble_size
|
| 557 |
-
elif bs > math.ceil(ensemble_size / 2) and bs < ensemble_size:
|
| 558 |
-
bs = math.ceil(ensemble_size / 2)
|
| 559 |
-
return bs
|
| 560 |
-
|
| 561 |
-
return 1
|
marigold_iid_lighting.py
DELETED
|
@@ -1,576 +0,0 @@
|
|
| 1 |
-
# Copyright 2024 Anton Obukhov, Bingxin Ke & Kevin Qu, ETH Zurich and The HuggingFace Team. All rights reserved.
|
| 2 |
-
#
|
| 3 |
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
-
# you may not use this file except in compliance with the License.
|
| 5 |
-
# You may obtain a copy of the License at
|
| 6 |
-
#
|
| 7 |
-
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
-
#
|
| 9 |
-
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
-
# See the License for the specific language governing permissions and
|
| 13 |
-
# limitations under the License.
|
| 14 |
-
# --------------------------------------------------------------------------
|
| 15 |
-
# If you find this code useful, we kindly ask you to cite our paper in your work.
|
| 16 |
-
# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
|
| 17 |
-
# More information about the method can be found at https://marigoldcomputervision.github.io
|
| 18 |
-
# --------------------------------------------------------------------------
|
| 19 |
-
import logging
|
| 20 |
-
import math
|
| 21 |
-
from typing import Optional, Tuple, Union, Dict, Any
|
| 22 |
-
|
| 23 |
-
import numpy as np
|
| 24 |
-
import torch
|
| 25 |
-
from diffusers import (
|
| 26 |
-
AutoencoderKL,
|
| 27 |
-
DDIMScheduler,
|
| 28 |
-
DiffusionPipeline,
|
| 29 |
-
UNet2DConditionModel,
|
| 30 |
-
)
|
| 31 |
-
from diffusers.utils import BaseOutput, check_min_version
|
| 32 |
-
from PIL import Image
|
| 33 |
-
from PIL.Image import Resampling
|
| 34 |
-
from torch.utils.data import DataLoader, TensorDataset
|
| 35 |
-
from tqdm.auto import tqdm
|
| 36 |
-
from transformers import CLIPTextModel, CLIPTokenizer
|
| 37 |
-
|
| 38 |
-
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
|
| 39 |
-
check_min_version("0.27.0.dev0")
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
class MarigoldIIDLightingOutput(BaseOutput):
|
| 43 |
-
"""
|
| 44 |
-
Output class for Marigold-IID-Lighting pipeline.
|
| 45 |
-
|
| 46 |
-
Args:
|
| 47 |
-
albedo (`np.ndarray`):
|
| 48 |
-
Predicted albedo map with the shape of [3, H, W] values in the range of [0, 1].
|
| 49 |
-
albedo_colored (`PIL.Image.Image`):
|
| 50 |
-
Colorized albedo map with the shape of [H, W, 3].
|
| 51 |
-
shading (`np.ndarray`):
|
| 52 |
-
Predicted diffuse shading map with the shape of [3, H, W] values in the range of [0, 1].
|
| 53 |
-
shading_colored (`PIL.Image.Image`):
|
| 54 |
-
Colorized diffuse shading map with the shape of [H, W, 3].
|
| 55 |
-
residual (`np.ndarray`):
|
| 56 |
-
Predicted non-diffuse residual map with the shape of [3, H, W] values in the range of [0, 1].
|
| 57 |
-
residual_colored (`PIL.Image.Image`):
|
| 58 |
-
Colorized non-diffuse residual map with the shape of [H, W, 3].
|
| 59 |
-
|
| 60 |
-
"""
|
| 61 |
-
|
| 62 |
-
albedo: np.ndarray
|
| 63 |
-
albedo_colored: Image.Image
|
| 64 |
-
shading: np.ndarray
|
| 65 |
-
shading_colored: Image.Image
|
| 66 |
-
residual: np.ndarray
|
| 67 |
-
residual_colored: Image.Image
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
class MarigoldIIDLightingPipeline(DiffusionPipeline):
|
| 71 |
-
"""
|
| 72 |
-
Pipeline for Intrinsic Image Decomposition (Albedo, diffuse shading and non-diffuse residual) using Marigold: https://marigoldcomputervision.github.io.
|
| 73 |
-
|
| 74 |
-
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
|
| 75 |
-
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
|
| 76 |
-
|
| 77 |
-
Args:
|
| 78 |
-
unet (`UNet2DConditionModel`):
|
| 79 |
-
Conditional U-Net to denoise the normals latent, conditioned on image latent.
|
| 80 |
-
vae (`AutoencoderKL`):
|
| 81 |
-
Variational Auto-Encoder (VAE) Model to encode and decode images and normals maps
|
| 82 |
-
to and from latent representations.
|
| 83 |
-
scheduler (`DDIMScheduler`):
|
| 84 |
-
A scheduler to be used in combination with `unet` to denoise the encoded image latents.
|
| 85 |
-
text_encoder (`CLIPTextModel`):
|
| 86 |
-
Text-encoder, for empty text embedding.
|
| 87 |
-
tokenizer (`CLIPTokenizer`):
|
| 88 |
-
CLIP tokenizer.
|
| 89 |
-
"""
|
| 90 |
-
|
| 91 |
-
latent_scale_factor = 0.18215
|
| 92 |
-
|
| 93 |
-
def __init__(
|
| 94 |
-
self,
|
| 95 |
-
unet: UNet2DConditionModel,
|
| 96 |
-
vae: AutoencoderKL,
|
| 97 |
-
scheduler: DDIMScheduler,
|
| 98 |
-
text_encoder: CLIPTextModel,
|
| 99 |
-
tokenizer: CLIPTokenizer,
|
| 100 |
-
prediction_type: Optional[str] = None,
|
| 101 |
-
target_properties: Optional[Dict[str, Any]] = None,
|
| 102 |
-
default_denoising_steps: Optional[int] = None,
|
| 103 |
-
default_processing_resolution: Optional[int] = None,
|
| 104 |
-
):
|
| 105 |
-
super().__init__()
|
| 106 |
-
|
| 107 |
-
self.register_modules(
|
| 108 |
-
unet=unet,
|
| 109 |
-
vae=vae,
|
| 110 |
-
scheduler=scheduler,
|
| 111 |
-
text_encoder=text_encoder,
|
| 112 |
-
tokenizer=tokenizer,
|
| 113 |
-
)
|
| 114 |
-
self.register_to_config(
|
| 115 |
-
prediction_type=prediction_type,
|
| 116 |
-
target_properties=target_properties,
|
| 117 |
-
default_denoising_steps=default_denoising_steps,
|
| 118 |
-
default_processing_resolution=default_processing_resolution,
|
| 119 |
-
)
|
| 120 |
-
|
| 121 |
-
self.empty_text_embed = None
|
| 122 |
-
self.n_targets = 3 # Albedo, shading, residual
|
| 123 |
-
|
| 124 |
-
@torch.no_grad()
|
| 125 |
-
def __call__(
|
| 126 |
-
self,
|
| 127 |
-
input_image: Image,
|
| 128 |
-
denoising_steps: int = 4,
|
| 129 |
-
ensemble_size: int = 10,
|
| 130 |
-
processing_res: int = 768,
|
| 131 |
-
match_input_res: bool = True,
|
| 132 |
-
resample_method: str = "bilinear",
|
| 133 |
-
batch_size: int = 0,
|
| 134 |
-
save_memory: bool = False,
|
| 135 |
-
seed: Union[int, None] = None,
|
| 136 |
-
color_map: str = "Spectral", # TODO change colorization api based on modality
|
| 137 |
-
show_progress_bar: bool = True,
|
| 138 |
-
**kwargs,
|
| 139 |
-
) -> MarigoldIIDLightingOutput:
|
| 140 |
-
"""
|
| 141 |
-
Function invoked when calling the pipeline.
|
| 142 |
-
|
| 143 |
-
Args:
|
| 144 |
-
input_image (`Image`):
|
| 145 |
-
Input RGB (or gray-scale) image.
|
| 146 |
-
denoising_steps (`int`, *optional*, defaults to `10`):
|
| 147 |
-
Number of diffusion denoising steps (DDIM) during inference.
|
| 148 |
-
ensemble_size (`int`, *optional*, defaults to `10`):
|
| 149 |
-
Number of predictions to be ensembled.
|
| 150 |
-
processing_res (`int`, *optional*, defaults to `768`):
|
| 151 |
-
Maximum resolution of processing.
|
| 152 |
-
If set to 0: will not resize at all.
|
| 153 |
-
match_input_res (`bool`, *optional*, defaults to `True`):
|
| 154 |
-
Resize normals prediction to match input resolution.
|
| 155 |
-
Only valid if `limit_input_res` is not None.
|
| 156 |
-
resample_method: (`str`, *optional*, defaults to `bilinear`):
|
| 157 |
-
Resampling method used to resize images and depth predictions. This can be one of `bilinear`, `bicubic` or `nearest`, defaults to: `bilinear`.
|
| 158 |
-
batch_size (`int`, *optional*, defaults to `0`):
|
| 159 |
-
Inference batch size, no bigger than `num_ensemble`.
|
| 160 |
-
If set to 0, the script will automatically decide the proper batch size.
|
| 161 |
-
save_memory (`bool`, defaults to `False`):
|
| 162 |
-
Extra steps to save memory at the cost of perforance.
|
| 163 |
-
seed (`int`, *optional*, defaults to `None`)
|
| 164 |
-
Reproducibility seed.
|
| 165 |
-
color_map (`str`, *optional*, defaults to `"Spectral"`, pass `None` to skip colorized normals map generation):
|
| 166 |
-
Colormap used to colorize the normals map.
|
| 167 |
-
show_progress_bar (`bool`, *optional*, defaults to `True`):
|
| 168 |
-
Display a progress bar of diffusion denoising.
|
| 169 |
-
Returns:
|
| 170 |
-
`MarigoldIIDLightingOutput`: Output class for Marigold monocular intrinsic image decomposition (lighting) prediction pipeline, including:
|
| 171 |
-
- **albedo** (`np.ndarray`) Predicted albedo map with the shape of [3, H, W] values in the range of [0, 1]
|
| 172 |
-
- **albedo_colored** (`PIL.Image.Image`) Colorized albedo map with the shape of [3, H, W] values in the range of [0, 1]
|
| 173 |
-
- **material** (`np.ndarray`) Predicted material map with the shape of [3, H, W] and values in [0, 1]
|
| 174 |
-
- **material_colored** (`PIL.Image.Image`) Colorized material map with the shape of [3, H, W] and values in [0, 1]
|
| 175 |
-
"""
|
| 176 |
-
|
| 177 |
-
if not match_input_res:
|
| 178 |
-
assert processing_res is not None
|
| 179 |
-
assert processing_res >= 0
|
| 180 |
-
assert denoising_steps >= 1
|
| 181 |
-
assert ensemble_size >= 1
|
| 182 |
-
|
| 183 |
-
# Check if denoising step is reasonable
|
| 184 |
-
self.check_inference_step(denoising_steps)
|
| 185 |
-
|
| 186 |
-
resample_method: Resampling = self.get_pil_resample_method(resample_method)
|
| 187 |
-
|
| 188 |
-
W, H = input_image.size
|
| 189 |
-
|
| 190 |
-
if processing_res > 0:
|
| 191 |
-
input_image = self.resize_max_res(
|
| 192 |
-
input_image,
|
| 193 |
-
max_edge_resolution=processing_res,
|
| 194 |
-
resample_method=resample_method,
|
| 195 |
-
)
|
| 196 |
-
input_image = input_image.convert("RGB")
|
| 197 |
-
image = np.asarray(input_image)
|
| 198 |
-
|
| 199 |
-
rgb = np.transpose(image, (2, 0, 1)) # [H, W, rgb] -> [rgb, H, W]
|
| 200 |
-
rgb_norm = rgb / 255.0 * 2.0 - 1.0 # [0, 255] -> [-1, 1]
|
| 201 |
-
rgb_norm = torch.from_numpy(rgb_norm).to(self.dtype)
|
| 202 |
-
rgb_norm = rgb_norm.to(self.device)
|
| 203 |
-
assert rgb_norm.min() >= -1.0 and rgb_norm.max() <= 1.0 # TODO remove this
|
| 204 |
-
|
| 205 |
-
def ensemble(
|
| 206 |
-
targets: torch.Tensor,
|
| 207 |
-
return_uncertainty: bool = False,
|
| 208 |
-
reduction="median",
|
| 209 |
-
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
|
| 210 |
-
uncertainty = None
|
| 211 |
-
if reduction == "mean":
|
| 212 |
-
prediction = torch.mean(targets, dim=0, keepdim=True)
|
| 213 |
-
if return_uncertainty:
|
| 214 |
-
uncertainty = torch.std(targets, dim=0, keepdim=True)
|
| 215 |
-
elif reduction == "median":
|
| 216 |
-
prediction = torch.median(targets, dim=0, keepdim=True).values
|
| 217 |
-
if return_uncertainty:
|
| 218 |
-
uncertainty = torch.median(
|
| 219 |
-
torch.abs(targets - prediction), dim=0, keepdim=True
|
| 220 |
-
).values
|
| 221 |
-
else:
|
| 222 |
-
raise ValueError(f"Unrecognized reduction method: {reduction}.")
|
| 223 |
-
return prediction, uncertainty
|
| 224 |
-
|
| 225 |
-
duplicated_rgb = torch.stack([rgb_norm] * ensemble_size)
|
| 226 |
-
single_rgb_dataset = TensorDataset(duplicated_rgb)
|
| 227 |
-
|
| 228 |
-
if batch_size <= 0:
|
| 229 |
-
batch_size = self.find_batch_size(
|
| 230 |
-
ensemble_size=ensemble_size,
|
| 231 |
-
input_res=max(rgb_norm.shape[1:]),
|
| 232 |
-
dtype=self.dtype,
|
| 233 |
-
)
|
| 234 |
-
|
| 235 |
-
single_rgb_loader = DataLoader(
|
| 236 |
-
single_rgb_dataset, batch_size=batch_size, shuffle=False
|
| 237 |
-
)
|
| 238 |
-
|
| 239 |
-
target_pred_ls = []
|
| 240 |
-
iterable = single_rgb_loader
|
| 241 |
-
if show_progress_bar:
|
| 242 |
-
iterable = tqdm(
|
| 243 |
-
single_rgb_loader, desc=" " * 2 + "Inference batches", leave=False
|
| 244 |
-
)
|
| 245 |
-
|
| 246 |
-
for batch in iterable:
|
| 247 |
-
(batched_img,) = batch
|
| 248 |
-
target_pred = self.single_infer(
|
| 249 |
-
rgb_in=batched_img,
|
| 250 |
-
num_inference_steps=denoising_steps,
|
| 251 |
-
seed=seed,
|
| 252 |
-
show_pbar=show_progress_bar,
|
| 253 |
-
)
|
| 254 |
-
target_pred = target_pred.detach()
|
| 255 |
-
if save_memory:
|
| 256 |
-
target_pred = target_pred.cpu()
|
| 257 |
-
target_pred_ls.append(target_pred.detach())
|
| 258 |
-
|
| 259 |
-
target_preds = torch.concat(target_pred_ls, dim=0)
|
| 260 |
-
pred_uncert = None
|
| 261 |
-
|
| 262 |
-
if save_memory:
|
| 263 |
-
torch.cuda.empty_cache()
|
| 264 |
-
|
| 265 |
-
if ensemble_size > 1:
|
| 266 |
-
final_pred, pred_uncert = ensemble(
|
| 267 |
-
target_preds, reduction="median", return_uncertainty=False
|
| 268 |
-
)
|
| 269 |
-
else:
|
| 270 |
-
final_pred = target_preds
|
| 271 |
-
pred_uncert = None
|
| 272 |
-
|
| 273 |
-
if match_input_res:
|
| 274 |
-
final_pred = torch.nn.functional.interpolate(
|
| 275 |
-
final_pred, (H, W), mode="bilinear" # TODO: parameterize this method
|
| 276 |
-
) # [1,3,H,W]
|
| 277 |
-
|
| 278 |
-
if pred_uncert is not None:
|
| 279 |
-
pred_uncert = torch.nn.functional.interpolate(
|
| 280 |
-
pred_uncert.unsqueeze(1), (H, W), mode="bilinear"
|
| 281 |
-
).squeeze(
|
| 282 |
-
1
|
| 283 |
-
) # [1,H,W]
|
| 284 |
-
|
| 285 |
-
# Convert to numpy
|
| 286 |
-
final_pred = final_pred.squeeze()
|
| 287 |
-
final_pred = final_pred.cpu().float().numpy()
|
| 288 |
-
|
| 289 |
-
albedo = final_pred[0:3, :, :]
|
| 290 |
-
shading = final_pred[3:6, :, :]
|
| 291 |
-
residual = final_pred[6:, :, :]
|
| 292 |
-
|
| 293 |
-
albedo_colored = (albedo + 1.0) * 0.5 # [-1,1] -> [0,1]
|
| 294 |
-
albedo_colored = albedo_colored ** (
|
| 295 |
-
1 / 2.2
|
| 296 |
-
) # from linear to sRGB (to be consistent with IID-Appearance model)
|
| 297 |
-
albedo_colored = (albedo_colored * 255).astype(np.uint8)
|
| 298 |
-
albedo_colored = self.chw2hwc(albedo_colored)
|
| 299 |
-
albedo_colored_img = Image.fromarray(albedo_colored)
|
| 300 |
-
|
| 301 |
-
shading_colored = (shading + 1.0) * 0.5
|
| 302 |
-
shading_colored = (
|
| 303 |
-
shading_colored / shading_colored.max()
|
| 304 |
-
) # rescale for better visualization
|
| 305 |
-
shading_colored = (shading_colored * 255).astype(np.uint8)
|
| 306 |
-
shading_colored = self.chw2hwc(shading_colored)
|
| 307 |
-
shading_colored_img = Image.fromarray(shading_colored)
|
| 308 |
-
|
| 309 |
-
residual_colored = (residual + 1.0) * 0.5
|
| 310 |
-
residual_colored = (
|
| 311 |
-
residual_colored / residual_colored.max()
|
| 312 |
-
) # rescale for better visualization
|
| 313 |
-
residual_colored = (residual_colored * 255).astype(np.uint8)
|
| 314 |
-
residual_colored = self.chw2hwc(residual_colored)
|
| 315 |
-
residual_colored_img = Image.fromarray(residual_colored)
|
| 316 |
-
|
| 317 |
-
out = MarigoldIIDLightingOutput(
|
| 318 |
-
albedo=albedo,
|
| 319 |
-
albedo_colored=albedo_colored_img,
|
| 320 |
-
shading=shading,
|
| 321 |
-
shading_colored=shading_colored_img,
|
| 322 |
-
residual=residual,
|
| 323 |
-
residual_colored=residual_colored_img,
|
| 324 |
-
)
|
| 325 |
-
|
| 326 |
-
return out
|
| 327 |
-
|
| 328 |
-
def check_inference_step(self, n_step: int):
|
| 329 |
-
"""
|
| 330 |
-
Check if denoising step is reasonable
|
| 331 |
-
Args:
|
| 332 |
-
n_step (`int`): denoising steps
|
| 333 |
-
"""
|
| 334 |
-
assert n_step >= 1
|
| 335 |
-
|
| 336 |
-
if isinstance(self.scheduler, DDIMScheduler):
|
| 337 |
-
pass
|
| 338 |
-
else:
|
| 339 |
-
raise RuntimeError(f"Unsupported scheduler type: {type(self.scheduler)}")
|
| 340 |
-
|
| 341 |
-
def encode_empty_text(self):
|
| 342 |
-
"""
|
| 343 |
-
Encode text embedding for empty prompt.
|
| 344 |
-
"""
|
| 345 |
-
prompt = ""
|
| 346 |
-
text_inputs = self.tokenizer(
|
| 347 |
-
prompt,
|
| 348 |
-
padding="do_not_pad",
|
| 349 |
-
max_length=self.tokenizer.model_max_length,
|
| 350 |
-
truncation=True,
|
| 351 |
-
return_tensors="pt",
|
| 352 |
-
)
|
| 353 |
-
text_input_ids = text_inputs.input_ids.to(self.text_encoder.device)
|
| 354 |
-
self.empty_text_embed = self.text_encoder(text_input_ids)[0].to(self.dtype)
|
| 355 |
-
|
| 356 |
-
@torch.no_grad()
|
| 357 |
-
def single_infer(
|
| 358 |
-
self,
|
| 359 |
-
rgb_in: torch.Tensor,
|
| 360 |
-
num_inference_steps: int,
|
| 361 |
-
seed: Union[int, None],
|
| 362 |
-
show_pbar: bool,
|
| 363 |
-
) -> torch.Tensor:
|
| 364 |
-
"""
|
| 365 |
-
Perform an individual iid prediction without ensembling.
|
| 366 |
-
"""
|
| 367 |
-
device = rgb_in.device
|
| 368 |
-
|
| 369 |
-
# Set timesteps
|
| 370 |
-
self.scheduler.set_timesteps(num_inference_steps, device=device)
|
| 371 |
-
timesteps = self.scheduler.timesteps # [T]
|
| 372 |
-
|
| 373 |
-
# Encode image
|
| 374 |
-
rgb_latent = self.encode_rgb(rgb_in)
|
| 375 |
-
|
| 376 |
-
target_latent_shape = list(rgb_latent.shape)
|
| 377 |
-
target_latent_shape[
|
| 378 |
-
1
|
| 379 |
-
] *= 3 # TODO: no hardcoding # self.n_targets # (B, 4*n_targets, h, w)
|
| 380 |
-
|
| 381 |
-
# Initialize prediction latent with noise
|
| 382 |
-
if seed is None:
|
| 383 |
-
rand_num_generator = None
|
| 384 |
-
else:
|
| 385 |
-
rand_num_generator = torch.Generator(device=device)
|
| 386 |
-
rand_num_generator.manual_seed(seed)
|
| 387 |
-
target_latents = torch.randn(
|
| 388 |
-
target_latent_shape,
|
| 389 |
-
device=device,
|
| 390 |
-
dtype=self.dtype,
|
| 391 |
-
generator=rand_num_generator,
|
| 392 |
-
) # [B, 4, h, w]
|
| 393 |
-
|
| 394 |
-
# Batched empty text embedding
|
| 395 |
-
if self.empty_text_embed is None:
|
| 396 |
-
self.encode_empty_text()
|
        batch_empty_text_embed = self.empty_text_embed.repeat(
            (rgb_latent.shape[0], 1, 1)
        )  # [B, 2, 1024]

        # Denoising loop
        if show_pbar:
            iterable = tqdm(
                enumerate(timesteps),
                total=len(timesteps),
                leave=False,
                desc=" " * 4 + "Diffusion denoising",
            )
        else:
            iterable = enumerate(timesteps)

        for i, t in iterable:
            unet_input = torch.cat(
                [rgb_latent, target_latents], dim=1
            )  # this order is important

            # predict the noise residual
            noise_pred = self.unet(
                unet_input, t, encoder_hidden_states=batch_empty_text_embed
            ).sample  # [B, 4, h, w]

            # compute the previous noisy sample x_t -> x_t-1
            target_latents = self.scheduler.step(
                noise_pred, t, target_latents, generator=rand_num_generator
            ).prev_sample

        # torch.cuda.empty_cache()  # TODO: is it really needed here, even for memory saving?

        targets = self.decode_targets(target_latents)  # [B, 3, H, W]
        targets = torch.clip(targets, -1.0, 1.0)

        return targets

    def encode_rgb(self, rgb_in: torch.Tensor) -> torch.Tensor:
        """
        Encode RGB image into latent.

        Args:
            rgb_in (`torch.Tensor`):
                Input RGB image to be encoded.

        Returns:
            `torch.Tensor`: Image latent.
        """
        # encode
        h = self.vae.encoder(rgb_in)
        moments = self.vae.quant_conv(h)
        mean, logvar = torch.chunk(moments, 2, dim=1)
        # scale latent
        rgb_latent = mean * self.latent_scale_factor
        return rgb_latent

    def decode_targets(self, target_latents: torch.Tensor) -> torch.Tensor:
        """
        Decode target latent into target map.

        Args:
            target_latents (`torch.Tensor`):
                Target latent to be decoded.

        Returns:
            `torch.Tensor`: Decoded target map.
        """

        assert target_latents.shape[1] == 12  # self.n_targets * 4

        # scale latent
        target_latents = target_latents / self.latent_scale_factor
        # decode each 4-channel slice separately
        targets = []
        for i in range(self.n_targets):
            latent = target_latents[:, i * 4 : (i + 1) * 4, :, :]
            z = self.vae.post_quant_conv(latent)
            stacked = self.vae.decoder(z)

            targets.append(stacked)

        return torch.cat(targets, dim=1)

    @staticmethod
    def get_pil_resample_method(method_str: str) -> Resampling:
        resample_method_dic = {
            "bilinear": Resampling.BILINEAR,
            "bicubic": Resampling.BICUBIC,
            "nearest": Resampling.NEAREST,
        }
        resample_method = resample_method_dic.get(method_str, None)
        if resample_method is None:
            raise ValueError(f"Unknown resampling method: {method_str}")
        else:
            return resample_method

    @staticmethod
    def resize_max_res(
        img: Image.Image, max_edge_resolution: int, resample_method=Resampling.BILINEAR
    ) -> Image.Image:
        """
        Resize image to limit maximum edge length while keeping aspect ratio.
        """
        original_width, original_height = img.size
        downscale_factor = min(
            max_edge_resolution / original_width, max_edge_resolution / original_height
        )

        new_width = int(original_width * downscale_factor)
        new_height = int(original_height * downscale_factor)

        resized_img = img.resize((new_width, new_height), resample=resample_method)
        return resized_img

    @staticmethod
    def chw2hwc(chw):
        assert 3 == len(chw.shape)
        if isinstance(chw, torch.Tensor):
            hwc = torch.permute(chw, (1, 2, 0))
        elif isinstance(chw, np.ndarray):
            hwc = np.moveaxis(chw, 0, -1)
        return hwc

    @staticmethod
    def find_batch_size(ensemble_size: int, input_res: int, dtype: torch.dtype) -> int:
        """
        Automatically search for a suitable operating batch size.

        Args:
            ensemble_size (`int`):
                Number of predictions to be ensembled.
            input_res (`int`):
                Operating resolution of the input image.

        Returns:
            `int`: Operating batch size.
        """
        # Search table for suggested max. inference batch size
        bs_search_table = [
            # tested on A100-PCIE-80GB
            {"res": 768, "total_vram": 79, "bs": 35, "dtype": torch.float32},
            {"res": 1024, "total_vram": 79, "bs": 20, "dtype": torch.float32},
            # tested on A100-PCIE-40GB
            {"res": 768, "total_vram": 39, "bs": 15, "dtype": torch.float32},
            {"res": 1024, "total_vram": 39, "bs": 8, "dtype": torch.float32},
            {"res": 768, "total_vram": 39, "bs": 30, "dtype": torch.float16},
            {"res": 1024, "total_vram": 39, "bs": 15, "dtype": torch.float16},
            # tested on RTX3090, RTX4090
            {"res": 512, "total_vram": 23, "bs": 20, "dtype": torch.float32},
            {"res": 768, "total_vram": 23, "bs": 7, "dtype": torch.float32},
            {"res": 1024, "total_vram": 23, "bs": 3, "dtype": torch.float32},
            {"res": 512, "total_vram": 23, "bs": 40, "dtype": torch.float16},
            {"res": 768, "total_vram": 23, "bs": 18, "dtype": torch.float16},
            {"res": 1024, "total_vram": 23, "bs": 10, "dtype": torch.float16},
            # tested on GTX1080Ti
            {"res": 512, "total_vram": 10, "bs": 5, "dtype": torch.float32},
            {"res": 768, "total_vram": 10, "bs": 2, "dtype": torch.float32},
            {"res": 512, "total_vram": 10, "bs": 10, "dtype": torch.float16},
            {"res": 768, "total_vram": 10, "bs": 5, "dtype": torch.float16},
            {"res": 1024, "total_vram": 10, "bs": 3, "dtype": torch.float16},
        ]

        if not torch.cuda.is_available():
            return 1

        total_vram = torch.cuda.mem_get_info()[1] / 1024.0**3
        filtered_bs_search_table = [s for s in bs_search_table if s["dtype"] == dtype]
        for settings in sorted(
            filtered_bs_search_table,
            key=lambda k: (k["res"], -k["total_vram"]),
        ):
            if input_res <= settings["res"] and total_vram >= settings["total_vram"]:
                bs = settings["bs"]
                if bs > ensemble_size:
                    bs = ensemble_size
                elif bs > math.ceil(ensemble_size / 2) and bs < ensemble_size:
                    bs = math.ceil(ensemble_size / 2)
                return bs

        return 1
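The removed decode_targets() above relies on a stacked latent of n_targets * 4 VAE channels that is decoded slice by slice. The following standalone sketch only illustrates that tensor layout; stub_decode and the 0.18215 scale factor are illustrative assumptions standing in for the SD VAE decoder and latent_scale_factor, not part of the deleted file.

import torch

# Sketch of the latent layout assumed by the removed decode_targets():
# n_targets * 4 channels stacked along dim=1, decoded one 4-channel slice at a time.
# stub_decode stands in for vae.post_quant_conv + vae.decoder (assumption);
# 0.18215 is the usual SD VAE scale factor, assumed equal to latent_scale_factor.
n_targets = 3
latent_scale_factor = 0.18215

def stub_decode(latent_4ch: torch.Tensor) -> torch.Tensor:
    # Placeholder decoder: [B, 4, h, w] -> [B, 3, 8h, 8w], like the SD VAE.
    b, _, h, w = latent_4ch.shape
    return torch.zeros(b, 3, h * 8, w * 8)

target_latents = torch.randn(1, n_targets * 4, 32, 32)       # [B, 12, h, w]
target_latents = target_latents / latent_scale_factor        # undo latent scaling
targets = [
    stub_decode(target_latents[:, i * 4 : (i + 1) * 4])      # one 4-channel slice per target
    for i in range(n_targets)
]
print(torch.cat(targets, dim=1).shape)                       # torch.Size([1, 9, 256, 256])

The find_batch_size() helper above follows a similar pragmatic spirit: pick the largest batch known to fit the detected VRAM at the chosen precision and resolution, then cap it by the ensemble size.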
requirements.txt
CHANGED

@@ -1,4 +1,4 @@
-diffusers>=0.
+diffusers>=0.33.0
 git+https://github.com/toshas/gradio-dualvision.git@21346a4
 accelerate
 huggingface_hub
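With the requirement bumped to diffusers>=0.33.0, the single-file pipeline implementations removed in this commit can presumably be replaced by the upstream Marigold intrinsics support. A minimal sketch, assuming the class is exposed as diffusers.MarigoldIntrinsicsPipeline and that the checkpoint layout is unchanged; the call signature hint is an assumption, not taken from this commit:

import torch
from diffusers import MarigoldIntrinsicsPipeline  # assumed upstream class name in diffusers>=0.33.0

# Hedged sketch: load one of the Marigold IID checkpoints through the upstream
# pipeline instead of the removed single-file implementations.
pipe = MarigoldIntrinsicsPipeline.from_pretrained(
    "prs-eth/marigold-iid-lighting-v1-1",
    torch_dtype=torch.bfloat16,
).to("cuda")
# Assumed to mirror other Marigold pipelines, e.g.:
# out = pipe(image, num_inference_steps=4, ensemble_size=1)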