ShinoharaHare committed on
Commit c191d3a · verified
1 Parent(s): 43ee822

Add files using upload-large-folder tool
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ images/miku/decensored.png filter=lfs diff=lfs merge=lfs -text
37
+ images/mermaid/decensored.png filter=lfs diff=lfs merge=lfs -text
38
+ images/megumin/decensored.png filter=lfs diff=lfs merge=lfs -text
39
+ images/megumin/censored.png filter=lfs diff=lfs merge=lfs -text
40
+ images/miku/censored.png filter=lfs diff=lfs merge=lfs -text
41
+ images/mermaid/censored.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,75 @@
---
language:
- en
tags:
- text-to-image
- image-to-image
- safetensors
- stable-diffusion
- stable-diffusion-xl
license: openrail++
pipeline_tag: image-to-image
library_name: diffusers
---

# Waifu Decensor XL

| Censored | Mask | Decensored |
|:---:|:---:|:---:|
| <img src="images/mermaid/censored.png" style="max-width: 210px;"> | <img src="images/mermaid/mask.png" style="max-width: 210px;"> | <img src="images/mermaid/decensored.png" style="max-width: 210px;"> |
| <img src="images/miku/censored.png" style="max-width: 210px;"> | <img src="images/miku/mask.png" style="max-width: 210px;"> | <img src="images/miku/decensored.png" style="max-width: 210px;"> |
| <img src="images/megumin/censored.png" style="max-width: 210px;"> | <img src="images/megumin/mask.png" style="max-width: 210px;"> | <img src="images/megumin/decensored.png" style="max-width: 210px;"> |

## Overview

**\[Experimental\] This model is in an early experimental stage. Its performance may not be reliable for all use cases, and results can vary depending on the input image and mask.**

**Waifu Decensor XL** is a specialized Stable Diffusion XL (SDXL) model designed for inpainting, particularly for "de-censoring" or removing mosaic effects from anime-style illustrations.

This model was fine-tuned from [**WAI-NSFW-illustrious-SDXL V14.0**](https://civitai.com/models/827184/wai-nsfw-illustrious-sdxl). Its UNet has been modified to accept 5 additional input channels: 1 for the mask and 4 for the latents of the censored image.

A key feature is the mask's value range:
- **0.0 (Black)**: The area remains unchanged.
- **0.5 (Gray)**: The area is "de-mosaiced," using the censored information as a guide.
- **1.0 (White)**: The area is inpainted from scratch.

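In practice the mask is simply a grayscale image whose pixel values select one of the three modes per region. Below is a minimal sketch of composing such a mask with NumPy and Pillow; the file names and region coordinates are placeholders, not values from this repository.

```python
# Sketch: building a mask that mixes the three modes described above.
# The input path and rectangle coordinates are placeholders.
import numpy as np
from PIL import Image

censored = Image.open('censored.png')
mask = np.zeros((censored.height, censored.width), dtype=np.uint8)  # 0 = keep unchanged

mask[100:200, 150:300] = 128  # ~0.5: de-mosaic, reusing the censored pixels as a guide
mask[250:320, 150:300] = 255  # 1.0: inpaint this region from scratch

Image.fromarray(mask, mode='L').save('mask.png')
```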

## 🧨 Diffusers

```python
import torch
from diffusers import DiffusionPipeline
from diffusers.utils import load_image

pipeline = DiffusionPipeline.from_pretrained(
    'ShinoharaHare/Waifu-Decensor-XL',
    torch_dtype=torch.bfloat16,
    use_flash_attention=True  # optional
)
pipeline.to('cuda')

censored = load_image(...)
mask = load_image(...)

decensored = pipeline(
    censored,
    mask_image=mask,
    width=censored.width,
    height=censored.height,
    strength=1.0,
    num_inference_steps=28,
    ensure_image_consistency=True,  # It's recommended to enable this option to ensure that the unmasked areas remain unchanged and to reduce color discrepancies.
    dilate_latent_mask=5  # If you haven't left extra space around the mask, make sure to set this parameter. Otherwise, you may notice significant degradation along the edges.
).images[0]

decensored.show()
```

## Limitations

- **Color Discrepancies**: The model may struggle with color consistency in two ways:
  1. **Global Mismatch**: Without `ensure_image_consistency=True`, you may notice a significant color shift across the **entire image**. Enabling this option is crucial, as it locks the unmasked areas to their original colors. However, even with this enabled, the newly generated area can still have a color tone that does not perfectly match its surrounding context.
  2. **Local Inconsistency**: For demosaicing tasks, the model is designed to infer the original colors from the color palette of the censored block itself. However, it may sometimes fail to follow this hint, resulting in colors that deviate from what the censored area suggests.

- **Large-area inpainting**: Inpainting very large regions may result in lower-quality or less coherent outputs.

**If you have suggestions on how to improve the above issues, please share them!**
attention_processor.py ADDED
@@ -0,0 +1,92 @@
1
+ import torch
2
+ from diffusers.models.attention import Attention
3
+
4
+ try:
5
+ from flash_attn import flash_attn_func
6
+ except ImportError:
7
+ flash_attn_func = None
8
+
9
+
10
+ class AttnProcessorFA2:
11
+ def __init__(self):
12
+ if flash_attn_func is None:
13
+ raise ImportError(
14
+ "`flash_attn_func` is not available. Please install flash-attention with `pip install flash-attn --no-build-isolation`."
15
+ )
16
+
17
+ def __call__(
18
+ self,
19
+ attn: Attention,
20
+ hidden_states: torch.Tensor,
21
+ encoder_hidden_states: torch.Tensor | None = None,
22
+ attention_mask: torch.Tensor | None = None,
23
+ temb: torch.Tensor | None = None
24
+ ) -> torch.Tensor:
25
+ residual = hidden_states
26
+ if attn.spatial_norm is not None:
27
+ hidden_states = attn.spatial_norm(hidden_states, temb)
28
+
29
+ input_ndim = hidden_states.ndim
30
+
31
+ if input_ndim == 4:
32
+ batch_size, channel, height, width = hidden_states.shape
33
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
34
+
35
+ batch_size, sequence_length, _ = (
36
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
37
+ )
38
+
39
+ if attention_mask is not None:
40
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
41
+ # scaled_dot_product_attention expects attention_mask shape to be
42
+ # (batch, heads, source_length, target_length)
43
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
44
+
45
+ if attn.group_norm is not None:
46
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
47
+
48
+ query = attn.to_q(hidden_states)
49
+
50
+ if encoder_hidden_states is None:
51
+ encoder_hidden_states = hidden_states
52
+ elif attn.norm_cross:
53
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
54
+
55
+ key = attn.to_k(encoder_hidden_states)
56
+ value = attn.to_v(encoder_hidden_states)
57
+
58
+ inner_dim = key.shape[-1]
59
+ head_dim = inner_dim // attn.heads
60
+
61
+ query = query.view(batch_size, -1, attn.heads, head_dim)
62
+ key = key.view(batch_size, -1, attn.heads, head_dim)
63
+ value = value.view(batch_size, -1, attn.heads, head_dim)
64
+
65
+ if attn.norm_q is not None:
66
+ query = attn.norm_q(query)
67
+ if attn.norm_k is not None:
68
+ key = attn.norm_k(key)
69
+
70
+ hidden_states = flash_attn_func(
71
+ query,
72
+ key,
73
+ value
74
+ )
75
+
76
+ hidden_states = hidden_states.reshape(batch_size, -1, attn.heads * head_dim)
77
+ hidden_states = hidden_states.to(query.dtype)
78
+
79
+ # linear proj
80
+ hidden_states = attn.to_out[0](hidden_states)
81
+ # dropout
82
+ hidden_states = attn.to_out[1](hidden_states)
83
+
84
+ if input_ndim == 4:
85
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
86
+
87
+ if attn.residual_connection:
88
+ hidden_states = hidden_states + residual
89
+
90
+ hidden_states = hidden_states / attn.rescale_output_factor
91
+
92
+ return hidden_states
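For reference, a minimal sketch of using this processor on its own, assuming `flash-attn` is installed, this `attention_processor.py` file is on the import path, and the modified UNet weights live in the repository's `unet/` subfolder. The pipeline enables the same behavior when loaded with `use_flash_attention=True`.

```python
import torch
from diffusers import UNet2DConditionModel

from attention_processor import AttnProcessorFA2  # this module

# Load the modified SDXL UNet and route all attention layers through FlashAttention-2.
unet = UNet2DConditionModel.from_pretrained(
    'ShinoharaHare/Waifu-Decensor-XL',
    subfolder='unet',
    torch_dtype=torch.bfloat16,
)
unet.set_attn_processor(AttnProcessorFA2())  # raises ImportError if flash-attn is missing
```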
images/megumin/censored.png ADDED

Git LFS Details

  • SHA256: 8b9b3df8643bc65982c031a979eb3088fc4d95a2f345e6e737264b45d1e62d21
  • Pointer size: 131 Bytes
  • Size of remote file: 982 kB
images/megumin/decensored.png ADDED

Git LFS Details

  • SHA256: acf44b8fbd5c0c690137e922b9f1a85f844cd0bb9b980b102e2c0252ecd2c62c
  • Pointer size: 132 Bytes
  • Size of remote file: 1.28 MB
images/megumin/mask.png ADDED
images/mermaid/censored.png ADDED

Git LFS Details

  • SHA256: d5aeb8e34b8acdd1e09b5cc8187289b5ec1fe7f7fc28c77b7141bee59bfccf4d
  • Pointer size: 131 Bytes
  • Size of remote file: 783 kB
images/mermaid/decensored.png ADDED

Git LFS Details

  • SHA256: 58f65aa6f40fafed6a0e74f7a896f9292432def61ef3036ff84efd7d3710f1f9
  • Pointer size: 131 Bytes
  • Size of remote file: 839 kB
images/mermaid/mask.png ADDED
images/miku/censored.png ADDED

Git LFS Details

  • SHA256: 65a48878d06d18238b7f707d533baddd90c91a9738fa694d987f532ce2705cef
  • Pointer size: 132 Bytes
  • Size of remote file: 1.25 MB
images/miku/decensored.png ADDED

Git LFS Details

  • SHA256: 1f6327e08ed2933bb08efd8382b6693cef0c395cbd9d0e255ae36f861f9f52b2
  • Pointer size: 132 Bytes
  • Size of remote file: 1.35 MB
images/miku/mask.png ADDED
model_index.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "_class_name": [
3
+ "sdxl_decensor_pipeline",
4
+ "SDXLDecensorPipeline"
5
+ ],
6
+ "_diffusers_version": "0.33.1",
7
+ "_name_or_path": "checkpoints/WAI-NSFW-illustrious-SDXL-V14.0",
8
+ "feature_extractor": [
9
+ null,
10
+ null
11
+ ],
12
+ "force_zeros_for_empty_prompt": true,
13
+ "image_encoder": [
14
+ null,
15
+ null
16
+ ],
17
+ "requires_aesthetics_score": false,
18
+ "scheduler": [
19
+ "diffusers",
20
+ "EulerDiscreteScheduler"
21
+ ],
22
+ "text_encoder": [
23
+ "transformers",
24
+ "CLIPTextModel"
25
+ ],
26
+ "text_encoder_2": [
27
+ "transformers",
28
+ "CLIPTextModelWithProjection"
29
+ ],
30
+ "tokenizer": [
31
+ "transformers",
32
+ "CLIPTokenizer"
33
+ ],
34
+ "tokenizer_2": [
35
+ "transformers",
36
+ "CLIPTokenizer"
37
+ ],
38
+ "unet": [
39
+ "diffusers",
40
+ "UNet2DConditionModel"
41
+ ],
42
+ "vae": [
43
+ "diffusers",
44
+ "AutoencoderKL"
45
+ ]
46
+ }
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,19 @@
1
+ {
2
+ "_class_name": "EulerDiscreteScheduler",
3
+ "_diffusers_version": "0.33.1",
4
+ "beta_end": 0.012,
5
+ "beta_schedule": "scaled_linear",
6
+ "beta_start": 0.00085,
7
+ "clip_sample": false,
8
+ "interpolation_type": "linear",
9
+ "num_train_timesteps": 1000,
10
+ "prediction_type": "epsilon",
11
+ "rescale_betas_zero_snr": false,
12
+ "sample_max_value": 1.0,
13
+ "set_alpha_to_one": false,
14
+ "skip_prk_steps": true,
15
+ "steps_offset": 1,
16
+ "timestep_spacing": "leading",
17
+ "trained_betas": null,
18
+ "use_karras_sigmas": false
19
+ }
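For reference, the scheduler described by this config can also be loaded on its own through the standard diffusers API; a minimal sketch, with the repo id taken from the model card:

```python
from diffusers import EulerDiscreteScheduler

# Instantiate only the scheduler from the JSON config above.
scheduler = EulerDiscreteScheduler.from_pretrained(
    'ShinoharaHare/Waifu-Decensor-XL', subfolder='scheduler'
)
print(scheduler.config.timestep_spacing)  # 'leading'
```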
sdxl_decensor_pipeline.py ADDED
@@ -0,0 +1,939 @@
1
+ from typing import Callable
2
+
3
+ import torch.nn.functional as F
4
+ from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_inpaint import *
5
+
6
+ from .attention_processor import AttnProcessorFA2
7
+
8
+
9
+ class SDXLDecensorPipeline(StableDiffusionXLInpaintPipeline):
10
+ vae: AutoencoderKL
11
+ text_encoder: CLIPTextModel
12
+ text_encoder_2: CLIPTextModelWithProjection
13
+ tokenizer: CLIPTokenizer
14
+ tokenizer_2: CLIPTokenizer
15
+ unet: UNet2DConditionModel
16
+
17
+ def __init__(
18
+ self,
19
+ vae: AutoencoderKL,
20
+ text_encoder: CLIPTextModel,
21
+ text_encoder_2: CLIPTextModelWithProjection,
22
+ tokenizer: CLIPTokenizer,
23
+ tokenizer_2: CLIPTokenizer,
24
+ unet: UNet2DConditionModel,
25
+ scheduler: KarrasDiffusionSchedulers,
26
+ image_encoder: CLIPVisionModelWithProjection = None,
27
+ feature_extractor: CLIPImageProcessor = None,
28
+ requires_aesthetics_score: bool = False,
29
+ force_zeros_for_empty_prompt: bool = True,
30
+ add_watermarker: Optional[bool] = None,
31
+ use_flash_attention: bool = False,
32
+ suppress_tokenizer_warning: bool = False
33
+ ) -> None:
34
+ super(StableDiffusionXLInpaintPipeline).__init__()
35
+
36
+ self.register_modules(
37
+ vae=vae,
38
+ text_encoder=text_encoder,
39
+ text_encoder_2=text_encoder_2,
40
+ tokenizer=tokenizer,
41
+ tokenizer_2=tokenizer_2,
42
+ unet=unet,
43
+ image_encoder=image_encoder,
44
+ feature_extractor=feature_extractor,
45
+ scheduler=scheduler,
46
+ )
47
+ self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
48
+ self.register_to_config(requires_aesthetics_score=requires_aesthetics_score)
49
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
50
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
51
+ self.mask_processor = VaeImageProcessor(
52
+ vae_scale_factor=self.vae_scale_factor,
53
+ do_normalize=False,
54
+ do_binarize=False,
55
+ do_convert_grayscale=True
56
+ )
57
+
58
+ add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
59
+
60
+ if add_watermarker:
61
+ self.watermark = StableDiffusionXLWatermarker()
62
+ else:
63
+ self.watermark = None
64
+
65
+ if use_flash_attention:
66
+ self.unet.set_attn_processor(AttnProcessorFA2())
67
+
68
+ self.set_progress_bar_config(leave=False)
69
+
70
+ if suppress_tokenizer_warning:
71
+ self.tokenizer.deprecation_warnings['sequence-length-is-longer-than-the-specified-maximum'] = True
72
+ self.tokenizer_2.deprecation_warnings['sequence-length-is-longer-than-the-specified-maximum'] = True
73
+
74
+ def check_inputs(
75
+ self,
76
+ prompt,
77
+ prompt_2,
78
+ image,
79
+ mask_image,
80
+ height,
81
+ width,
82
+ strength,
83
+ callback_steps,
84
+ output_type,
85
+ negative_prompt=None,
86
+ negative_prompt_2=None,
87
+ prompt_embeds=None,
88
+ negative_prompt_embeds=None,
89
+ ip_adapter_image=None,
90
+ ip_adapter_image_embeds=None,
91
+ callback_on_step_end_tensor_inputs=None,
92
+ padding_mask_crop=None,
93
+ ):
94
+ if strength < 0 or strength > 1:
95
+ raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
96
+
97
+ if height % 8 != 0 or width % 8 != 0:
98
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
99
+
100
+ if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
101
+ raise ValueError(
102
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
103
+ f" {type(callback_steps)}."
104
+ )
105
+
106
+ if callback_on_step_end_tensor_inputs is not None and not all(
107
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
108
+ ):
109
+ raise ValueError(
110
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
111
+ )
112
+
113
+ if prompt is not None and prompt_embeds is not None:
114
+ raise ValueError(
115
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
116
+ " only forward one of the two."
117
+ )
118
+ elif prompt_2 is not None and prompt_embeds is not None:
119
+ raise ValueError(
120
+ f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
121
+ " only forward one of the two."
122
+ )
123
+ elif prompt is None and prompt_embeds is None:
124
+ raise ValueError(
125
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
126
+ )
127
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
128
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
129
+ elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
130
+ raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
131
+
132
+ if negative_prompt is not None and negative_prompt_embeds is not None:
133
+ raise ValueError(
134
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
135
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
136
+ )
137
+ elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
138
+ raise ValueError(
139
+ f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
140
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
141
+ )
142
+
143
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
144
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
145
+ raise ValueError(
146
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
147
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
148
+ f" {negative_prompt_embeds.shape}."
149
+ )
150
+ if padding_mask_crop is not None:
151
+ if not isinstance(image, PIL.Image.Image):
152
+ raise ValueError(
153
+ f"The image should be a PIL image when inpainting mask crop, but is of type {type(image)}."
154
+ )
155
+ if not isinstance(mask_image, PIL.Image.Image):
156
+ raise ValueError(
157
+ f"The mask image should be a PIL image when inpainting mask crop, but is of type"
158
+ f" {type(mask_image)}."
159
+ )
160
+ if output_type != "pil":
161
+ raise ValueError(f"The output type should be PIL when inpainting mask crop, but is {output_type}.")
162
+
163
+ if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
164
+ raise ValueError(
165
+ "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
166
+ )
167
+
168
+ if ip_adapter_image_embeds is not None:
169
+ if not isinstance(ip_adapter_image_embeds, list):
170
+ raise ValueError(
171
+ f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
172
+ )
173
+ elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
174
+ raise ValueError(
175
+ f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
176
+ )
177
+
178
+ def prepare_mask_latents(
179
+ self,
180
+ mask,
181
+ masked_image,
182
+ batch_size,
183
+ height,
184
+ width,
185
+ dtype,
186
+ device,
187
+ generator,
188
+ do_classifier_free_guidance,
189
+ dilate_latent_mask: int | None
190
+ ):
191
+ # resize the mask to latents shape as we concatenate the mask to the latents
192
+ # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
193
+ # and half precision
194
+ mask = torch.nn.functional.interpolate(
195
+ mask,
196
+ size=(height // self.vae_scale_factor, width // self.vae_scale_factor),
197
+ mode='nearest-exact'
198
+ )
199
+
200
+ if dilate_latent_mask is not None:
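+ # Dilate the latent-space mask via max pooling: the kernel size is forced to be odd, and every masked region grows by kernel_size // 2 latent pixels on each side, giving the model some clean context around tight masks.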
201
+ kernel_size = dilate_latent_mask
202
+ kernel_size += 1 - kernel_size % 2
203
+ mask = F.max_pool2d(mask, kernel_size, stride=1, padding=kernel_size // 2)
204
+
205
+ mask = mask.to(device=device, dtype=dtype)
206
+
207
+ # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
208
+ if mask.shape[0] < batch_size:
209
+ if not batch_size % mask.shape[0] == 0:
210
+ raise ValueError(
211
+ "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
212
+ f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
213
+ " of masks that you pass is divisible by the total requested batch size."
214
+ )
215
+ mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
216
+
217
+ mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
218
+
219
+ if masked_image is not None and masked_image.shape[1] == 4:
220
+ masked_image_latents = masked_image
221
+ else:
222
+ masked_image_latents = None
223
+
224
+ if masked_image is not None:
225
+ if masked_image_latents is None:
226
+ masked_image = masked_image.to(device=device, dtype=dtype)
227
+ masked_image_latents = self._encode_vae_image(masked_image, generator=generator)
228
+
229
+ if masked_image_latents.shape[0] < batch_size:
230
+ if not batch_size % masked_image_latents.shape[0] == 0:
231
+ raise ValueError(
232
+ "The passed images and the required batch size don't match. Images are supposed to be duplicated"
233
+ f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
234
+ " Make sure the number of images that you pass is divisible by the total requested batch size."
235
+ )
236
+ masked_image_latents = masked_image_latents.repeat(
237
+ batch_size // masked_image_latents.shape[0], 1, 1, 1
238
+ )
239
+
240
+ masked_image_latents = (
241
+ torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
242
+ )
243
+
244
+ # aligning device to prevent device errors when concating it with the latent model input
245
+ masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
246
+
247
+ return mask, masked_image_latents
248
+
249
+ @torch.no_grad()
250
+ def __call__(
251
+ self,
252
+ prompt: Union[str, List[str]] = None,
253
+ prompt_2: Optional[Union[str, List[str]]] = None,
254
+ image: PipelineImageInput = None,
255
+ mask_image: PipelineImageInput = None,
256
+ masked_image_latents: torch.Tensor = None,
257
+ height: Optional[int] = None,
258
+ width: Optional[int] = None,
259
+ padding_mask_crop: Optional[int] = None,
260
+ strength: float = 0.9999,
261
+ num_inference_steps: int = 50,
262
+ timesteps: List[int] = None,
263
+ sigmas: List[float] = None,
264
+ denoising_start: Optional[float] = None,
265
+ denoising_end: Optional[float] = None,
266
+ guidance_scale: float = 7.5,
267
+ negative_prompt: Optional[Union[str, List[str]]] = None,
268
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
269
+ num_images_per_prompt: Optional[int] = 1,
270
+ eta: float = 0.0,
271
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
272
+ latents: Optional[torch.Tensor] = None,
273
+ prompt_embeds: Optional[torch.Tensor] = None,
274
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
275
+ pooled_prompt_embeds: Optional[torch.Tensor] = None,
276
+ negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
277
+ ip_adapter_image: Optional[PipelineImageInput] = None,
278
+ ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
279
+ output_type: Optional[str] = "pil",
280
+ return_dict: bool = True,
281
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
282
+ guidance_rescale: float = 0.0,
283
+ original_size: Tuple[int, int] = None,
284
+ crops_coords_top_left: Tuple[int, int] = (0, 0),
285
+ target_size: Tuple[int, int] = None,
286
+ negative_original_size: Optional[Tuple[int, int]] = None,
287
+ negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
288
+ negative_target_size: Optional[Tuple[int, int]] = None,
289
+ aesthetic_score: float = 6.0,
290
+ negative_aesthetic_score: float = 2.5,
291
+ clip_skip: Optional[int] = None,
292
+ callback_on_step_end: Optional[
293
+ Union[
294
+ Callable[[int, int, Dict], None],
295
+ PipelineCallback,
296
+ MultiPipelineCallbacks
297
+ ]
298
+ ] = None,
299
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
300
+ ensure_image_consistency: bool = False,
301
+ dilate_latent_mask: int | None = None,
302
+ **kwargs
303
+ ):
304
+ r"""
305
+ Function invoked when calling the pipeline for generation.
306
+
307
+ Args:
308
+ prompt (`str` or `List[str]`, *optional*):
309
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
310
+ instead.
311
+ prompt_2 (`str` or `List[str]`, *optional*):
312
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
313
+ used in both text-encoders
314
+ image (`PIL.Image.Image`):
315
+ `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
316
+ be masked out with `mask_image` and repainted according to `prompt`.
317
+ mask_image (`PIL.Image.Image`):
318
+ `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
319
+ repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
320
+ to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
321
+ instead of 3, so the expected shape would be `(B, H, W, 1)`.
322
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
323
+ The height in pixels of the generated image. This is set to 1024 by default for the best results.
324
+ Anything below 512 pixels won't work well for
325
+ [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
326
+ and checkpoints that are not specifically fine-tuned on low resolutions.
327
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
328
+ The width in pixels of the generated image. This is set to 1024 by default for the best results.
329
+ Anything below 512 pixels won't work well for
330
+ [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
331
+ and checkpoints that are not specifically fine-tuned on low resolutions.
332
+ padding_mask_crop (`int`, *optional*, defaults to `None`):
333
+ The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to
334
+ image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region
335
+ with the same aspect ratio as the image that contains all masked areas, and then expand that area based
336
+ on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before
337
+ resizing to the original image size for inpainting. This is useful when the masked area is small while
338
+ the image is large and contain information irrelevant for inpainting, such as background.
339
+ strength (`float`, *optional*, defaults to 0.9999):
340
+ Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be
341
+ between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the
342
+ `strength`. The number of denoising steps depends on the amount of noise initially added. When
343
+ `strength` is 1, added noise will be maximum and the denoising process will run for the full number of
344
+ iterations specified in `num_inference_steps`. A value of 1, therefore, essentially ignores the masked
345
+ portion of the reference `image`. Note that in the case of `denoising_start` being declared as an
346
+ integer, the value of `strength` will be ignored.
347
+ num_inference_steps (`int`, *optional*, defaults to 50):
348
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
349
+ expense of slower inference.
350
+ timesteps (`List[int]`, *optional*):
351
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
352
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
353
+ passed will be used. Must be in descending order.
354
+ sigmas (`List[float]`, *optional*):
355
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
356
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
357
+ will be used.
358
+ denoising_start (`float`, *optional*):
359
+ When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be
360
+ bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and
361
+ it is assumed that the passed `image` is a partly denoised image. Note that when this is specified,
362
+ strength will be ignored. The `denoising_start` parameter is particularly beneficial when this pipeline
363
+ is integrated into a "Mixture of Denoisers" multi-pipeline setup, as detailed in [**Refining the Image
364
+ Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output).
365
+ denoising_end (`float`, *optional*):
366
+ When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
367
+ completed before it is intentionally prematurely terminated. As a result, the returned sample will
368
+ still retain a substantial amount of noise (ca. final 20% of timesteps still needed) and should be
369
+ denoised by a successor pipeline that has `denoising_start` set to 0.8 so that it only denoises the
370
+ final 20% of the scheduler. The denoising_end parameter should ideally be utilized when this pipeline
371
+ forms a part of a "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
372
+ Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output).
373
+ guidance_scale (`float`, *optional*, defaults to 7.5):
374
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
375
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
376
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
377
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
378
+ usually at the expense of lower image quality.
379
+ negative_prompt (`str` or `List[str]`, *optional*):
380
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
381
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
382
+ less than `1`).
383
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
384
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
385
+ `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
386
+ prompt_embeds (`torch.Tensor`, *optional*):
387
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
388
+ provided, text embeddings will be generated from `prompt` input argument.
389
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
390
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
391
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
392
+ argument.
393
+ pooled_prompt_embeds (`torch.Tensor`, *optional*):
394
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
395
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
396
+ negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
397
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
398
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
399
+ input argument.
400
+ ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
401
+ ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
402
+ Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
403
+ IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
404
+ contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
405
+ provided, embeddings are computed from the `ip_adapter_image` input argument.
406
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
407
+ The number of images to generate per prompt.
408
+ eta (`float`, *optional*, defaults to 0.0):
409
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
410
+ [`schedulers.DDIMScheduler`], will be ignored for others.
411
+ generator (`torch.Generator`, *optional*):
412
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
413
+ to make generation deterministic.
414
+ latents (`torch.Tensor`, *optional*):
415
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
416
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
417
+ tensor will be generated by sampling using the supplied random `generator`.
418
+ output_type (`str`, *optional*, defaults to `"pil"`):
419
+ The output format of the generated image. Choose between
420
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
421
+ return_dict (`bool`, *optional*, defaults to `True`):
422
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
423
+ plain tuple.
424
+ cross_attention_kwargs (`dict`, *optional*):
425
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
426
+ `self.processor` in
427
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
428
+ original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
429
+ If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
430
+ `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
431
+ explained in section 2.2 of
432
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
433
+ crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
434
+ `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
435
+ `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
436
+ `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
437
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
438
+ target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
439
+ For most cases, `target_size` should be set to the desired height and width of the generated image. If
440
+ not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
441
+ section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
442
+ negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
443
+ To negatively condition the generation process based on a specific image resolution. Part of SDXL's
444
+ micro-conditioning as explained in section 2.2 of
445
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
446
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
447
+ negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
448
+ To negatively condition the generation process based on specific crop coordinates. Part of SDXL's
449
+ micro-conditioning as explained in section 2.2 of
450
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
451
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
452
+ negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
453
+ To negatively condition the generation process based on a target image resolution. It should be the same
454
+ as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
455
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
456
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
457
+ aesthetic_score (`float`, *optional*, defaults to 6.0):
458
+ Used to simulate an aesthetic score of the generated image by influencing the positive text condition.
459
+ Part of SDXL's micro-conditioning as explained in section 2.2 of
460
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
461
+ negative_aesthetic_score (`float`, *optional*, defaults to 2.5):
462
+ Part of SDXL's micro-conditioning as explained in section 2.2 of
463
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to
464
+ simulate an aesthetic score of the generated image by influencing the negative text condition.
465
+ clip_skip (`int`, *optional*):
466
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
467
+ the output of the pre-final layer will be used for computing the prompt embeddings.
468
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
469
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
470
+ each denoising step during inference, with the following arguments: `callback_on_step_end(self:
471
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
472
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
473
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
474
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
475
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
476
+ `._callback_tensor_inputs` attribute of your pipeline class.
477
+
478
+ Examples:
479
+
480
+ Returns:
481
+ [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`:
482
+ [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
483
+ `tuple`. When returning a tuple, the first element is a list with the generated images.
484
+ """
485
+
486
+ callback = kwargs.pop("callback", None)
487
+ callback_steps = kwargs.pop("callback_steps", None)
488
+
489
+ if callback is not None:
490
+ deprecate(
491
+ "callback",
492
+ "1.0.0",
493
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
494
+ )
495
+ if callback_steps is not None:
496
+ deprecate(
497
+ "callback_steps",
498
+ "1.0.0",
499
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
500
+ )
501
+
502
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
503
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
504
+
505
+ # 0. Default height and width to unet
506
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
507
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
508
+
509
+ if prompt is None and prompt_embeds is None:
510
+ prompt = ''
511
+ guidance_scale = 0.0
512
+
513
+ # 1. Check inputs
514
+ self.check_inputs(
515
+ prompt,
516
+ prompt_2,
517
+ image,
518
+ mask_image,
519
+ height,
520
+ width,
521
+ strength,
522
+ callback_steps,
523
+ output_type,
524
+ negative_prompt,
525
+ negative_prompt_2,
526
+ prompt_embeds,
527
+ negative_prompt_embeds,
528
+ ip_adapter_image,
529
+ ip_adapter_image_embeds,
530
+ callback_on_step_end_tensor_inputs,
531
+ padding_mask_crop,
532
+ )
533
+
534
+ self._guidance_scale = guidance_scale
535
+ self._guidance_rescale = guidance_rescale
536
+ self._clip_skip = clip_skip
537
+ self._cross_attention_kwargs = cross_attention_kwargs
538
+ self._denoising_end = denoising_end
539
+ self._denoising_start = denoising_start
540
+ self._interrupt = False
541
+
542
+ # 2. Define call parameters
543
+ if prompt is not None and isinstance(prompt, str):
544
+ batch_size = 1
545
+ elif prompt is not None and isinstance(prompt, list):
546
+ batch_size = len(prompt)
547
+ else:
548
+ batch_size = prompt_embeds.shape[0]
549
+
550
+ device = self._execution_device
551
+
552
+ # 3. Encode input prompt
553
+ text_encoder_lora_scale = (
554
+ self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
555
+ )
556
+
557
+ (
558
+ prompt_embeds,
559
+ negative_prompt_embeds,
560
+ pooled_prompt_embeds,
561
+ negative_pooled_prompt_embeds,
562
+ ) = self.encode_prompt(
563
+ prompt=prompt,
564
+ prompt_2=prompt_2,
565
+ device=device,
566
+ num_images_per_prompt=num_images_per_prompt,
567
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
568
+ negative_prompt=negative_prompt,
569
+ negative_prompt_2=negative_prompt_2,
570
+ prompt_embeds=prompt_embeds,
571
+ negative_prompt_embeds=negative_prompt_embeds,
572
+ pooled_prompt_embeds=pooled_prompt_embeds,
573
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
574
+ lora_scale=text_encoder_lora_scale,
575
+ clip_skip=self.clip_skip,
576
+ )
577
+
578
+ # 4. set timesteps
579
+ def denoising_value_valid(dnv):
580
+ return isinstance(dnv, float) and 0 < dnv < 1
581
+
582
+ timesteps, num_inference_steps = retrieve_timesteps(
583
+ self.scheduler, num_inference_steps, device, timesteps, sigmas
584
+ )
585
+ timesteps, num_inference_steps = self.get_timesteps(
586
+ num_inference_steps,
587
+ strength,
588
+ device,
589
+ denoising_start=self.denoising_start if denoising_value_valid(self.denoising_start) else None,
590
+ )
591
+ # check that number of inference steps is not < 1 - as this doesn't make sense
592
+ if num_inference_steps < 1:
593
+ raise ValueError(
594
+ f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
595
+ f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
596
+ )
597
+ # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
598
+ latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
599
+ # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
600
+ is_strength_max = strength == 1.0
601
+
602
+ # 5. Preprocess mask and image
603
+ if padding_mask_crop is not None:
604
+ crops_coords = self.mask_processor.get_crop_region(
605
+ mask_image,
606
+ width,
607
+ height,
608
+ pad=padding_mask_crop
609
+ )
610
+ resize_mode = "fill"
611
+ else:
612
+ crops_coords = None
613
+ resize_mode = "default"
614
+
615
+ original_image = image
616
+ init_image = self.image_processor.preprocess(
617
+ image,
618
+ height=height,
619
+ width=width,
620
+ crops_coords=crops_coords,
621
+ resize_mode=resize_mode
622
+ )
623
+ init_image = init_image.to(dtype=torch.float32)
624
+
625
+ mask = self.mask_processor.preprocess(
626
+ mask_image,
627
+ height=height,
628
+ width=width,
629
+ resize_mode=resize_mode,
630
+ crops_coords=crops_coords
631
+ )
632
+
633
+ if masked_image_latents is not None:
634
+ masked_image = masked_image_latents
635
+ elif init_image.shape[1] == 4:
636
+ # if images are in latent space, we can't mask it
637
+ masked_image = None
638
+ else:
639
+ # masked_image = init_image * (mask < 0.5)
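+ # Unlike standard inpainting (the commented-out line above, which blanks everything under the mask), only fully white (1.0) regions are replaced with a constant 0.5; gray (0.5) regions keep the censored pixels so they can guide de-mosaicing.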
640
+ masked_image = torch.where(mask < 1.0, init_image, 0.5)
641
+
642
+ # 6. Prepare latent variables
643
+ num_channels_latents = self.vae.config.latent_channels
644
+ num_channels_unet = self.unet.config.in_channels
645
+ # return_image_latents = num_channels_unet == 4
646
+ return_image_latents = ensure_image_consistency
647
+
648
+ add_noise = True if self.denoising_start is None else False
649
+ latents_outputs = self.prepare_latents(
650
+ batch_size * num_images_per_prompt,
651
+ num_channels_latents,
652
+ height,
653
+ width,
654
+ prompt_embeds.dtype,
655
+ device,
656
+ generator,
657
+ latents,
658
+ image=init_image,
659
+ timestep=latent_timestep,
660
+ is_strength_max=is_strength_max,
661
+ add_noise=add_noise,
662
+ return_noise=True,
663
+ return_image_latents=return_image_latents,
664
+ )
665
+
666
+ if return_image_latents:
667
+ latents, noise, image_latents = latents_outputs
668
+ else:
669
+ latents, noise = latents_outputs
670
+
671
+ # 7. Prepare mask latent variables
672
+ latent_mask, masked_image_latents = self.prepare_mask_latents(
673
+ mask,
674
+ masked_image,
675
+ batch_size * num_images_per_prompt,
676
+ height,
677
+ width,
678
+ prompt_embeds.dtype,
679
+ device,
680
+ generator,
681
+ self.do_classifier_free_guidance,
682
+ dilate_latent_mask
683
+ )
684
+
685
+ # 8. Check that sizes of mask, masked image and latents match
686
+ if num_channels_unet == 9:
687
+ # default case for runwayml/stable-diffusion-inpainting
688
+ num_channels_mask = mask.shape[1]
689
+ num_channels_masked_image = masked_image_latents.shape[1]
690
+ if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
691
+ raise ValueError(
692
+ f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
693
+ f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
694
+ f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
695
+ f" = {num_channels_latents + num_channels_masked_image + num_channels_mask}. Please verify the config of"
696
+ " `pipeline.unet` or your `mask_image` or `image` input."
697
+ )
698
+ elif num_channels_unet != 4:
699
+ raise ValueError(
700
+ f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}."
701
+ )
702
+ # 8.1 Prepare extra step kwargs.
703
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
704
+
705
+ # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
706
+ height, width = latents.shape[-2:]
707
+ height = height * self.vae_scale_factor
708
+ width = width * self.vae_scale_factor
709
+
710
+ original_size = original_size or (height, width)
711
+ target_size = target_size or (height, width)
712
+
713
+ # 10. Prepare added time ids & embeddings
714
+ if negative_original_size is None:
715
+ negative_original_size = original_size
716
+ if negative_target_size is None:
717
+ negative_target_size = target_size
718
+
719
+ add_text_embeds = pooled_prompt_embeds
720
+ if self.text_encoder_2 is None:
721
+ text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
722
+ else:
723
+ text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
724
+
725
+ add_time_ids, add_neg_time_ids = self._get_add_time_ids(
726
+ original_size,
727
+ crops_coords_top_left,
728
+ target_size,
729
+ aesthetic_score,
730
+ negative_aesthetic_score,
731
+ negative_original_size,
732
+ negative_crops_coords_top_left,
733
+ negative_target_size,
734
+ dtype=prompt_embeds.dtype,
735
+ text_encoder_projection_dim=text_encoder_projection_dim,
736
+ )
737
+ add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1)
738
+
739
+ if self.do_classifier_free_guidance:
740
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
741
+ add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
742
+ add_neg_time_ids = add_neg_time_ids.repeat(batch_size * num_images_per_prompt, 1)
743
+ add_time_ids = torch.cat([add_neg_time_ids, add_time_ids], dim=0)
744
+
745
+ prompt_embeds = prompt_embeds.to(device)
746
+ add_text_embeds = add_text_embeds.to(device)
747
+ add_time_ids = add_time_ids.to(device)
748
+
749
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
750
+ image_embeds = self.prepare_ip_adapter_image_embeds(
751
+ ip_adapter_image,
752
+ ip_adapter_image_embeds,
753
+ device,
754
+ batch_size * num_images_per_prompt,
755
+ self.do_classifier_free_guidance,
756
+ )
757
+
758
+ # 11. Denoising loop
759
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
760
+
761
+ if (
762
+ self.denoising_end is not None
763
+ and self.denoising_start is not None
764
+ and denoising_value_valid(self.denoising_end)
765
+ and denoising_value_valid(self.denoising_start)
766
+ and self.denoising_start >= self.denoising_end
767
+ ):
768
+ raise ValueError(
769
+ f"`denoising_start`: {self.denoising_start} cannot be larger than or equal to `denoising_end`: "
770
+ + f" {self.denoising_end} when using type float."
771
+ )
772
+ elif self.denoising_end is not None and denoising_value_valid(self.denoising_end):
773
+ discrete_timestep_cutoff = int(
774
+ round(
775
+ self.scheduler.config.num_train_timesteps
776
+ - (self.denoising_end * self.scheduler.config.num_train_timesteps)
777
+ )
778
+ )
779
+ num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
780
+ timesteps = timesteps[:num_inference_steps]
781
+
782
+ # 11.1 Optionally get Guidance Scale Embedding
783
+ timestep_cond = None
784
+ if self.unet.config.time_cond_proj_dim is not None:
785
+ guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
786
+ timestep_cond = self.get_guidance_scale_embedding(
787
+ guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
788
+ ).to(device=device, dtype=latents.dtype)
789
+
790
+ self._num_timesteps = len(timesteps)
791
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
792
+ for i, t in enumerate(timesteps):
793
+ if self.interrupt:
794
+ continue
795
+ # expand the latents if we are doing classifier free guidance
796
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
797
+
798
+ # concat latents, mask, masked_image_latents in the channel dimension
799
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
800
+ latent_model_input = torch.cat(
801
+ [
802
+ latent_model_input,
803
+ latent_mask,
804
+ masked_image_latents
805
+ ],
806
+ dim=1
807
+ )
808
+
809
+ # predict the noise residual
810
+ added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
811
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
812
+ added_cond_kwargs["image_embeds"] = image_embeds
813
+
814
+ noise_pred = self.unet(
815
+ latent_model_input,
816
+ t,
817
+ encoder_hidden_states=prompt_embeds,
818
+ timestep_cond=timestep_cond,
819
+ cross_attention_kwargs=self.cross_attention_kwargs,
820
+ added_cond_kwargs=added_cond_kwargs,
821
+ return_dict=False
822
+ )[0]
823
+
824
+ # perform guidance
825
+ if self.do_classifier_free_guidance:
826
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
827
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
828
+
829
+ if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
830
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
831
+ noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
832
+
833
+ # compute the previous noisy sample x_t -> x_t-1
834
+ latents_dtype = latents.dtype
835
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
836
+ if latents.dtype != latents_dtype:
837
+ if torch.backends.mps.is_available():
838
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
839
+ latents = latents.to(latents_dtype)
840
+
841
+ if ensure_image_consistency:
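+ # Re-inject the original image latents (re-noised to the next timestep) outside the mask after each step, keeping unmasked regions consistent with the input.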
842
+ init_latents_proper = image_latents
843
+ if self.do_classifier_free_guidance:
844
+ init_mask, _ = latent_mask.chunk(2)
845
+ else:
846
+ init_mask = latent_mask
847
+
848
+ init_mask = init_mask.bool().to(latents)
849
+
850
+ if i < len(timesteps) - 1:
851
+ noise_timestep = timesteps[i + 1]
852
+ init_latents_proper = self.scheduler.add_noise(
853
+ init_latents_proper, noise, torch.tensor([noise_timestep])
854
+ )
855
+
856
+ latents = (1 - init_mask) * init_latents_proper + init_mask * latents
857
+
858
+ if callback_on_step_end is not None:
859
+ callback_kwargs = {}
860
+ for k in callback_on_step_end_tensor_inputs:
861
+ callback_kwargs[k] = locals()[k]
862
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
863
+
864
+ latents = callback_outputs.pop("latents", latents)
865
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
866
+ add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
867
+ add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
868
+ mask = callback_outputs.pop("mask", mask)
869
+ masked_image_latents = callback_outputs.pop("masked_image_latents", masked_image_latents)
870
+
871
+ # call the callback, if provided
872
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
873
+ progress_bar.update()
874
+ if callback is not None and i % callback_steps == 0:
875
+ step_idx = i // getattr(self.scheduler, "order", 1)
876
+ callback(step_idx, t, latents)
877
+
878
+ if XLA_AVAILABLE:
879
+ xm.mark_step()
880
+
881
+ if not output_type == "latent":
882
+ # make sure the VAE is in float32 mode, as it overflows in float16
883
+ needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
884
+
885
+ if needs_upcasting:
886
+ self.upcast_vae()
887
+ latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
888
+ elif latents.dtype != self.vae.dtype:
889
+ if torch.backends.mps.is_available():
890
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
891
+ self.vae = self.vae.to(latents.dtype)
892
+
893
+ # unscale/denormalize the latents
894
+ # denormalize with the mean and std if available and not None
895
+ has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
896
+ has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
897
+ if has_latents_mean and has_latents_std:
898
+ latents_mean = (
899
+ torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
900
+ )
901
+ latents_std = (
902
+ torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
903
+ )
904
+ latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
905
+ else:
906
+ latents = latents / self.vae.config.scaling_factor
907
+
908
+ image = self.vae.decode(latents, return_dict=False)[0]
909
+
910
+ if ensure_image_consistency:
911
+ init_image = init_image.to(image)
912
+ init_mask = mask.bool().to(image)
913
+ kernel_size = 11
914
+ init_mask = F.max_pool2d(init_mask, kernel_size=kernel_size, stride=1, padding=kernel_size // 2)
915
+ init_mask = F.avg_pool2d(init_mask, kernel_size=kernel_size, stride=1, padding=kernel_size // 2)
916
+ image = (1 - init_mask) * init_image + init_mask * image.clamp(-1, 1)
917
+
918
+ # cast back to fp16 if needed
919
+ if needs_upcasting:
920
+ self.vae.to(dtype=torch.float16)
921
+ else:
922
+ return StableDiffusionXLPipelineOutput(images=latents)
923
+
924
+ # apply watermark if available
925
+ if self.watermark is not None:
926
+ image = self.watermark.apply_watermark(image)
927
+
928
+ image = self.image_processor.postprocess(image, output_type=output_type)
929
+
930
+ if padding_mask_crop is not None:
931
+ image = [self.image_processor.apply_overlay(mask_image, original_image, i, crops_coords) for i in image]
932
+
933
+ # Offload all models
934
+ self.maybe_free_model_hooks()
935
+
936
+ if not return_dict:
937
+ return (image,)
938
+
939
+ return StableDiffusionXLPipelineOutput(images=image)
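
When `ensure_image_consistency` is enabled, the loop above does two things: during denoising it re-noises the original image latents and copies them back into the unmasked region after every scheduler step, and after VAE decoding it dilates and feathers the pixel-space mask before compositing, so changes fade out smoothly at the mask boundary. Below is a minimal, self-contained sketch of that feathering-and-compositing step on toy tensors (shapes and values are illustrative only, not the pipeline's actual inputs):

```python
import torch
import torch.nn.functional as F

# Toy binary mask: 1.0 where the image should be repainted, 0.0 elsewhere ([B, 1, H, W]).
mask = torch.zeros(1, 1, 64, 64)
mask[:, :, 20:44, 20:44] = 1.0

kernel_size = 11
# max_pool2d with stride 1 dilates the mask by kernel_size // 2 pixels in every direction,
dilated = F.max_pool2d(mask, kernel_size=kernel_size, stride=1, padding=kernel_size // 2)
# and avg_pool2d then softens the hard edge into a linear ramp (a feathered alpha matte).
feathered = F.avg_pool2d(dilated, kernel_size=kernel_size, stride=1, padding=kernel_size // 2)

init_image = torch.zeros(1, 3, 64, 64)  # stand-in for the original image, in [-1, 1]
decoded = torch.ones(1, 3, 64, 64)      # stand-in for the freshly decoded result
# Same compositing rule as the pipeline: keep init_image wherever the feathered mask is 0.
composite = (1 - feathered) * init_image + feathered * decoded.clamp(-1, 1)
print(feathered.min().item(), feathered.max().item())  # 0.0 1.0
```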
text_encoder/config.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "architectures": [
+ "CLIPTextModel"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 0,
+ "dropout": 0.0,
+ "eos_token_id": 2,
+ "hidden_act": "quick_gelu",
+ "hidden_size": 768,
+ "initializer_factor": 1.0,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-05,
+ "max_position_embeddings": 77,
+ "model_type": "clip_text_model",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "pad_token_id": 1,
+ "projection_dim": 768,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.51.3",
+ "vocab_size": 49408
+ }
text_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f740d60658b082057a21d1c4c75ab7533c9d602da727063e33d0df7dcafd15e1
+ size 246144352
text_encoder_2/config.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "architectures": [
+ "CLIPTextModelWithProjection"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 0,
+ "dropout": 0.0,
+ "eos_token_id": 2,
+ "hidden_act": "gelu",
+ "hidden_size": 1280,
+ "initializer_factor": 1.0,
+ "initializer_range": 0.02,
+ "intermediate_size": 5120,
+ "layer_norm_eps": 1e-05,
+ "max_position_embeddings": 77,
+ "model_type": "clip_text_model",
+ "num_attention_heads": 20,
+ "num_hidden_layers": 32,
+ "pad_token_id": 1,
+ "projection_dim": 1280,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.51.3",
+ "vocab_size": 49408
+ }
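
The two text encoders follow the standard SDXL layout: `text_encoder` is a 768-dim `CLIPTextModel` and `text_encoder_2` is a 1280-dim `CLIPTextModelWithProjection`; their concatenated hidden states form the 2048-dim cross-attention context declared by the UNet config further below. A quick sanity check, assuming both are loaded straight from this repository's subfolders:

```python
from transformers import CLIPTextModel, CLIPTextModelWithProjection

repo = 'ShinoharaHare/Waifu-Decensor-XL'
te1 = CLIPTextModel.from_pretrained(repo, subfolder='text_encoder')
te2 = CLIPTextModelWithProjection.from_pretrained(repo, subfolder='text_encoder_2')

print(te1.config.hidden_size)  # 768
print(te2.config.hidden_size)  # 1280
# 768 + 1280 = 2048, matching cross_attention_dim in unet/config.json.
```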
text_encoder_2/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:670dc25b730c9d8894bbd45562eb7d98a8b896b4d5851dd3ce2f1f4dd6b5b022
+ size 1389382688
tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "bos_token": {
+ "content": "<|startoftext|>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "49406": {
+ "content": "<|startoftext|>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "49407": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|startoftext|>",
+ "clean_up_tokenization_spaces": true,
+ "do_lower_case": true,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": {},
+ "model_max_length": 77,
+ "pad_token": "<|endoftext|>",
+ "tokenizer_class": "CLIPTokenizer",
+ "unk_token": "<|endoftext|>"
+ }
tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_2/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_2/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "bos_token": {
+ "content": "<|startoftext|>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "!",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
tokenizer_2/tokenizer_config.json ADDED
@@ -0,0 +1,39 @@
+ {
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "!",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "49406": {
+ "content": "<|startoftext|>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "49407": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|startoftext|>",
+ "clean_up_tokenization_spaces": true,
+ "do_lower_case": true,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "extra_special_tokens": {},
+ "model_max_length": 77,
+ "pad_token": "!",
+ "tokenizer_class": "CLIPTokenizer",
+ "unk_token": "<|endoftext|>"
+ }
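
The two tokenizers are identical CLIP BPE tokenizers apart from their padding token: `tokenizer` pads with `<|endoftext|>` (id 49407), while `tokenizer_2` pads with `!` (id 0), the convention used for SDXL's second, OpenCLIP-derived text encoder. A small check, assuming both are loaded from this repository:

```python
from transformers import CLIPTokenizer

repo = 'ShinoharaHare/Waifu-Decensor-XL'
tok1 = CLIPTokenizer.from_pretrained(repo, subfolder='tokenizer')
tok2 = CLIPTokenizer.from_pretrained(repo, subfolder='tokenizer_2')

print(tok1.pad_token, tok1.pad_token_id)  # <|endoftext|> 49407
print(tok2.pad_token, tok2.pad_token_id)  # ! 0
```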
tokenizer_2/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
unet/config.json ADDED
@@ -0,0 +1,73 @@
+ {
+ "_class_name": "UNet2DConditionModel",
+ "_diffusers_version": "0.33.1",
+ "_name_or_path": "checkpoints/WAI-NSFW-illustrious-SDXL-V14.0",
+ "act_fn": "silu",
+ "addition_embed_type": "text_time",
+ "addition_embed_type_num_heads": 64,
+ "addition_time_embed_dim": 256,
+ "attention_head_dim": [
+ 5,
+ 10,
+ 20
+ ],
+ "attention_type": "default",
+ "block_out_channels": [
+ 320,
+ 640,
+ 1280
+ ],
+ "center_input_sample": false,
+ "class_embed_type": null,
+ "class_embeddings_concat": false,
+ "conv_in_kernel": 3,
+ "conv_out_kernel": 3,
+ "cross_attention_dim": 2048,
+ "cross_attention_norm": null,
+ "down_block_types": [
+ "DownBlock2D",
+ "CrossAttnDownBlock2D",
+ "CrossAttnDownBlock2D"
+ ],
+ "downsample_padding": 1,
+ "dropout": 0.0,
+ "dual_cross_attention": false,
+ "encoder_hid_dim": null,
+ "encoder_hid_dim_type": null,
+ "flip_sin_to_cos": true,
+ "freq_shift": 0,
+ "in_channels": 9,
+ "layers_per_block": 2,
+ "mid_block_only_cross_attention": null,
+ "mid_block_scale_factor": 1,
+ "mid_block_type": "UNetMidBlock2DCrossAttn",
+ "norm_eps": 1e-05,
+ "norm_num_groups": 32,
+ "num_attention_heads": null,
+ "num_class_embeds": null,
+ "only_cross_attention": false,
+ "out_channels": 4,
+ "projection_class_embeddings_input_dim": 2816,
+ "resnet_out_scale_factor": 1.0,
+ "resnet_skip_time_act": false,
+ "resnet_time_scale_shift": "default",
+ "reverse_transformer_layers_per_block": null,
+ "sample_size": 128,
+ "time_cond_proj_dim": null,
+ "time_embedding_act_fn": null,
+ "time_embedding_dim": null,
+ "time_embedding_type": "positional",
+ "timestep_post_act": null,
+ "transformer_layers_per_block": [
+ 1,
+ 2,
+ 10
+ ],
+ "up_block_types": [
+ "CrossAttnUpBlock2D",
+ "CrossAttnUpBlock2D",
+ "UpBlock2D"
+ ],
+ "upcast_attention": null,
+ "use_linear_projection": true
+ }
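
Relative to a stock SDXL UNet, the only structural change in this config is `in_channels: 9` (a vanilla SDXL UNet has 4), which widens `conv_in` to accept the extra conditioning latents; everything else matches SDXL, including `cross_attention_dim: 2048` (768 + 1280 from the two text encoders) and `projection_class_embeddings_input_dim: 2816` (the 1280-dim pooled text embedding plus 6 × 256 micro-conditioning time embeddings). A hedged sketch for confirming the widened input convolution after loading the UNet from this repository's `unet` subfolder:

```python
import torch
from diffusers import UNet2DConditionModel

unet = UNet2DConditionModel.from_pretrained(
    'ShinoharaHare/Waifu-Decensor-XL', subfolder='unet', torch_dtype=torch.bfloat16
)
print(unet.config.in_channels)    # 9, instead of SDXL's usual 4
print(unet.conv_in.weight.shape)  # torch.Size([320, 9, 3, 3]): block_out_channels[0] x in_channels x conv_in_kernel x conv_in_kernel
```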
unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b20347ccb331bc183abf1f6ef8c723a5c49f1cd47ce594fea3fbc58cb93e6008
+ size 5135180240
vae/config.json ADDED
@@ -0,0 +1,38 @@
+ {
+ "_class_name": "AutoencoderKL",
+ "_diffusers_version": "0.33.1",
+ "_name_or_path": "checkpoints/WAI-NSFW-illustrious-SDXL-V14.0/vae",
+ "act_fn": "silu",
+ "block_out_channels": [
+ 128,
+ 256,
+ 512,
+ 512
+ ],
+ "down_block_types": [
+ "DownEncoderBlock2D",
+ "DownEncoderBlock2D",
+ "DownEncoderBlock2D",
+ "DownEncoderBlock2D"
+ ],
+ "force_upcast": true,
+ "in_channels": 3,
+ "latent_channels": 4,
+ "latents_mean": null,
+ "latents_std": null,
+ "layers_per_block": 2,
+ "mid_block_add_attention": true,
+ "norm_num_groups": 32,
+ "out_channels": 3,
+ "sample_size": 1024,
+ "scaling_factor": 0.13025,
+ "shift_factor": null,
+ "up_block_types": [
+ "UpDecoderBlock2D",
+ "UpDecoderBlock2D",
+ "UpDecoderBlock2D",
+ "UpDecoderBlock2D"
+ ],
+ "use_post_quant_conv": true,
+ "use_quant_conv": true
+ }
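
The VAE is the standard SDXL autoencoder (`scaling_factor: 0.13025`, with `latents_mean`/`latents_std` left null), so the decoding code in the pipeline above falls through to the plain `latents / scaling_factor` branch rather than the mean/std denormalization. A minimal round-trip sketch of that convention, assuming the VAE is loaded from this repository (random data stands in for real images):

```python
import torch
from diffusers import AutoencoderKL

vae = AutoencoderKL.from_pretrained('ShinoharaHare/Waifu-Decensor-XL', subfolder='vae')
vae.eval()

x = torch.rand(1, 3, 256, 256) * 2 - 1  # images are expected in [-1, 1]
with torch.no_grad():
    latents = vae.encode(x).latent_dist.sample() * vae.config.scaling_factor
    recon = vae.decode(latents / vae.config.scaling_factor, return_dict=False)[0]

print(latents.shape)  # torch.Size([1, 4, 32, 32]): latent_channels=4, 8x spatial downsampling
print(recon.shape)    # torch.Size([1, 3, 256, 256])
```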
vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9974e9bd90b834b8cb74fa53a63a64643fbeb02438301ac1adde0c042f70fd24
+ size 167335590