---
license: apache-2.0
pipeline_tag: text-to-image
---
# Simple Diffusion XS
*XS Size, Excess Quality*
At AiArtLab, we strive to create a free, compact (1.7B parameters) and fast (~3 s/image) model that can be trained on consumer graphics cards.
- We use U-Net for its high efficiency.
- We have chosen the multilingual/multimodal encoder Mexma-SigLIP, which supports 80 languages (see the prompt-encoding sketch after this list).
- We use the AuraDiffusion 16ch-VAE architecture, which preserves details and anatomy.
- The model was trained (~1 month on 4xA5000) on approximately 1 million images with various resolutions and styles, including anime and realistic photos.
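Because the text encoder is multilingual, prompts can be written directly in any of the supported languages. Below is a minimal sketch of prompt encoding, reusing the `visheratin/mexma-siglip` loading and the `encode_texts` call from the full example further down; it assumes `encode_texts` also accepts batched inputs.

```python
import torch
from transformers import AutoModel, AutoTokenizer

device, dtype = "cuda", torch.float16
tokenizer = AutoTokenizer.from_pretrained("visheratin/mexma-siglip")
text_model = AutoModel.from_pretrained(
    "visheratin/mexma-siglip", torch_dtype=dtype, trust_remote_code=True
).to(device).eval()

# The same prompt in English, Russian and Japanese (any supported language works)
prompts = ["a cat in the snow", "кошка на снегу", "雪の中の猫"]
with torch.no_grad():
    inputs = tokenizer(
        prompts, return_tensors="pt", padding="max_length", max_length=512, truncation=True
    ).to(device)
    embeddings = text_model.encode_texts(inputs.input_ids, inputs.attention_mask)

print(embeddings.shape)  # one embedding per prompt, ready to condition the U-Net as in the example below
```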
### Model Limitations:
- Limited concept coverage due to the small dataset.
- The Image2Image functionality requires further training.
## Acknowledgments
- **[Stan](https://t.me/Stangle)** — Key investor. Thank you for believing in us when others called it madness.
- **Captainsaturnus**
- **Love. Death. Transformers.**
## Datasets
- **[CaptionEmporium](https://huggingface.co/CaptionEmporium)**
## Training budget
Around $1k spent so far; the overall research budget is about $10k.
## Donations
Please contact us if you can provide GPUs or funding for training.
DOGE: DEw2DR8C7BnF8GgcrfTzUjSnGkuMeJhg83
BTC: 3JHv9Hb8kEW8zMAccdgCdZGfrHeMhH1rpN
## Contacts
[recoilme](https://t.me/recoilme)
Training status (in progress): [wandb](https://wandb.ai/recoilme/micro)

## Example
```python
import torch
from diffusers import AutoencoderKL, DDPMScheduler, UNet2DConditionModel
from transformers import AutoModel, AutoTokenizer
from PIL import Image
from tqdm.auto import tqdm
import os
def encode_prompt(prompt, negative_prompt, device, dtype):
    if negative_prompt is None:
        negative_prompt = ""
    with torch.no_grad():
        positive_inputs = tokenizer(
            prompt,
            return_tensors="pt",
            padding="max_length",
            max_length=512,
            truncation=True,
        ).to(device)
        positive_embeddings = text_model.encode_texts(
            positive_inputs.input_ids, positive_inputs.attention_mask
        )
        if positive_embeddings.ndim == 2:
            positive_embeddings = positive_embeddings.unsqueeze(1)
        positive_embeddings = positive_embeddings.to(device, dtype=dtype)

        negative_inputs = tokenizer(
            negative_prompt,
            return_tensors="pt",
            padding="max_length",
            max_length=512,
            truncation=True,
        ).to(device)
        negative_embeddings = text_model.encode_texts(negative_inputs.input_ids, negative_inputs.attention_mask)
        if negative_embeddings.ndim == 2:
            negative_embeddings = negative_embeddings.unsqueeze(1)
        negative_embeddings = negative_embeddings.to(device, dtype=dtype)
    return torch.cat([negative_embeddings, positive_embeddings], dim=0)
def generate_latents(embeddings, height=576, width=576, num_inference_steps=50, guidance_scale=5.5):
    with torch.no_grad():
        device, dtype = embeddings.device, embeddings.dtype
        half = embeddings.shape[0] // 2
        latent_shape = (half, 16, height // 8, width // 8)  # 16 latent channels (AuraDiffusion 16ch-VAE)
        latents = torch.randn(latent_shape, device=device, dtype=dtype)
        embeddings = embeddings.repeat_interleave(half, dim=0)
        scheduler.set_timesteps(num_inference_steps)
        for t in tqdm(scheduler.timesteps, desc="Generation"):
            latent_model_input = torch.cat([latents] * 2)
            latent_model_input = scheduler.scale_model_input(latent_model_input, t)
            noise_pred = unet(latent_model_input, t, embeddings).sample
            # Classifier-free guidance: mix unconditional and text-conditioned predictions
            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
            latents = scheduler.step(noise_pred, t, latents).prev_sample
    return latents
def decode_latents(latents, vae, output_type="pil"):
    latents = (latents / vae.config.scaling_factor) + vae.config.shift_factor
    with torch.no_grad():
        images = vae.decode(latents).sample
    images = (images / 2 + 0.5).clamp(0, 1)
    images = images.cpu().permute(0, 2, 3, 1).float().numpy()
    if output_type == "pil":
        images = (images * 255).round().astype("uint8")
        images = [Image.fromarray(image) for image in images]
    return images
# Example usage:
if __name__ == "__main__":
    device = "cuda"
    dtype = torch.float16

    prompt = "girl"
    negative_prompt = "bad quality"

    tokenizer = AutoTokenizer.from_pretrained("visheratin/mexma-siglip")
    text_model = AutoModel.from_pretrained(
        "visheratin/mexma-siglip", torch_dtype=dtype, trust_remote_code=True
    ).to(device, dtype=dtype).eval()
    embeddings = encode_prompt(prompt, negative_prompt, device, dtype)

    pipeid = "AiArtLab/sdxs"
    variant = "fp16"
    unet = UNet2DConditionModel.from_pretrained(pipeid, subfolder="unet", variant=variant).to(device, dtype=dtype).eval()
    vae = AutoencoderKL.from_pretrained(pipeid, subfolder="vae", variant=variant).to(device, dtype=dtype).eval()
    scheduler = DDPMScheduler.from_pretrained(pipeid, subfolder="scheduler")

    height, width = 576, 384
    num_inference_steps = 40
    output_folder, project_name = "samples", "sdxs"

    latents = generate_latents(
        embeddings=embeddings,
        height=height,
        width=width,
        num_inference_steps=num_inference_steps,
    )

    images = decode_latents(latents, vae)

    os.makedirs(output_folder, exist_ok=True)
    for idx, image in enumerate(images):
        image.save(f"{output_folder}/{project_name}_{idx}.jpg")

    print("Images generated and saved to:", output_folder)
```