recoilme committed
Commit 94a2309 · 0 Parent(s): Fresh start

This view is limited to 50 files because it contains too many changes.

Files changed (50)
  1. .gitattributes +39 -0
  2. .gitignore +13 -0
  3. README.md +178 -0
  4. TRAIN.md +44 -0
  5. budget.jpg +3 -0
  6. cherrypick-vavae.ipynb +3 -0
  7. dataset_fromfolder.py +386 -0
  8. model_index.json +3 -0
  9. pipeline_sdxs.py +295 -0
  10. promo.png +3 -0
  11. requirements.txt +11 -0
  12. result_grid.jpg +3 -0
  13. samples/unet_192x384_0.jpg +3 -0
  14. samples/unet_256x384_0.jpg +3 -0
  15. samples/unet_320x384_0.jpg +3 -0
  16. samples/unet_384x192_0.jpg +3 -0
  17. samples/unet_384x256_0.jpg +3 -0
  18. samples/unet_384x320_0.jpg +3 -0
  19. samples/unet_384x384_0.jpg +3 -0
  20. scheduler/scheduler_config.json +3 -0
  21. src/captions_moondream2.ipynb +3 -0
  22. src/captions_moondream2_wd3.ipynb +3 -0
  23. src/captions_qwen2-vl-7b.py +261 -0
  24. src/captions_wd.ipynb +3 -0
  25. src/cherrypick.ipynb +3 -0
  26. src/cuda.ipynb +3 -0
  27. src/dataset_clean.ipynb +3 -0
  28. src/dataset_combine.py +68 -0
  29. src/dataset_fromzip.ipynb +3 -0
  30. src/dataset_imagenet.ipynb +3 -0
  31. src/dataset_laion_coco.ipynb +3 -0
  32. src/dataset_mjnj.ipynb +3 -0
  33. src/dataset_mnist-te.ipynb +3 -0
  34. src/dataset_mnist.ipynb +3 -0
  35. src/dataset_sample.ipynb +3 -0
  36. src/inference.ipynb +3 -0
  37. src/sdxs_create-vavae.ipynb +3 -0
  38. src/sdxs_create.ipynb +3 -0
  39. src/sdxs_create_simple.ipynb +3 -0
  40. src/sdxs_create_unet.ipynb +3 -0
  41. src/sdxs_sdxxs_transfer.ipynb +3 -0
  42. test.ipynb +3 -0
  43. text_encoder/config.json +3 -0
  44. text_encoder/model.fp16.safetensors +3 -0
  45. text_projector/config.json +3 -0
  46. text_projector/model.safetensors +3 -0
  47. tokenizer/special_tokens_map.json +3 -0
  48. tokenizer/tokenizer.json +3 -0
  49. tokenizer/tokenizer_config.json +3 -0
  50. train-Copy1.py +789 -0
.gitattributes ADDED
@@ -0,0 +1,39 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
*.png filter=lfs diff=lfs merge=lfs -text
*.ipynb filter=lfs diff=lfs merge=lfs -text
*.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,13 @@
# Jupyter Notebook
__pycache__/
*.pyc
.ipynb_checkpoints/
*.ipynb_checkpoints/*
.ipynb_checkpoints/*
src/samples
# cache
cache
datasets
test
wandb
nohup.out
README.md ADDED
@@ -0,0 +1,178 @@
---
license: apache-2.0
pipeline_tag: text-to-image
---

# Simple Diffusion XS

*XS Size, Excess Quality*

Train status, in progress: [wandb](https://wandb.ai/recoilme/unet)

![result](result_grid.jpg)

## Example

```python
import torch
from diffusers import AutoencoderKL, DDPMScheduler, UNet2DConditionModel
from transformers import AutoModel, AutoTokenizer
from PIL import Image
from tqdm.auto import tqdm
import os

def encode_prompt(prompt, negative_prompt, device, dtype):
    if negative_prompt is None:
        negative_prompt = ""

    with torch.no_grad():
        positive_inputs = tokenizer(
            prompt,
            return_tensors="pt",
            padding="max_length",
            max_length=512,
            truncation=True,
        ).to(device)
        positive_embeddings = text_model.encode_texts(
            positive_inputs.input_ids, positive_inputs.attention_mask
        )
        if positive_embeddings.ndim == 2:
            positive_embeddings = positive_embeddings.unsqueeze(1)
        positive_embeddings = positive_embeddings.to(device, dtype=dtype)

        negative_inputs = tokenizer(
            negative_prompt,
            return_tensors="pt",
            padding="max_length",
            max_length=512,
            truncation=True,
        ).to(device)
        negative_embeddings = text_model.encode_texts(negative_inputs.input_ids, negative_inputs.attention_mask)
        if negative_embeddings.ndim == 2:
            negative_embeddings = negative_embeddings.unsqueeze(1)
        negative_embeddings = negative_embeddings.to(device, dtype=dtype)
    return torch.cat([negative_embeddings, positive_embeddings], dim=0)

def generate_latents(embeddings, height=576, width=576, num_inference_steps=50, guidance_scale=5.5):
    with torch.no_grad():
        device, dtype = embeddings.device, embeddings.dtype
        half = embeddings.shape[0] // 2
        latent_shape = (half, 16, height // 8, width // 8)
        latents = torch.randn(latent_shape, device=device, dtype=dtype)
        embeddings = embeddings.repeat_interleave(half, dim=0)

        scheduler.set_timesteps(num_inference_steps)

        for t in tqdm(scheduler.timesteps, desc="Generating"):
            latent_model_input = torch.cat([latents] * 2)
            latent_model_input = scheduler.scale_model_input(latent_model_input, t)
            noise_pred = unet(latent_model_input, t, embeddings).sample
            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
            latents = scheduler.step(noise_pred, t, latents).prev_sample
    return latents


def decode_latents(latents, vae, output_type="pil"):
    latents = (latents / vae.config.scaling_factor) + vae.config.shift_factor
    with torch.no_grad():
        images = vae.decode(latents).sample
    images = (images / 2 + 0.5).clamp(0, 1)
    images = images.cpu().permute(0, 2, 3, 1).float().numpy()
    if output_type == "pil":
        images = (images * 255).round().astype("uint8")
        images = [Image.fromarray(image) for image in images]
    return images

# Example usage:
if __name__ == "__main__":
    device = "cuda"
    dtype = torch.float16

    prompt = "кот"  # "cat" in Russian; the encoder is multilingual
    negative_prompt = "bad quality"
    tokenizer = AutoTokenizer.from_pretrained("visheratin/mexma-siglip")
    text_model = AutoModel.from_pretrained(
        "visheratin/mexma-siglip", torch_dtype=dtype, trust_remote_code=True
    ).to(device, dtype=dtype).eval()

    embeddings = encode_prompt(prompt, negative_prompt, device, dtype)

    pipeid = "AiArtLab/sdxs"
    variant = "fp16"

    unet = UNet2DConditionModel.from_pretrained(pipeid, subfolder="unet", variant=variant).to(device, dtype=dtype).eval()
    vae = AutoencoderKL.from_pretrained(pipeid, subfolder="vae", variant=variant).to(device, dtype=dtype).eval()
    scheduler = DDPMScheduler.from_pretrained(pipeid, subfolder="scheduler")

    height, width = 576, 576
    num_inference_steps = 40
    output_folder, project_name = "samples", "sdxs"
    latents = generate_latents(
        embeddings=embeddings,
        height=height,
        width=width,
        num_inference_steps=num_inference_steps
    )

    images = decode_latents(latents, vae)

    os.makedirs(output_folder, exist_ok=True)
    for idx, image in enumerate(images):
        image.save(f"{output_folder}/{project_name}_{idx}.jpg")

    print("Images generated and saved to:", output_folder)
```

## Introduction
*Fast, Lightweight & Multilingual Diffusion for Everyone*

We are **AiArtLab**, a small team of enthusiasts with a limited budget. Our goal is to create a compact and fast model that can be trained on consumer graphics cards (a full training cycle, not LoRA). We chose U-Net for its ability to handle small datasets efficiently and to train quickly even on a 16GB GPU (e.g., an RTX 4080). Our budget was limited to a few thousand dollars, significantly less than competitors like SDXL (tens of millions), so we decided to create a small but efficient model, similar to SD1.5 but built for 2025.

## Encoder Architecture (Text and Images)
We experimented with various encoders and concluded that large models like LLaMA or T5 XXL are unnecessary for high-quality generation. However, we needed an encoder that understands the context of the query, focusing on "prompt understanding" rather than "prompt following." We chose the multilingual encoder Mexma-SigLIP, which supports 80 languages and processes sentences rather than individual tokens. Mexma accepts up to 512 tokens, which creates a large matrix that slows down training, so we used a pooling step to collapse the 512x1152 matrix into a plain 1x1152 vector; specifically, we passed it through a linear text projector to make it compatible with SigLIP image embeddings. This allowed us to align text embeddings with images, potentially leading to a unified multimodal model. This functionality makes it possible to mix image embeddings with textual descriptions in queries. Moreover, the model can be trained without text descriptions, using only images. This should simplify training on videos, where annotation is challenging, and enable more consistent and seamless video generation by feeding in embeddings of previous frames with decay. In the future, we aim to extend the model to 3D/video generation.
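
For illustration, a minimal sketch of this conditioning path (pool MEXMA's 512x1152 output to a single vector, then project it into the SigLIP space), following the repo's own dataset and pipeline code. The 1152-to-1152 projector shape and the CLS-token pooling are assumptions; the real weights live in `text_projector/`.

```python
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer

device, dtype = "cuda", torch.float16
tokenizer = AutoTokenizer.from_pretrained("visheratin/mexma-siglip")
encoder = AutoModel.from_pretrained(
    "visheratin/mexma-siglip", torch_dtype=dtype, trust_remote_code=True
).to(device).eval()

# Linear projector into the SigLIP embedding space (bias-free, as in text_projector/config.json).
text_projector = nn.Linear(1152, 1152, bias=False).to(device, dtype)

with torch.no_grad():
    tokens = tokenizer(["a cat on a windowsill"], return_tensors="pt",
                       padding="max_length", max_length=512, truncation=True).to(device)
    hidden = encoder.text_model(
        input_ids=tokens.input_ids, attention_mask=tokens.attention_mask
    ).last_hidden_state                         # [batch, 512, 1152]: the "large matrix"
    pooled = hidden[:, 0]                       # pool to a single 1x1152 vector (CLS token, an assumption)
    cond = text_projector(pooled).unsqueeze(1)  # [batch, 1, 1152], fed to the U-Net as cross-attention context
```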

## U-Net Architecture
We chose a smooth channel pyramid: [384, 576, 768, 960] with two layers per block and [4, 6, 8, 10] transformer blocks, using 1152/48 = 24 attention heads. This architecture gives the highest training speed at a model size of around 2 billion parameters (and fits comfortably on an RTX 4080). We believe that thanks to its greater 'depth,' the quality will be on par with SDXL despite the smaller 'size.' The model can be expanded to 4 billion parameters by adding a 1152-channel level, achieving perfect symmetry with the embedding size (which we value for its elegance) and probably 'Flux/MJ level' quality.
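
The configuration below is a rough sketch of how such a U-Net could be declared with diffusers. The block types and `sample_size` are assumptions reconstructed from this description, not the shipped config; load the real one with `UNet2DConditionModel.from_pretrained("AiArtLab/sdxs", subfolder="unet")`.

```python
from diffusers import UNet2DConditionModel

unet = UNet2DConditionModel(
    sample_size=72,                           # 576 px / 8 (VAE downscale factor), assumed
    in_channels=16, out_channels=16,          # 16-channel latents (see VAE section)
    block_out_channels=(384, 576, 768, 960),  # the "smooth channel pyramid"
    layers_per_block=2,
    transformer_layers_per_block=(4, 6, 8, 10),
    down_block_types=("CrossAttnDownBlock2D",) * 4,
    up_block_types=("CrossAttnUpBlock2D",) * 4,
    cross_attention_dim=1152,                 # matches the 1x1152 text/image embedding
    attention_head_dim=48,                    # 1152 / 48 = 24 heads at the widest level
)
print(sum(p.numel() for p in unet.parameters()) / 1e9, "B params")
```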

## VAE Architecture
We chose an unconventional 8x, 16-channel AuraDiffusion VAE, which preserves details, text, and anatomy without the 'haze' characteristic of SD3/Flux. We used a fast version with FFN convolution and observed minor texture damage on fine patterns, which may lower its rating on benchmarks. Upscalers like ESRGAN can address these artifacts. Overall, we believe this VAE is highly underrated.
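
The latent convention used throughout this repo (encode: subtract `shift_factor`, then multiply by `scaling_factor`; decode: invert) looks roughly like the sketch below, assuming the published 16-channel VAE:

```python
import torch
from diffusers import AutoencoderKL

vae = AutoencoderKL.from_pretrained("AiArtLab/sdxs", subfolder="vae", variant="fp16",
                                    torch_dtype=torch.float16).to("cuda").eval()

@torch.no_grad()
def to_latents(pixels):   # pixels: [B, 3, H, W] in [-1, 1]
    z = vae.encode(pixels).latent_dist.mode()
    return (z - vae.config.shift_factor) * vae.config.scaling_factor   # [B, 16, H/8, W/8]

@torch.no_grad()
def to_pixels(latents):
    z = latents / vae.config.scaling_factor + vae.config.shift_factor
    return vae.decode(z).sample.clamp(-1, 1)
```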

## Training Process
### Optimizer
We tested several optimizers (AdamW, Lion, Optimi-AdamW, Adafactor, and AdamW-8bit) and chose AdamW-8bit. Optimi-AdamW demonstrated the smoothest gradient-decay curve, although AdamW-8bit behaves more chaotically. However, its smaller optimizer-state footprint allows for larger batch sizes, maximizing training speed on low-cost GPUs (we used 4xA6000 and 5xL40S for training).
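
For reference, a typical AdamW-8bit setup with bitsandbytes looks like the sketch below; the exact hyperparameters we trained with are not listed here, so treat these values as placeholders.

```python
import bitsandbytes as bnb

optimizer = bnb.optim.AdamW8bit(
    unet.parameters(),   # the U-Net from the previous section
    lr=1e-4,             # starting LR, decayed towards 1e-6 (see below)
    betas=(0.9, 0.999),
    weight_decay=0.01,
)
```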

### Learning Rate
We found that manipulating the decay/warm-up curve has an effect, but not a significant one. The optimal learning rate is often overestimated. Our experiments showed that Adam tolerates a wide range of learning rates. We started at 1e-4 and gradually decreased it to 1e-6 during training. In other words, choosing the correct model architecture is far more critical than tweaking hyperparameters.
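
One simple way to express the 1e-4 to 1e-6 decay is a linear LambdaLR schedule, sketched below; the schedule actually used in training is not specified here.

```python
import torch

total_steps = 100_000  # placeholder
scheduler_lr = torch.optim.lr_scheduler.LambdaLR(
    optimizer,
    # linear decay of the 1e-4 base LR, floored at a 0.01 multiplier (i.e., 1e-6)
    lr_lambda=lambda step: max(1e-6 / 1e-4, 1.0 - step / total_steps),
)
```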

### Dataset
We trained the model on approximately 1 million images: 60 epochs on ImageNet at 256 resolution (wasted time because of low-quality annotations) and 8 epochs on CaptionEmporium/midjourney-niji-1m-llavanext, plus realistic photos and anime/art at 576 resolution. For annotation we used human prompts, the prompts provided by CaptionEmporium, SmilingWolf's WD-Tagger, and Moondream2, varying prompt length and composition so the model learns different prompting styles. The dataset is extremely small, so the model misses many entities and struggles with unseen concepts such as 'a goose on a bicycle.' The dataset also included many waifu-style images, as we were more interested in how well the model learns human anatomy than in its 'astronaut on horseback' skills. While most descriptions were in English, our tests indicate the model is multilingual.
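
As a hypothetical illustration of the caption mixing described above: each image can carry several candidate captions (human, CaptionEmporium/LLaVA, WD-Tagger, Moondream2) and one is sampled per pass to vary prompt style and length. The field names below are made up.

```python
import random

def pick_caption(sample: dict) -> str:
    candidates = [c for c in (sample.get("human"), sample.get("llava"),
                              sample.get("wd_tags"), sample.get("moondream")) if c]
    return random.choice(candidates) if candidates else ""
```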

## Limitations
- Limited concept coverage due to the extremely small dataset.
- The Image2Image functionality needs further training (we reduced the SigLIP share to 5% to focus on text-to-image training).

## Acknowledgments
- **[Stan](https://t.me/Stangle)** — Key investor. Primary financial support; thank you for believing in us when others called it madness.
- **Captainsaturnus** — Material support.
- **Lovescape** & **Whargarbl** — Moral support.
- **[CaptionEmporium](https://huggingface.co/CaptionEmporium)** — Datasets.

> "We believe the future lies in efficient, compact models. We are grateful for the donations and hope for your continued support."

## Training budget

![budget](budget.jpg)

## Donations

Please contact us if you can provide GPUs or funding for training.

DOGE: DEw2DR8C7BnF8GgcrfTzUjSnGkuMeJhg83
BTC: 3JHv9Hb8kEW8zMAccdgCdZGfrHeMhH1rpN

## Contacts

[recoilme](https://t.me/recoilme)
TRAIN.md ADDED
@@ -0,0 +1,44 @@
---
license: apache-2.0
---

Quick setup guide.
Update the system and install git-lfs:

```
apt update
apt install git-lfs
git config --global credential.helper store
```
Upgrade pip and install the required packages:

```
python -m pip install --upgrade pip
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 -U
pip install flash-attn --no-build-isolation # optional
```
Clone the repository:

```
git clone https://huggingface.co/AiArtLab/sdxs
cd sdxs/
pip install -r requirements.txt
```
Prepare the dataset:

```
mkdir datasets
cd datasets
huggingface-cli download AiArtLab/384 --local-dir 384 --repo-type dataset
```
Log in to the services:

```
huggingface-cli login
wandb login
```
Start training!

```
nohup accelerate launch train.py &
```
budget.jpg ADDED

Git LFS Details

  • SHA256: 9e635ae6cd283805338a7dfa6e1bc90089d612d7f30463668034f3b256fa22a5
  • Pointer size: 131 Bytes
  • Size of remote file: 376 kB
cherrypick-vavae.ipynb ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d76d1eb4ba99e75f234a07e86ef0503ebe6ddec695c61308b6d20a5a6eca9f0f
size 16287
dataset_fromfolder.py ADDED
@@ -0,0 +1,386 @@
# pip install flash-attn --no-build-isolation
from datasets import Dataset, load_from_disk, concatenate_datasets
from diffusers import AutoencoderKL
from torchvision.transforms import Resize, ToTensor, Normalize, Compose, InterpolationMode, Lambda
from transformers import AutoModel, AutoImageProcessor, AutoTokenizer
import torch
import os
import gc
import numpy as np
from PIL import Image
from tqdm import tqdm
import random
import json
import shutil
import time
from datetime import timedelta

# ---------------- 1️⃣ Settings ----------------
dtype = torch.float16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 10
min_size = 192
max_size = 384
step = 64
img_share = 0.05
empty_share = 0.05
limit = 0
textemb_full = False
# Main processing procedure
folder_path = "/workspace/d3"
save_path = "/workspace/sdxs/datasets/ds3_384"
os.makedirs(save_path, exist_ok=True)

# Helper to free CUDA memory
def clear_cuda_memory():
    if torch.cuda.is_available():
        used_gb = torch.cuda.max_memory_allocated() / 1024**3
        print(f"used_gb: {used_gb:.2f} GB")
        torch.cuda.empty_cache()
    gc.collect()

# ---------------- 2️⃣ Load models ----------------
def load_models():
    print("Loading models...")
    vae = AutoencoderKL.from_pretrained("/workspace/sdxs/vae", variant="fp16", torch_dtype=dtype).to(device).eval()
    model = AutoModel.from_pretrained("visheratin/mexma-siglip", torch_dtype=dtype, trust_remote_code=True, optimized=True).to(device).eval()
    processor = AutoImageProcessor.from_pretrained("visheratin/mexma-siglip", use_fast=True)
    tokenizer = AutoTokenizer.from_pretrained("visheratin/mexma-siglip")
    return vae, model, processor, tokenizer

vae, model, processor, tokenizer = load_models()


# ---------------- 3️⃣ Transforms ----------------
def get_image_transform(min_size=256, max_size=512, step=64):
    def transform(img, dry_run=False):
        # Remember the original image size
        original_width, original_height = img.size

        # 0. Resize so that the longest side equals max_size
        if original_width >= original_height:
            new_width = max_size
            new_height = int(max_size * original_height / original_width)
        else:
            new_height = max_size
            new_width = int(max_size * original_width / original_height)

        if new_height < min_size or new_width < min_size:
            # 1. Resize so that the shortest side equals min_size
            if original_width <= original_height:
                new_width = min_size
                new_height = int(min_size * original_height / original_width)
            else:
                new_height = min_size
                new_width = int(min_size * original_width / original_height)

        # 2. If a side exceeds max_size, prepare to crop (snap to the step grid)
        crop_width = min(max_size, (new_width // step) * step)
        crop_height = min(max_size, (new_height // step) * step)

        # Make sure the crop is not smaller than min_size
        crop_width = max(min_size, crop_width)
        crop_height = max(min_size, crop_height)

        # If only a size pre-computation was requested
        if dry_run:
            return crop_width, crop_height

        # Convert to RGB and resize
        img_resized = img.convert("RGB").resize((new_width, new_height), Image.LANCZOS)

        # Crop coordinates (account for watermarks: offset one third from the top)
        top = (new_height - crop_height) // 3
        left = 0

        # Crop the image
        img_cropped = img_resized.crop((left, top, left + crop_width, top + crop_height))

        # Final size after all transforms
        final_width, final_height = img_cropped.size

        # To tensor
        img_tensor = ToTensor()(img_cropped)
        img_tensor = Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])(img_tensor)
        return img_tensor, img_cropped, final_width, final_height

    return transform

# ---------------- 4️⃣ Encoding helpers ----------------
def encode_images_batch(images, processor, model):
    pixel_values = torch.stack([processor(images=img, return_tensors="pt")["pixel_values"].squeeze(0) for img in images]).to(device, dtype)

    with torch.inference_mode():
        image_embeddings = model.vision_model(pixel_values).pooler_output

    return image_embeddings.unsqueeze(1).cpu().numpy()

def encode_texts_batch(texts, tokenizer, model):
    with torch.inference_mode():
        text_tokenized = tokenizer(texts, return_tensors="pt", padding="max_length",
                                   max_length=512,
                                   truncation=True).to(device)
        text_embeddings = model.encode_texts(text_tokenized.input_ids, text_tokenized.attention_mask)
    return text_embeddings.unsqueeze(1).cpu().numpy()

def encode_texts_batch_full(texts, tokenizer, model):
    with torch.inference_mode():
        text_tokenized = tokenizer(texts, return_tensors="pt", padding="max_length", max_length=512, truncation=True).to(device)
        features = model.text_model(
            input_ids=text_tokenized.input_ids, attention_mask=text_tokenized.attention_mask
        ).last_hidden_state
        features_proj = model.text_projector(features)
    return features_proj.cpu().numpy()

def clean_label(label):
    label = label.replace("Image 1", "").replace("Image 2", "").replace("Image 3", "").replace("Image 4", "")
    return label

def process_labels_for_guidance(original_labels, prob_to_make_empty=0.01):
    """
    Prepares a list of labels for classifier-free guidance.

    With probability prob_to_make_empty:
    - The label in the first list is replaced with an empty string.
    - The label in the second list gets a "zero:" prefix.

    Otherwise the labels in both lists stay unchanged.
    """
    labels_for_model = []
    labels_for_logging = []

    for label in original_labels:
        if random.random() < prob_to_make_empty:
            labels_for_model.append("")  # empty string for the model
            labels_for_logging.append(f"zero: {label}")  # prefixed copy for logging
        else:
            labels_for_model.append(label)  # keep the original label for the model
            labels_for_logging.append(label)  # keep the original label for logging

    return labels_for_model, labels_for_logging

def encode_to_latents(images, texts):
    transform = get_image_transform(min_size, max_size, step)

    try:
        # Process the images (all of the same size)
        transformed_tensors = []
        pil_images = []
        widths, heights = [], []

        # Apply the transform to every image
        for img in images:
            try:
                t_img, pil_img, w, h = transform(img)
                transformed_tensors.append(t_img)
                pil_images.append(pil_img)
                widths.append(w)
                heights.append(h)
            except Exception as e:
                print(f"Transform error: {e}")
                continue

        if not transformed_tensors:
            return None

        # Build the batch
        batch_tensor = torch.stack(transformed_tensors).to(device, dtype)

        # Encode the batch
        with torch.no_grad():
            posteriors = vae.encode(batch_tensor).latent_dist.mode()
            latents = (posteriors - vae.config.shift_factor) * vae.config.scaling_factor

        latents_np = latents.cpu().numpy()

        # Check shape consistency
        base_shape = latents_np.shape[1:]  # shape without the batch dimension
        valid_indices = []
        valid_latents = []

        for idx, latent in enumerate(latents_np):
            if latent.shape != base_shape:
                print(f"❌ Shape mismatch at index {idx}: {latent.shape} vs {base_shape}")
                continue
            valid_indices.append(idx)
            valid_latents.append(latent)

        # Filter the data
        valid_pil = [pil_images[i] for i in valid_indices]
        valid_widths = [widths[i] for i in valid_indices]
        valid_heights = [heights[i] for i in valid_indices]

        # Process the texts
        text_labels = [clean_label(texts[i]) for i in valid_indices]
        if random.random() < img_share:
            embeddings = encode_images_batch(valid_pil, processor, model)
            text_labels = [f"img: {text_labels[i]}" for i in valid_indices]
        else:
            model_prompts, text_labels = process_labels_for_guidance(text_labels, empty_share)
            if textemb_full:
                embeddings = encode_texts_batch_full(model_prompts, tokenizer, model)
            else:
                embeddings = encode_texts_batch(model_prompts, tokenizer, model)

        return {
            "vae": np.array(valid_latents),
            "embeddings": embeddings,
            "text": text_labels,
            "width": valid_widths,
            "height": valid_heights
        }

    except Exception as e:
        print(f"Fatal error in encode_to_latents: {e}")
        raise

# ---------------- 5️⃣ Process a folder of images and texts ----------------
def process_folder(folder_path, limit=None):
    """
    Recursively walks the given directory and all subdirectories,
    collecting paths to images and their matching text files.
    """
    image_paths = []
    text_paths = []
    width = []
    height = []
    transform = get_image_transform(min_size, max_size, step)

    # Use os.walk for a recursive traversal
    for root, dirs, files in os.walk(folder_path):
        for filename in files:
            # Check whether the file is an image
            if filename.lower().endswith((".jpg", ".jpeg", ".png")):
                image_path = os.path.join(root, filename)
                try:
                    img = Image.open(image_path)
                except Exception as e:
                    print(f"Error opening {image_path}: {e}")
                    os.remove(image_path)
                    text_path = os.path.splitext(image_path)[0] + ".txt"
                    if os.path.exists(text_path):
                        os.remove(text_path)
                    continue
                # Run the transform only to get the target size
                w, h = transform(img, dry_run=True)
                # Path to the matching text file
                text_path = os.path.splitext(image_path)[0] + ".txt"

                # Add the paths if the text file exists
                if os.path.exists(text_path) and min(w, h) > 0:
                    image_paths.append(image_path)
                    text_paths.append(text_path)
                    width.append(w)
                    height.append(h)

                # Respect the limit on the number of images
                if limit and limit > 0 and len(image_paths) >= limit:
                    print(f"Reached the limit of {limit} images")
                    return image_paths, text_paths, width, height

    print(f"Found {len(image_paths)} images with text descriptions")
    return image_paths, text_paths, width, height

def process_in_chunks(image_paths, text_paths, width, height, chunk_size=50000, batch_size=1):
    total_files = len(image_paths)
    start_time = time.time()
    chunks = range(0, total_files, chunk_size)

    for chunk_idx, start in enumerate(chunks, 1):
        end = min(start + chunk_size, total_files)
        chunk_image_paths = image_paths[start:end]
        chunk_text_paths = text_paths[start:end]
        chunk_widths = width[start:end] if isinstance(width, list) else [width] * len(chunk_image_paths)
        chunk_heights = height[start:end] if isinstance(height, list) else [height] * len(chunk_image_paths)

        # Read the texts
        chunk_texts = []
        for text_path in chunk_text_paths:
            try:
                with open(text_path, 'r', encoding='utf-8') as f:
                    text = f.read().strip()
                chunk_texts.append(text)
            except Exception as e:
                print(f"Error reading {text_path}: {e}")
                chunk_texts.append("")

        # Group the images by size
        size_groups = {}
        for i in range(len(chunk_image_paths)):
            size_key = (chunk_widths[i], chunk_heights[i])
            if size_key not in size_groups:
                size_groups[size_key] = {"image_paths": [], "texts": []}
            size_groups[size_key]["image_paths"].append(chunk_image_paths[i])
            size_groups[size_key]["texts"].append(chunk_texts[i])

        # Process each size group separately
        for size_key, group_data in size_groups.items():
            print(f"Processing group of size {size_key[0]}x{size_key[1]} - {len(group_data['image_paths'])} images")

            group_dataset = Dataset.from_dict({
                "image_path": group_data["image_paths"],
                "text": group_data["texts"]
            })

            # The requested batch_size can be used here, since all images share one size
            processed_group = group_dataset.map(
                lambda examples: encode_to_latents(
                    [Image.open(path) for path in examples["image_path"]],
                    examples["text"]
                ),
                batched=True,
                batch_size=batch_size,
                remove_columns=["image_path"],
                desc=f"Processing size group {size_key[0]}x{size_key[1]}"
            )

            # Save the group results
            group_save_path = f"{save_path}_temp/chunk_{chunk_idx}_size_{size_key[0]}x{size_key[1]}"
            processed_group.save_to_disk(group_save_path)
            clear_cuda_memory()
            elapsed = time.time() - start_time
            processed = (chunk_idx - 1) * chunk_size + sum([len(sg["image_paths"]) for sg in list(size_groups.values())[:list(size_groups.values()).index(group_data) + 1]])
            if processed > 0:
                remaining = (elapsed / processed) * (total_files - processed)
                elapsed_str = str(timedelta(seconds=int(elapsed)))
                remaining_str = str(timedelta(seconds=int(remaining)))
                print(f"ETA: elapsed {elapsed_str}, remaining {remaining_str}, progress {processed}/{total_files} ({processed/total_files:.1%})")

# ---------------- 7️⃣ Combine the chunks ----------------
def combine_chunks(temp_path, final_path):
    """Merge the processed chunks into the final dataset"""
    chunks = sorted([
        os.path.join(temp_path, d)
        for d in os.listdir(temp_path)
        if d.startswith("chunk_")
    ])

    datasets = [load_from_disk(chunk) for chunk in chunks]
    combined = concatenate_datasets(datasets)
    combined.save_to_disk(final_path)

    print(f"✅ Dataset successfully saved to: {final_path}")



# Create a temporary folder for the chunks
temp_path = f"{save_path}_temp"
os.makedirs(temp_path, exist_ok=True)

# Collect the file lists
image_paths, text_paths, width, height = process_folder(folder_path, limit)
print(f"Found {len(image_paths)} images in total")

# Chunked processing
process_in_chunks(image_paths, text_paths, width, height, chunk_size=100000, batch_size=batch_size)

# Combine the chunks into the final dataset
combine_chunks(temp_path, save_path)

# Remove the temporary folder
try:
    shutil.rmtree(temp_path)
    print(f"✅ Temporary folder {temp_path} removed")
except Exception as e:
    print(f"⚠️ Error removing the temporary folder: {e}")
model_index.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:85e884f29f7a6282a634d90350933aa68021326035a0980072667757c3bc9112
size 476
pipeline_sdxs.py ADDED
@@ -0,0 +1,295 @@
from diffusers import DiffusionPipeline
import torch
import torch.nn as nn
import os
from diffusers.utils import BaseOutput
from dataclasses import dataclass
from typing import List, Union, Optional
from PIL import Image
import numpy as np
import json
from safetensors.torch import load_file
from tqdm import tqdm

@dataclass
class SdxsPipelineOutput(BaseOutput):
    images: Union[List[Image.Image], np.ndarray]

class SdxsPipeline(DiffusionPipeline):
    def __init__(self, vae, text_encoder, tokenizer, unet, scheduler, text_projector=None):
        super().__init__()

        # Register components
        self.register_modules(
            vae=vae, text_encoder=text_encoder, tokenizer=tokenizer,
            unet=unet, scheduler=scheduler
        )

        # Get the model path, which is either provided directly or from internal dict
        model_path = None
        if hasattr(self, '_internal_dict') and self._internal_dict.get('_name_or_path'):
            model_path = self._internal_dict.get('_name_or_path')

        # Get device and dtype from existing components
        device = "cuda"
        dtype = torch.float16

        # Always load text_projector, regardless of whether one was provided
        projector_path = None

        # Try to find projector path
        if model_path and os.path.exists(f"{model_path}/text_projector"):
            projector_path = f"{model_path}/text_projector"
        elif os.path.exists("./text_projector"):
            projector_path = "./text_projector"

        if projector_path:
            # Create and load projector
            try:
                with open(f"{projector_path}/config.json", "r") as f:
                    projector_config = json.load(f)

                # Create Linear layer with bias=False
                self.text_projector = nn.Linear(
                    in_features=projector_config["in_features"],
                    out_features=projector_config["out_features"],
                    bias=False
                )

                # Load the state dict using safetensors
                self.text_projector.load_state_dict(load_file(f"{projector_path}/model.safetensors"))
                self.text_projector.to(device=device, dtype=dtype)
                print(f"Successfully loaded text_projector from {projector_path}", device, dtype)
            except Exception as e:
                print(f"Error loading text_projector: {e}")

        self.vae_scale_factor = 8



    def encode_prompt(self, prompt=None, negative_prompt=None, device=None, dtype=None):
        """Encode text prompts into embeddings.

        Returns:
        - text_embeddings: embedding tensor [batch_size, 1, dim], or [2*batch_size, 1, dim] with guidance
        """
        if prompt is None and negative_prompt is None:
            raise ValueError("At least one of prompt or negative_prompt is required")

        # Resolve device and dtype
        device = device or self.device
        dtype = dtype or next(self.unet.parameters()).dtype

        with torch.no_grad():
            # Handle the positive prompt
            if prompt is not None:
                if isinstance(prompt, str):
                    prompt = [prompt]

                text_inputs = self.tokenizer(
                    prompt, return_tensors="pt", padding="max_length",
                    max_length=512, truncation=True
                ).to(device)

                # Get the embeddings
                outputs = self.text_encoder(text_inputs.input_ids, text_inputs.attention_mask)
                last_hidden_state = outputs.last_hidden_state.to(device, dtype=dtype)
                pos_embeddings = self.text_projector(last_hidden_state[:, 0])

                # Add a sequence dimension for batch processing
                if pos_embeddings.ndim == 2:
                    pos_embeddings = pos_embeddings.unsqueeze(1)
            else:
                # Create empty embeddings when there is no positive prompt
                # (useful for some unconditional generation scenarios)
                batch_size = len(negative_prompt) if isinstance(negative_prompt, list) else 1
                pos_embeddings = torch.zeros(
                    batch_size, 1, self.unet.config.cross_attention_dim,
                    device=device, dtype=dtype
                )

            # Handle the negative prompt
            if negative_prompt is not None:
                if isinstance(negative_prompt, str):
                    negative_prompt = [negative_prompt]

                # Make sure the negative and positive prompt batch sizes match
                if prompt is not None and len(negative_prompt) != len(prompt):
                    neg_batch_size = len(prompt)
                    if len(negative_prompt) == 1:
                        negative_prompt = negative_prompt * neg_batch_size
                    else:
                        negative_prompt = negative_prompt[:neg_batch_size]

                neg_inputs = self.tokenizer(
                    negative_prompt, return_tensors="pt", padding="max_length",
                    max_length=512, truncation=True
                ).to(device)

                neg_outputs = self.text_encoder(neg_inputs.input_ids, neg_inputs.attention_mask)
                neg_last_hidden_state = neg_outputs.last_hidden_state.to(device, dtype=dtype)
                neg_embeddings = self.text_projector(neg_last_hidden_state[:, 0])

                if neg_embeddings.ndim == 2:
                    neg_embeddings = neg_embeddings.unsqueeze(1)

                # Concatenate for classifier-free guidance
                text_embeddings = torch.cat([neg_embeddings, pos_embeddings], dim=0)
            else:
                # Without a negative prompt, use zero embeddings
                batch_size = pos_embeddings.shape[0]
                neg_embeddings = torch.zeros_like(pos_embeddings)
                text_embeddings = torch.cat([neg_embeddings, pos_embeddings], dim=0)

        return text_embeddings.to(device=device, dtype=dtype)

    @torch.no_grad()
    def generate_latents(
        self,
        text_embeddings,
        height: int = 576,
        width: int = 576,
        num_inference_steps: int = 40,
        guidance_scale: float = 5.0,
        latent_channels: int = 16,
        batch_size: int = 1,
        generator=None,
    ):
        """Generate latents from the prompt embeddings."""
        device = self.device
        dtype = next(self.unet.parameters()).dtype

        # Check the embedding batch size
        do_classifier_free_guidance = guidance_scale > 0
        embedding_dim = text_embeddings.shape[0] // 2 if do_classifier_free_guidance else text_embeddings.shape[0]

        if batch_size > embedding_dim:
            # Repeat the embeddings up to the requested batch size
            if do_classifier_free_guidance:
                neg_embeds, pos_embeds = text_embeddings.chunk(2)
                neg_embeds = neg_embeds.repeat(batch_size // embedding_dim, 1, 1)
                pos_embeds = pos_embeds.repeat(batch_size // embedding_dim, 1, 1)
                text_embeddings = torch.cat([neg_embeds, pos_embeds], dim=0)
            else:
                text_embeddings = text_embeddings.repeat(batch_size // embedding_dim, 1, 1)

        # Set up the timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)

        # Initialize the latents with the given seed
        latent_shape = (
            batch_size,
            latent_channels,
            height // self.vae_scale_factor,
            width // self.vae_scale_factor
        )
        latents = torch.randn(
            latent_shape,
            device=device,
            dtype=dtype,
            generator=generator
        )

        # Diffusion loop
        for t in tqdm(self.scheduler.timesteps, desc="Generating"):
            # Prepare the model input
            if do_classifier_free_guidance:
                latent_input = torch.cat([latents] * 2)
            else:
                latent_input = latents

            latent_input = self.scheduler.scale_model_input(latent_input, t)

            # Predict the noise
            noise_pred = self.unet(latent_input, t, text_embeddings).sample

            # Apply guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (
                    noise_pred_text - noise_pred_uncond
                )

            # Update the latents
            latents = self.scheduler.step(noise_pred, t, latents).prev_sample

        return latents

    def decode_latents(self, latents, output_type="pil"):
        """Decode latents into images."""
        # Undo the latent normalization
        latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor

        # Decode
        with torch.no_grad():
            images = self.vae.decode(latents).sample

        # Normalize the images
        images = (images / 2 + 0.5).clamp(0, 1)

        # Convert to the requested format
        if output_type == "pil":
            images = images.cpu().permute(0, 2, 3, 1).float().numpy()
            images = (images * 255).round().astype("uint8")
            return [Image.fromarray(image) for image in images]
        else:
            return images.cpu().permute(0, 2, 3, 1).float().numpy()

    @torch.no_grad()
    def __call__(
        self,
        prompt: Optional[Union[str, List[str]]] = None,
        height: int = 576,
        width: int = 576,
        num_inference_steps: int = 40,
        guidance_scale: float = 5.0,
        latent_channels: int = 16,
        output_type: str = "pil",
        return_dict: bool = True,
        batch_size: int = 1,
        seed: Optional[int] = None,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        text_embeddings: Optional[torch.FloatTensor] = None,
    ):
        """Generate images from text prompts or precomputed embeddings."""
        device = self.device

        # Set up a seeded generator for reproducibility
        generator = None
        if seed is not None:
            generator = torch.Generator(device=device).manual_seed(seed)

        # Compute the embeddings if they were not provided
        if text_embeddings is None:
            if prompt is None and negative_prompt is None:
                raise ValueError("Either prompt, negative_prompt or text_embeddings must be provided")

            # Compute the embeddings
            text_embeddings = self.encode_prompt(
                prompt=prompt,
                negative_prompt=negative_prompt,
                device=device
            )
        else:
            # Make sure the embeddings are on the right device
            text_embeddings = text_embeddings.to(device)

        # Generate the latents
        latents = self.generate_latents(
            text_embeddings=text_embeddings,
            height=height,
            width=width,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            latent_channels=latent_channels,
            batch_size=batch_size,
            generator=generator
        )

        # Decode the latents into images
        images = self.decode_latents(latents, output_type=output_type)

        if not return_dict:
            return images

        return SdxsPipelineOutput(images=images)
promo.png ADDED

Git LFS Details

  • SHA256: 73b330e4d3677d91a81220b50c230733bf0167e536f4148d280cd79861ecc161
  • Pointer size: 132 Bytes
  • Size of remote file: 4.74 MB
requirements.txt ADDED
@@ -0,0 +1,11 @@
# torch>=2.6.0
# torchvision>=0.21.0
# torchaudio>=2.6.0
diffusers>=0.32.2
accelerate>=1.5.2
datasets>=3.5.0
matplotlib>=3.10.1
wandb>=0.19.8
huggingface_hub>=0.29.3
bitsandbytes>=0.45.4
transformers
result_grid.jpg ADDED

Git LFS Details

  • SHA256: 61e4d9c26e8629fc743c1bc1aee9fe6fe7ddb995a8dd77f74f19a77c14011c62
  • Pointer size: 132 Bytes
  • Size of remote file: 6.64 MB
samples/unet_192x384_0.jpg ADDED

Git LFS Details

  • SHA256: 2d23201df4a727a74237b908e00a387e9f47b3147431de5f0da52a2e92676b0c
  • Pointer size: 130 Bytes
  • Size of remote file: 34.4 kB
samples/unet_256x384_0.jpg ADDED

Git LFS Details

  • SHA256: adf02a8971642efe76ff7ab3acdd9d3c4783f5f58a770f567c627d86abd1ea5d
  • Pointer size: 130 Bytes
  • Size of remote file: 46.9 kB
samples/unet_320x384_0.jpg ADDED

Git LFS Details

  • SHA256: 86179dcdab6b10dc43a4907e4966c38089d6ef30fd6253afe7b40afba0ea73a5
  • Pointer size: 130 Bytes
  • Size of remote file: 48.2 kB
samples/unet_384x192_0.jpg ADDED

Git LFS Details

  • SHA256: e6bd2b91ae9abede7ec2dfebf7031a353ece981b82559b65a04b1e263d9dd46b
  • Pointer size: 130 Bytes
  • Size of remote file: 38.4 kB
samples/unet_384x256_0.jpg ADDED

Git LFS Details

  • SHA256: 48c2cedea26fe197993fffadaa36477c077fb827850370178756bc6f15a6cfa8
  • Pointer size: 130 Bytes
  • Size of remote file: 42 kB
samples/unet_384x320_0.jpg ADDED

Git LFS Details

  • SHA256: 511a1a4bac01631b023e46e09b6054e8173fc0237bc04f2554cb6c107da25518
  • Pointer size: 130 Bytes
  • Size of remote file: 61.6 kB
samples/unet_384x384_0.jpg ADDED

Git LFS Details

  • SHA256: cbd11687741b968e66aee70793455765cfbc6a036ae30e59f14c6fd816a8da1b
  • Pointer size: 130 Bytes
  • Size of remote file: 37.2 kB
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2e919ad3cde5f0bdf9529c68ee7c3306b1ceef40245778d29050d58ebc074158
size 507
src/captions_moondream2.ipynb ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cd389dacb701c76713fa256b68a05972b89cf80c8d2fafef341abccbba826765
size 4999
src/captions_moondream2_wd3.ipynb ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c1519f45fc46f644f46631acc4250c5b558a9d697e47548a00bb3eeedbb14e75
size 9956
src/captions_qwen2-vl-7b.py ADDED
@@ -0,0 +1,261 @@
1
+ import os
2
+ import torch
3
+ from PIL import Image
4
+ from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
5
+ import numpy as np
6
+ from pathlib import Path
7
+ from tqdm import tqdm
8
+ import argparse
9
+ import gc
10
+
11
+ # Configuration options
12
+ PRINT_CAPTIONS = False # Print captions to the console during inference
13
+ PRINT_CAPTIONING_STATUS = False # Print captioning file status to the console
14
+ OVERWRITE = True # Allow overwriting existing caption files
15
+ PREPEND_STRING = "" # Prefix string to prepend to the generated caption
16
+ APPEND_STRING = "" # Suffix string to append to the generated caption
17
+ STRIP_LINEBREAKS = True # Remove line breaks from generated captions before saving
18
+ DEFAULT_SAVE_FORMAT = ".txt" # Default format for saving captions
19
+
20
+ # Image resizing options
21
+ MAX_WIDTH = 512 # Set to 0 or less to ignore
22
+ MAX_HEIGHT = 512 # Set to 0 or less to ignore
23
+
24
+ # Generation parameters
25
+ REPETITION_PENALTY = 1.3 # Penalty for repeating phrases, float ~1.5
26
+ TEMPERATURE = 0.7 # Sampling temperature to control randomness
27
+ TOP_K = 50 # Top-k sampling to limit number of potential next tokens
28
+
29
+ # Default values for input folder, output folder, prompt, and save format
30
+ DEFAULT_INPUT_FOLDER = Path(__file__).parent / "input"
31
+ DEFAULT_OUTPUT_FOLDER = DEFAULT_INPUT_FOLDER
32
+ DEFAULT_PROMPT = "In two medium sentences, caption the key aspects of this image."
33
+
34
+ #os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
35
+
36
+ # Function to parse command-line arguments
37
+ def parse_arguments():
38
+ parser = argparse.ArgumentParser(description="Process images and generate captions using Qwen model.")
39
+ parser.add_argument("--input_folder", type=str, default=DEFAULT_INPUT_FOLDER, help="Path to the input folder containing images.")
40
+ parser.add_argument("--output_folder", type=str, default=DEFAULT_OUTPUT_FOLDER, help="Path to the output folder for saving captions.")
41
+ parser.add_argument("--prompt", type=str, default=DEFAULT_PROMPT, help="Prompt for generating the caption.")
42
+ parser.add_argument("--save_format", type=str, default=DEFAULT_SAVE_FORMAT, help="Format for saving captions (e.g., .txt, .md, .json).")
43
+ parser.add_argument("--max_width", type=int, default=MAX_WIDTH, help="Maximum width for resizing images (default: no resizing).")
44
+ parser.add_argument("--max_height", type=int, default=MAX_HEIGHT, help="Maximum height for resizing images (default: no resizing).")
45
+ parser.add_argument("--repetition_penalty", type=float, default=REPETITION_PENALTY, help="Penalty for repetition during caption generation (default: 1.10).")
46
+ parser.add_argument("--temperature", type=float, default=TEMPERATURE, help="Sampling temperature for generation (default: 0.7).")
47
+ parser.add_argument("--top_k", type=int, default=TOP_K, help="Top-k sampling during generation (default: 50).")
48
+ return parser.parse_args()
49
+
50
+ # Function to ignore images that don't have output files yet
51
+ def filter_images_without_output(input_folder, save_format):
52
+ images_to_caption = []
53
+ skipped_images = 0
54
+ total_images = 0
55
+
56
+ for root, _, files in os.walk(input_folder):
57
+ for file in files:
58
+ if file.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif')):
59
+ total_images += 1
60
+ image_path = os.path.join(root, file)
61
+ output_path = os.path.splitext(image_path)[0] + save_format
62
+ if not OVERWRITE and os.path.exists(output_path):
63
+ skipped_images += 1
64
+ else:
65
+ images_to_caption.append(image_path)
66
+
67
+ return images_to_caption, total_images, skipped_images
68
+
69
+ # Function to save caption to a file
70
+ def save_caption_to_file(image_path, caption, save_format):
71
+ txt_file_path = os.path.splitext(image_path)[0] + save_format # Same name, but with chosen save format
72
+ caption = PREPEND_STRING + caption + APPEND_STRING # Apply prepend/append strings
73
+
74
+ with open(txt_file_path, "w") as txt_file:
75
+ txt_file.write(caption)
76
+
77
+ if PRINT_CAPTIONING_STATUS:
78
+ print(f"Caption for {os.path.abspath(image_path)} saved in {save_format} format.")
79
+
80
+
81
+ # Function to process all images recursively in a folder
82
+ def process_images_in_folder(images_to_caption, prompt, save_format, max_width=MAX_WIDTH, max_height=MAX_HEIGHT, repetition_penalty=REPETITION_PENALTY, temperature=TEMPERATURE, top_k=TOP_K):
83
+ for image_path in tqdm(images_to_caption, desc="Processing Images"):
84
+ try:
85
+ # Load and preprocess the image
86
+ image = Image.open(image_path).convert("RGB")
87
+ image = resize_image_proportionally(image, max_width, max_height)
88
+
89
+ # Read the tags
90
+ tags_filename = str(Path(image_path).with_suffix('.tag'))
91
+ with open(tags_filename, "r") as file:
92
+ tag_caption = file.read()
93
+ #prompt = f'In one medium sentence, caption the key aspects of this image. Be direct for NSFW. You may use following tags as context: {tag_caption}'
94
+ prompt = f'In two medium sentence, caption the key aspects of this image. Be direct for NSFW, Don\'t be shy in your expressions like pussy, anal. Add approximate age for humans like: a child,young girl or 40 years old woman. You may use following tags as context if they are relevant: {tag_caption}'
95
+
96
+ #print("prompt",prompt)
97
+
98
+ # Generate the caption
99
+ caption = qwen_caption(image, prompt, repetition_penalty, temperature, top_k)
100
+ save_caption_to_file(image_path, caption, save_format)
101
+
102
+ if PRINT_CAPTIONS:
103
+ print(f"Caption for {os.path.abspath(image_path)}: {caption}")
104
+
105
+ # Free memory
106
+ del image, tag_caption, caption
107
+ torch.cuda.empty_cache()
108
+ gc.collect()
109
+
110
+ except Exception as e:
111
+ print(f"Error processing {os.path.abspath(image_path)}: {str(e)}")
112
+ torch.cuda.empty_cache()
113
+ gc.collect()
114
+
115
+ # Resize the image proportionally based on max width and/or max height.
116
+ def resize_image_proportionally(image, max_width=None, max_height=None):
117
+ """
118
+ If both max_width and max_height are provided, the image is resized to fit within both dimensions,
119
+ keeping the aspect ratio intact. If only one dimension is provided, the image is resized based on that dimension.
120
+ """
121
+ if (max_width is None or max_width <= 0) and (max_height is None or max_height <= 0):
122
+ return image # No resizing if both dimensions are not provided or set to 0 or less
123
+
124
+ original_width, original_height = image.size
125
+ aspect_ratio = original_width / original_height
126
+
127
+ # Determine the new dimensions
128
+ if max_width and not max_height:
129
+ # Resize based on width
130
+ new_width = max_width
131
+ new_height = int(new_width / aspect_ratio)
132
+ elif max_height and not max_width:
133
+ # Resize based on height
134
+ new_height = max_height
135
+ new_width = int(new_height * aspect_ratio)
136
+ else:
137
+ # Resize based on both width and height, keeping the aspect ratio
138
+ new_width = max_width
139
+ new_height = max_height
140
+
141
+ # Adjust the dimensions proportionally to the aspect ratio
142
+ if new_width / aspect_ratio > new_height:
143
+ new_width = int(new_height * aspect_ratio)
144
+ else:
145
+ new_height = int(new_width / aspect_ratio)
146
+
147
+ # Resize the image using LANCZOS (equivalent to ANTIALIAS in older versions)
148
+ resized_image = image.resize((new_width, new_height))
149
+ return resized_image
150
+
151
+ # Generate a caption for the provided image using the Ertugrul/Qwen2-VL-7B-Captioner-Relaxed model
152
+ def qwen_caption(image, prompt, repetition_penalty=REPETITION_PENALTY, temperature=TEMPERATURE, top_k=TOP_K):
153
+ if not isinstance(image, Image.Image):
154
+ image = Image.fromarray(np.uint8(image))
155
+
156
+ # Prepare the conversation content, which includes the image and the text prompt
157
+ conversation = [
158
+ {
159
+ "role": "user",
160
+ "content": [
161
+ {
162
+ "type": "image",
163
+ },
164
+ {"type": "text", "text": prompt},
165
+ ],
166
+ }
167
+ ]
168
+
169
+ # Apply the chat template to format the message for processing
170
+ text_prompt = qwen_processor.apply_chat_template(
171
+ conversation, add_generation_prompt=True
172
+ )
173
+
174
+ # Prepare the inputs for the model, padding as necessary and converting to tensors
175
+ inputs = qwen_processor(
176
+ text=[text_prompt],
177
+ images=[image],
178
+ padding=True,
179
+ return_tensors="pt",
180
+ )
181
+ inputs = inputs.to("cuda")
182
+
183
+ with torch.no_grad():
184
+ with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
185
+ output_ids = qwen_model.generate(
186
+ **inputs,
187
+ max_new_tokens=384,
188
+ do_sample=True,
189
+ temperature=temperature,
190
+ use_cache=True,
191
+ top_k=top_k,
192
+ repetition_penalty=repetition_penalty,
193
+ )
194
+
195
+ # Trim the generated IDs to remove the input part from the output
196
+ generated_ids_trimmed = [
197
+ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, output_ids)
198
+ ]
199
+
200
+ # Decode the trimmed output into text, skipping special tokens
201
+ output_text = qwen_processor.batch_decode(
202
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=True
203
+ )
204
+
205
+ # Strip line breaks if the option is enabled
206
+ if STRIP_LINEBREAKS:
207
+ output_text[0] = output_text[0].replace('\n', ' ')
208
+
209
+ # Free memory
210
+ del inputs, output_ids, generated_ids_trimmed
211
+ torch.cuda.empty_cache()
212
+ gc.collect()
213
+
214
+ return output_text[0]
215
+
216
+ # Run the script
217
+ if __name__ == "__main__":
218
+ args = parse_arguments()
219
+ input_folder = args.input_folder
220
+ output_folder = args.output_folder
221
+ prompt = args.prompt
222
+ save_format = args.save_format
223
+ max_width = args.max_width
224
+ max_height = args.max_height
225
+ repetition_penalty = args.repetition_penalty
226
+ temperature = args.temperature
227
+ top_k = args.top_k
228
+
229
+ # Define model_id
230
+ model_id = "Ertugrul/Qwen2-VL-7B-Captioner-Relaxed"
231
+
232
+ # Filter images before loading the model
233
+ images_to_caption, total_images, skipped_images = filter_images_without_output(input_folder, save_format)
234
+
235
+ # Print summary of found, skipped, and to-be-processed images
236
+ print(f"\nFound {total_images} image{'s' if total_images != 1 else ''}.")
237
+ if not OVERWRITE:
238
+ print(f"{skipped_images} image{'s' if skipped_images != 1 else ''} already have captions with format {save_format}, skipping.")
239
+ print(f"\nCaptioning {len(images_to_caption)} image{'s' if len(images_to_caption) != 1 else ''}.\n\n")
240
+
241
+ # Only load the model if there are images to caption
242
+ if len(images_to_caption) == 0:
243
+ print("No images to process. Exiting.\n\n")
244
+ else:
245
+ # Initialize the Ertugrul/Qwen2-VL-7B-Captioner-Relaxed model
246
+ qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
247
+ model_id, torch_dtype=torch.bfloat16, device_map="auto"
248
+ )
249
+ qwen_processor = AutoProcessor.from_pretrained(model_id)
250
+
251
+ # Process the images with optional resizing and caption generation
252
+ process_images_in_folder(
253
+ images_to_caption,
254
+ prompt,
255
+ save_format,
256
+ max_width=max_width,
257
+ max_height=max_height,
258
+ repetition_penalty=repetition_penalty,
259
+ temperature=temperature,
260
+ top_k=top_k
261
+ )
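+
+ # Example invocation (a sketch only: the actual flag names come from parse_arguments(),
+ # defined earlier in this file, and may differ from the attribute names used above):
+ #   python src/captions_qwen2-vl-7b.py --input_folder ./images --output_folder ./images \
+ #       --save_format .txt --max_width 1024 --max_height 1024 --temperature 0.7 --top_k 50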
src/captions_wd.ipynb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b46993285995bdc52e69d82e900f239ebafd9dc924be046e80372639c8796ed8
3
+ size 29850
src/cherrypick.ipynb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c94efa8bf16993a1d15ef5c455fadf92084dd9349fdd8a9b5a9ca66fe869f565
3
+ size 48464
src/cuda.ipynb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cba78a89ed8649cb384e15b3b241df0a1aa35b36ca89a5453e59fc4b875ec0f1
3
+ size 1503
src/dataset_clean.ipynb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1f0686dd3fe000d5bc00ff5d676f173793699630b0017f53530f3dcb1ec474e
3
+ size 5085
src/dataset_combine.py ADDED
@@ -0,0 +1,68 @@
1
+ import os
2
+ import shutil
3
+ from datasets import load_from_disk, concatenate_datasets
4
+
5
+ def combine_datasets(main_dataset_path, datasets_to_add):
6
+ """
7
+ Merges the given datasets into the main dataset.
8
+
9
+ Args:
10
+ main_dataset_path (str): Path to the main dataset that the data should be added to
11
+ datasets_to_add (list): List of paths to the datasets to add
12
+
13
+ Returns:
14
+ Dataset: The combined dataset
15
+ """
16
+ # Load the main dataset
17
+ try:
18
+ main_dataset = load_from_disk(main_dataset_path)
19
+ print(f"Загружен основной датасет: {main_dataset_path} ({len(main_dataset)} записей)")
20
+ except Exception as e:
21
+ print(f"Ошибка загрузки основного датасета: {e}")
22
+ return None
23
+
24
+ # All datasets to merge
25
+ all_datasets = [main_dataset]
26
+
27
+ # Load and append each additional dataset
28
+ for path in datasets_to_add:
29
+ try:
30
+ ds = load_from_disk(path)
31
+ all_datasets.append(ds)
32
+ print(f"Добавлен датасет: {path} ({len(ds)} записей)")
33
+ except Exception as e:
34
+ print(f"Ошибка загрузки датасета {path}: {e}")
35
+
36
+ # Merge all datasets
37
+ print(f"Merging {len(all_datasets)} datasets...")
38
+ combined = concatenate_datasets(all_datasets)
39
+
40
+ # Create a temporary directory based on the main dataset name
41
+ temp_dir = f"{main_dataset_path}_temp"
42
+
43
+ # Remove the temporary directory if it already exists
44
+ if os.path.exists(temp_dir):
45
+ shutil.rmtree(temp_dir)
46
+
47
+ try:
48
+ # Save to the temporary directory
49
+ print(f"Saving to the temporary directory {temp_dir}...")
50
+ combined.save_to_disk(temp_dir)
51
+
52
+ # Remove the old directory and move the new one into its place
53
+ print(f"Updating the main dataset...")
54
+ #if os.path.exists(main_dataset_path):
55
+ # shutil.rmtree(main_dataset_path)
56
+ #shutil.copytree(temp_dir, main_dataset_path)
57
+
58
+ # Remove the temporary directory after a successful copy
59
+ #shutil.rmtree(temp_dir)
60
+
61
+ print(f"✅ Объединенный датасет ({len(combined)} записей) успешно сохранен в: {main_dataset_path}")
62
+ except Exception as e:
63
+ print(f"Ошибка при сохранении датасета: {e}")
64
+ print(f"Временные данные сохранены в: {temp_dir}")
65
+
66
+ return combined
67
+
68
+ combine_datasets("/workspace/sdxs/datasets/384_temp", ["/workspace/sdxs/datasets/ds3_384"])
src/dataset_fromzip.ipynb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d12f225412aeed4f5ff6cd2dc23db6bcdfd944ff00fe6f0d9c2e8fe0ec426ee
3
+ size 6167
src/dataset_imagenet.ipynb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92ea6bc9e4033a778b9e36defff6adf481baae0d8b0a3fa537313df6fb5b4472
3
+ size 318505
src/dataset_laion_coco.ipynb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31395e7f40ef370971b523fb9d9ab56b404ca8cc1e8e932cc602beaf72140411
3
+ size 25403
src/dataset_mjnj.ipynb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8cf317c438de242a8cc0c7d710c00ceec53e887108b081235a1fb05dae0074b0
3
+ size 23158
src/dataset_mnist-te.ipynb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac1571369244de9ff15d4b1785e962e06521630fa1be32f0471175e42ef00630
3
+ size 34388
src/dataset_mnist.ipynb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c644e111748cb374d2fb9fec28ef99a5ed616898100e689cd02c6ba80b3431a7
3
+ size 33829
src/dataset_sample.ipynb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac0384c01b5ed29625df6ab7c2da36bbf9b7b9beb4ba83746eb6c00fbd6046e1
3
+ size 1986940
src/inference.ipynb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fdead0e35dd039c20314c1f7f8579c92b2a891a310965ffdec6002fd8a78c00
3
+ size 2147113
src/sdxs_create-vavae.ipynb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11c2151ba855c0c0fda1e58c295f56612843c5b42aecd779cdb3a03b3802b991
3
+ size 9794
src/sdxs_create.ipynb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fd812a7bc5233c0c2e3932fe11c5ba132e5d0389a505726fa54a95c26b42edf
3
+ size 7417
src/sdxs_create_simple.ipynb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93e0cdf53493f39cd1b8b76f41055aee1e8377e128446d03ccf524e0bb0dcd00
3
+ size 51335
src/sdxs_create_unet.ipynb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e2dec0ba7a9a8d4aaaeaad2201dd660ec266cb887d7f8eb127ffbe8c7d80c4f
3
+ size 35930
src/sdxs_sdxxs_transfer.ipynb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57db74be42dbf73bc551cf86b4302dd2717555280e26325e599ca89f51b4916e
3
+ size 168192
test.ipynb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6bca9de56c7bdda4a032e2c84d15fd5dfc8108aea08fc3186f8203f428b966f8
3
+ size 5148457
text_encoder/config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87131a858ee394af6afae023f733cdebc36eda2ccbed27c36bc887cfae427392
3
+ size 721
text_encoder/model.fp16.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:107fe15da52fe6d13d877512fa36861d1100534d1b9b88015ad9fd017db095a7
3
+ size 1119825680
text_projector/config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae2f211593cd2cc736bf8617bcb0a5e6abd4db0265170de82ae03b7a6664feda
3
+ size 83
text_projector/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e7060a387b4a6419f9d1d852759cb5b94541a1845e996f6062a07462d8b7b6a
3
+ size 2359384
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c785abebea9ae3257b61681b4e6fd8365ceafde980c21970d001e834cf10835
3
+ size 964
tokenizer/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ffb37461c391f096759f4a9bbbc329da0f36952f88bab061fcf84940c022e98
3
+ size 17082999
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccf223ba3d5b3cc7fa6c3bf451f3bb40557a5c92b0aa33f63d17802ff1a96fd9
3
+ size 1178
train-Copy1.py ADDED
@@ -0,0 +1,789 @@
1
+ import os
2
+ import math
3
+ import torch
4
+ import numpy as np
5
+ import matplotlib.pyplot as plt
6
+ from torch.utils.data import DataLoader, Sampler
7
+ from torch.utils.data.distributed import DistributedSampler
8
+ from collections import defaultdict
9
+ from torch.optim.lr_scheduler import LambdaLR
10
+ from diffusers import UNet2DConditionModel, AutoencoderKL, DDPMScheduler
11
+ from accelerate import Accelerator
12
+ from datasets import load_from_disk
13
+ from tqdm import tqdm
14
+ from PIL import Image,ImageOps
15
+ import wandb
16
+ import random
17
+ import gc
18
+ from accelerate.state import DistributedType
19
+ from torch.distributed import broadcast_object_list
20
+ from torch.utils.checkpoint import checkpoint
21
+ from diffusers.models.attention_processor import AttnProcessor2_0
22
+ from datetime import datetime
23
+ import bitsandbytes as bnb
24
+
25
+ # --------------------------- Parameters ---------------------------
26
+ ds_path = "datasets/384"
27
+ batch_size = 50
28
+ base_learning_rate = 3e-5
29
+ min_learning_rate = 3e-6
30
+ num_epochs = 10
31
+ num_warmup_steps = 1000
32
+ project = "unet"
33
+ use_wandb = True
34
+ save_model = True
35
+ sample_interval_share = 5 # samples/save per epoch
36
+ fbp = False # fused backward pass
37
+ adam8bit = True
38
+ percentile_clipping = 97 # Lion
39
+ torch_compile = False
40
+ unet_gradient = True
41
+ clip_sample = False #Scheduler
42
+ fixed_seed = False
43
+ shuffle = True
44
+ dtype = torch.float32
45
+ steps_offset = 1 # Scheduler
46
+ limit = 0
47
+ checkpoints_folder = ""
48
+ mixed_precision = "no"
49
+ accelerator = Accelerator(mixed_precision=mixed_precision)
50
+ device = accelerator.device
51
+
52
+ # Diffusion parameters
53
+ n_diffusion_steps = 50
54
+ samples_to_generate = 12
55
+ guidance_scale = 5
56
+
57
+ # Folders for saving results
58
+ generated_folder = "samples"
59
+ os.makedirs(generated_folder, exist_ok=True)
60
+
61
+ # Seed setup for reproducibility
62
+ current_date = datetime.now()
63
+ seed = int(current_date.strftime("%Y%m%d"))
64
+ if fixed_seed:
65
+ torch.manual_seed(seed)
66
+ np.random.seed(seed)
67
+ random.seed(seed)
68
+ if torch.cuda.is_available():
69
+ torch.cuda.manual_seed_all(seed)
70
+
71
+ #torch.backends.cuda.matmul.allow_tf32 = True
72
+ #torch.backends.cudnn.allow_tf32 = True
73
+ # --------------------------- LoRA parameters ---------------------------
74
+ # pip install peft
75
+ lora_name = "" #"nusha" # Name used for saving/loading LoRA adapters
76
+ lora_rank = 32 # LoRA rank (lower = more compact model)
77
+ lora_alpha = 64 # LoRA alpha, controls the scaling
78
+
79
+ print("init")
80
+
81
+ # --------------------------- WandB initialization ---------------------------
82
+ if use_wandb and accelerator.is_main_process:
83
+ wandb.init(project=project+lora_name, config={
84
+ "batch_size": batch_size,
85
+ "base_learning_rate": base_learning_rate,
86
+ "num_epochs": num_epochs,
87
+ "fbp": fbp,
88
+ "adam8bit": adam8bit,
89
+ })
90
+
91
+ # Enable Flash Attention 2/SDPA
92
+ torch.backends.cuda.enable_flash_sdp(True)
93
+ # --------------------------- Accelerator initialization --------------------
94
+ gen = torch.Generator(device=device)
95
+ gen.manual_seed(seed)
96
+
97
+ # --------------------------- Model loading ---------------------------
98
+ # The VAE is loaded on the CPU to save GPU memory
99
+ vae = AutoencoderKL.from_pretrained("vae", variant="fp16").to("cpu").eval()
100
+
101
+ # DDPMScheduler with V-Prediction and Zero-SNR
102
+ scheduler = DDPMScheduler(
103
+ num_train_timesteps=1000, # Full timestep schedule for training
104
+ prediction_type="v_prediction", # V-Prediction
105
+ rescale_betas_zero_snr=True, # Enable Zero-SNR
106
+ clip_sample = clip_sample,
107
+ steps_offset = steps_offset
108
+ )
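+ # Note: rescale_betas_zero_snr=True makes the final training timestep carry zero signal
+ # (pure noise); pairing it with prediction_type="v_prediction" keeps the training target
+ # well-defined at that step, which is the combination recommended for zero-terminal-SNR schedules.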
109
+
110
+ class DistributedResolutionBatchSampler(Sampler):
111
+ def __init__(self, dataset, batch_size, num_replicas, rank, shuffle=True, drop_last=True):
112
+ self.dataset = dataset
113
+ self.batch_size = max(1, batch_size // num_replicas)
114
+ self.num_replicas = num_replicas
115
+ self.rank = rank
116
+ self.shuffle = shuffle
117
+ self.drop_last = drop_last
118
+ self.epoch = 0
119
+
120
+ # Use numpy for speed
121
+ try:
122
+ widths = np.array(dataset["width"])
123
+ heights = np.array(dataset["height"])
124
+ except KeyError:
125
+ widths = np.zeros(len(dataset))
126
+ heights = np.zeros(len(dataset))
127
+
128
+ # Build unique keys for the sizes
129
+ self.size_keys = np.unique(np.stack([widths, heights], axis=1), axis=0)
130
+
131
+ # Group indices by size using numpy
132
+ self.size_groups = {}
133
+ for w, h in self.size_keys:
134
+ mask = (widths == w) & (heights == h)
135
+ self.size_groups[(w, h)] = np.where(mask)[0]
136
+
137
+ # Precompute the number of full batches for each group
138
+ self.group_num_batches = {}
139
+ total_batches = 0
140
+ for size, indices in self.size_groups.items():
141
+ num_full_batches = len(indices) // (self.batch_size * self.num_replicas)
142
+ self.group_num_batches[size] = num_full_batches
143
+ total_batches += num_full_batches
144
+
145
+ # Round down to a number divisible by num_replicas
146
+ self.num_batches = (total_batches // self.num_replicas) * self.num_replicas
147
+
148
+ def __iter__(self):
149
+ # print(f"Rank {self.rank}: Starting iteration")
150
+ # Clear the CUDA cache before building new batches
151
+ if torch.cuda.is_available():
152
+ torch.cuda.empty_cache()
153
+ all_batches = []
154
+ rng = np.random.RandomState(self.epoch)
155
+
156
+ for size, indices in self.size_groups.items():
157
+ # print(f"Rank {self.rank}: Processing size {size}, {len(indices)} samples")
158
+ indices = indices.copy()
159
+ if self.shuffle:
160
+ rng.shuffle(indices)
161
+
162
+ num_full_batches = self.group_num_batches[size]
163
+ if num_full_batches == 0:
164
+ continue
165
+
166
+ # Keep only the indices that form full batches
167
+ valid_indices = indices[:num_full_batches * self.batch_size * self.num_replicas]
168
+
169
+ # Reshape for fast splitting into batches
170
+ batches = valid_indices.reshape(-1, self.batch_size * self.num_replicas)
171
+
172
+ # Select the slice for the current GPU
173
+ start_idx = self.rank * self.batch_size
174
+ end_idx = start_idx + self.batch_size
175
+ gpu_batches = batches[:, start_idx:end_idx]
176
+
177
+ all_batches.extend(gpu_batches)
178
+
179
+ if self.shuffle:
180
+ rng.shuffle(all_batches)
181
+
182
+ # Synchronize all processes after the batches are built
183
+ accelerator.wait_for_everyone()
184
+ # print(f"Rank {self.rank}: Created {len(all_batches)} batches")
185
+ return iter(all_batches)
186
+
187
+ def __len__(self):
188
+ return self.num_batches
189
+
190
+ def set_epoch(self, epoch):
191
+ self.epoch = epoch
192
+
193
+ # Helper that picks fixed samples for each resolution
194
+ def get_fixed_samples_by_resolution(dataset, samples_per_group=1):
195
+ """Выбирает фиксированные семплы для каждого уникального разрешения"""
196
+ # Group by size
197
+ size_groups = defaultdict(list)
198
+ try:
199
+ widths = dataset["width"]
200
+ heights = dataset["height"]
201
+ except KeyError:
202
+ widths = [0] * len(dataset)
203
+ heights = [0] * len(dataset)
204
+ for i, (w, h) in enumerate(zip(widths, heights)):
205
+ size = (w, h)
206
+ size_groups[size].append(i)
207
+
208
+ # Pick fixed examples from each group
209
+ fixed_samples = {}
210
+ for size, indices in size_groups.items():
211
+ # Decide how many samples to take from this group
212
+ n_samples = min(samples_per_group, len(indices))
213
+ if len(size_groups)==1:
214
+ n_samples = samples_to_generate
215
+ if n_samples == 0:
216
+ continue
217
+
218
+ # Pick random indices
219
+ sample_indices = random.sample(indices, n_samples)
220
+ samples_data = [dataset[idx] for idx in sample_indices]
221
+
222
+ # Gather the data
223
+ latents = torch.tensor(np.array([item["vae"] for item in samples_data])).to(device=device,dtype=dtype)
224
+ embeddings = torch.tensor(np.array([item["embeddings"] for item in samples_data])).to(device,dtype=dtype)
225
+ texts = [item["text"] for item in samples_data]
226
+
227
+ # Store them for this size
228
+ fixed_samples[size] = (latents, embeddings, texts)
229
+
230
+ print(f"Создано {len(fixed_samples)} групп фиксированных семплов по разрешениям")
231
+ return fixed_samples
232
+
233
+ if limit > 0:
234
+ dataset = load_from_disk(ds_path).select(range(limit))
235
+ else:
236
+ dataset = load_from_disk(ds_path)
237
+
238
+ def collate_fn_simple(batch):
239
+ # Convert the list to tensors and move them to the device
240
+ latents = torch.tensor(np.array([item["vae"] for item in batch])).to(device,dtype=dtype)
241
+ embeddings = torch.tensor(np.array([item["embeddings"] for item in batch])).to(device,dtype=dtype)
242
+ return latents, embeddings
243
+
244
+ def collate_fn(batch):
245
+ if not batch:
246
+ return [], []
247
+
248
+ # Take the reference shapes
249
+ ref_vae_shape = np.array(batch[0]["vae"]).shape
250
+ ref_embed_shape = np.array(batch[0]["embeddings"]).shape
251
+
252
+ # Filter out mismatched items
253
+ valid_latents = []
254
+ valid_embeddings = []
255
+ for item in batch:
256
+ if (np.array(item["vae"]).shape == ref_vae_shape and
257
+ np.array(item["embeddings"]).shape == ref_embed_shape):
258
+ valid_latents.append(item["vae"])
259
+ valid_embeddings.append(item["embeddings"])
260
+
261
+ # Build tensors
262
+ latents = torch.tensor(np.array(valid_latents)).to(device,dtype=dtype)
263
+ embeddings = torch.tensor(np.array(valid_embeddings)).to(device,dtype=dtype)
264
+
265
+ return latents, embeddings
266
+
267
+ # Use our ResolutionBatchSampler
268
+ #batch_sampler = ResolutionBatchSampler(dataset, batch_size=batch_size, shuffle=True)
269
+ #dataloader = DataLoader(dataset, batch_sampler=batch_sampler, collate_fn=collate_fn)
270
+
271
+ # Build a ResolutionBatchSampler on top of the indices from DistributedSampler
272
+ batch_sampler = DistributedResolutionBatchSampler(
273
+ dataset=dataset,
274
+ batch_size=batch_size,
275
+ num_replicas=accelerator.num_processes,
276
+ rank=accelerator.process_index,
277
+ shuffle=shuffle
278
+ )
279
+
280
+ # Create the DataLoader
281
+ dataloader = DataLoader(dataset, batch_sampler=batch_sampler, collate_fn=collate_fn_simple)
282
+
283
+ print("Total samples",len(dataloader))
284
+ dataloader = accelerator.prepare(dataloader)
285
+
286
+ # Variables for resuming training
287
+ start_epoch = 0
288
+ global_step = 0
289
+
290
+ # Compute the total number of training steps
291
+ total_training_steps = (len(dataloader) * num_epochs)
292
+ # Get the world size
293
+ world_size = accelerator.state.num_processes
294
+ #print(f"World Size: {world_size}")
295
+
296
+ # Optionally load the model from the latest checkpoint (if it exists)
297
+ latest_checkpoint = os.path.join(checkpoints_folder, project)
298
+ if os.path.isdir(latest_checkpoint):
299
+ print("Загружаем UNet из чекпоинта:", latest_checkpoint)
300
+ if dtype == torch.float32:
301
+ unet = UNet2DConditionModel.from_pretrained(latest_checkpoint).to(device=device,dtype=dtype)
302
+ else:
303
+ unet = UNet2DConditionModel.from_pretrained(latest_checkpoint, variant="fp16").to(device=device,dtype=dtype)
304
+ if unet_gradient:
305
+ unet.enable_gradient_checkpointing()
306
+ unet.set_use_memory_efficient_attention_xformers(False) # disable xformers
307
+ try:
308
+ unet.set_attn_processor(AttnProcessor2_0()) # Use the standard SDPA attention processor
309
+ except Exception as e:
310
+ print(f"Ошибка при включении SDPA: {e}")
311
+ print("Попытка использовать enable_xformers_memory_efficient_attention.")
312
+ unet.set_use_memory_efficient_attention_xformers(True)
313
+
314
+ if hasattr(torch.backends.cuda, "flash_sdp_enabled"):
315
+ print(f"torch.backends.cuda.flash_sdp_enabled(): {torch.backends.cuda.flash_sdp_enabled()}")
316
+ if hasattr(torch.backends.cuda, "mem_efficient_sdp_enabled"):
317
+ print(f"torch.backends.cuda.mem_efficient_sdp_enabled(): {torch.backends.cuda.mem_efficient_sdp_enabled()}")
318
+ if hasattr(torch.nn.functional, "get_flash_attention_available"):
319
+ print(f"torch.nn.functional.get_flash_attention_available(): {torch.nn.functional.get_flash_attention_available()}")
320
+ if torch_compile:
321
+ print("compiling")
322
+ torch.set_float32_matmul_precision('high')
323
+ unet = torch.compile(unet)#, mode="reduce-overhead", fullgraph=True)
324
+ print("compiling - ok")
325
+
326
+ if lora_name:
327
+ print(f"--- Настройка LoRA через PEFT (Rank={lora_rank}, Alpha={lora_alpha}) ---")
328
+ from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
329
+ from peft.tuners.lora import LoraModel
330
+ import os
331
+ # 1. Freeze all UNet parameters
332
+ unet.requires_grad_(False)
333
+ print("Параметры базового UNet заморожены.")
334
+
335
+ # 2. Create the LoRA configuration
336
+ lora_config = LoraConfig(
337
+ r=lora_rank,
338
+ lora_alpha=lora_alpha,
339
+ target_modules=["to_q", "to_k", "to_v", "to_out.0"],
340
+ )
341
+ unet.add_adapter(lora_config)
342
+
343
+ # 3. Wrap the UNet in a PEFT model
344
+ from peft import get_peft_model
345
+
346
+ peft_unet = get_peft_model(unet, lora_config)
347
+
348
+ # 4. Collect the parameters to optimize
349
+ params_to_optimize = list(p for p in peft_unet.parameters() if p.requires_grad)
350
+
351
+
352
+ # 5. Report parameter counts
353
+ if accelerator.is_main_process:
354
+ lora_params_count = sum(p.numel() for p in params_to_optimize)
355
+ total_params_count = sum(p.numel() for p in unet.parameters())
356
+ print(f"Количество обучаемых параметров (LoRA): {lora_params_count:,}")
357
+ print(f"Общее количество параметров UNet: {total_params_count:,}")
358
+
359
+ # 6. Save path
360
+ lora_save_path = os.path.join("lora", lora_name)
361
+ os.makedirs(lora_save_path, exist_ok=True)
362
+
363
+ # 7. Save function
364
+ def save_lora_checkpoint(model):
365
+ if accelerator.is_main_process:
366
+ print(f"Сохраняем LoRA адаптеры в {lora_save_path}")
367
+ from peft.utils.save_and_load import get_peft_model_state_dict
368
+ # Get the LoRA-only state_dict
369
+ lora_state_dict = get_peft_model_state_dict(model)
370
+
371
+ # Save the weights
372
+ torch.save(lora_state_dict, os.path.join(lora_save_path, "adapter_model.bin"))
373
+
374
+ # Save the config
375
+ model.peft_config["default"].save_pretrained(lora_save_path)
376
+ # SDXL must be compatible
377
+ from diffusers import StableDiffusionXLPipeline
378
+ StableDiffusionXLPipeline.save_lora_weights(lora_save_path, lora_state_dict)
379
+
380
+ # --------------------------- Optimizer ---------------------------
381
+ # Define the parameters to optimize
382
+ #unet = torch.compile(unet)
383
+ if lora_name:
384
+ # With LoRA, optimize only the LoRA parameters
385
+ trainable_params = [p for p in unet.parameters() if p.requires_grad]
386
+ else:
387
+ # Иначе оптимизируем все параметры
388
+ if fbp:
389
+ trainable_params = list(unet.parameters())
390
+
391
+ if fbp:
392
+ # [1] Build a dict of per-parameter optimizers (fused backward)
393
+ if adam8bit:
394
+ optimizer_dict = {
395
+ p: bnb.optim.AdamW8bit(
396
+ [p], # Each parameter gets its own optimizer
397
+ lr=base_learning_rate,
398
+ eps=1e-8
399
+ ) for p in trainable_params
400
+ }
401
+ else:
402
+ optimizer_dict = {
403
+ p: bnb.optim.Lion8bit(
404
+ [p], # Each parameter gets its own optimizer
405
+ lr=base_learning_rate,
406
+ betas=(0.9, 0.97),
407
+ weight_decay=0.01,
408
+ percentile_clipping=percentile_clipping,
409
+ ) for p in trainable_params
410
+ }
411
+
412
+ # [2] Define a hook that applies the optimizer as soon as a gradient is accumulated
413
+ def optimizer_hook(param):
414
+ optimizer_dict[param].step()
415
+ optimizer_dict[param].zero_grad(set_to_none=True)
416
+
417
+ # [3] Register the hook on the model's trainable parameters
418
+ for param in trainable_params:
419
+ param.register_post_accumulate_grad_hook(optimizer_hook)
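+
+ # With a separate optimizer per parameter stepped inside the post-accumulate-grad hook,
+ # each gradient is applied and zeroed right after backward produces it, so the full set of
+ # gradients never has to be held in memory at once (the "fused backward pass" behind the fbp flag).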
420
+
421
+ # Prepare via Accelerator
422
+ unet, optimizer = accelerator.prepare(unet, optimizer_dict)
423
+ else:
424
+ if adam8bit:
425
+ optimizer = bnb.optim.AdamW8bit(
426
+ params=unet.parameters(),
427
+ lr=base_learning_rate,
428
+ betas=(0.9, 0.999),
429
+ eps=1e-8,
430
+ weight_decay=0.01
431
+ )
432
+ #from torch.optim import AdamW
433
+ #optimizer = AdamW(
434
+ # params=unet.parameters(),
435
+ # lr=base_learning_rate,
436
+ # betas=(0.9, 0.999),
437
+ # eps=1e-8,
438
+ # weight_decay=0.01
439
+ #)
440
+ else:
441
+ optimizer = bnb.optim.Lion8bit(
442
+ params=unet.parameters(),
443
+ lr=base_learning_rate,
444
+ betas=(0.9, 0.97),
445
+ weight_decay=0.01,
446
+ percentile_clipping=percentile_clipping,
447
+ )
448
+ from transformers import get_constant_schedule_with_warmup
449
+
450
+ # warmup
451
+ num_warmup_steps = num_warmup_steps * world_size
452
+
453
+ #lr_scheduler = get_constant_schedule_with_warmup(
454
+ # optimizer=optimizer,
455
+ # num_warmup_steps=num_warmup_steps
456
+ #)
457
+ from torch.optim.lr_scheduler import LambdaLR
458
+ def lr_schedule(step, max_steps, base_lr, min_lr, use_decay=True):
459
+ # If decay is disabled, return the base LR
460
+ if not use_decay:
461
+ return base_lr
462
+
463
+ # Otherwise use linear warmup followed by cosine decay
464
+ x = step / max_steps
465
+ percent = 0.05
466
+ if x < percent:
467
+ # Линейный прогрев до percent% шагов
468
+ return min_lr + (base_lr - min_lr) * (x / percent)
469
+ else:
470
+ # Cosine decay
471
+ decay_ratio = (x - percent) / (1 - percent)
472
+ return min_lr + 0.5 * (base_lr - min_lr) * (1 + math.cos(math.pi * decay_ratio))
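+
+ # Worked example with the settings above (base_lr=3e-5, min_lr=3e-6): the LR starts at 3e-6,
+ # rises linearly to 3e-5 over the first 5% of max_steps, then follows a cosine back down to
+ # 3e-6 at the final step; with use_decay=False the function simply returns base_lr.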
473
+
474
+
475
+ def custom_lr_lambda(step):
476
+ return lr_schedule(step, total_training_steps*world_size,
477
+ base_learning_rate, min_learning_rate,
478
+ (num_warmup_steps>0)) / base_learning_rate
479
+
480
+ lr_scheduler = LambdaLR(optimizer, lr_lambda=custom_lr_lambda)
481
+ unet, optimizer, lr_scheduler = accelerator.prepare(unet, optimizer, lr_scheduler)
482
+
483
+ # --------------------------- Fixed samples for generation ---------------------------
484
+ # Fixed sample examples per resolution
485
+ fixed_samples = get_fixed_samples_by_resolution(dataset)
486
+
487
+ @torch.compiler.disable()
488
+ @torch.no_grad()
489
+ def generate_and_save_samples(fixed_samples_cpu, step):
490
+ """
491
+ Generates samples for each resolution and saves them.
492
+
493
+ Args:
494
+ fixed_samples_cpu: Dict whose keys are sizes (width, height)
495
+ and whose values are tuples (latents, embeddings, text) on the CPU.
496
+ step: Current training step
497
+ """
498
+ original_model = None # Initialize so the finally block doesn't fail
499
+ try:
500
+
501
+ original_model = accelerator.unwrap_model(unet)
502
+ original_model = original_model.to(dtype = dtype)
503
+ original_model.eval()
504
+
505
+ vae.to(device=device, dtype=dtype)
506
+ vae.eval()
507
+
508
+ scheduler.set_timesteps(n_diffusion_steps)
509
+
510
+ all_generated_images = []
511
+ all_captions = []
512
+
513
+ for size, (sample_latents, sample_text_embeddings, sample_text) in fixed_samples_cpu.items():
514
+ width, height = size
515
+
516
+ sample_latents = sample_latents.to(dtype=dtype)
517
+ sample_text_embeddings = sample_text_embeddings.to(dtype=dtype)
518
+
519
+ # Initialize the latents with random noise
520
+ # sample_latents are already in dtype, so the noise is created in dtype too
521
+ noise = torch.randn(
522
+ sample_latents.shape, # Use the shape of sample_latents, which are now on the GPU in the working dtype
523
+ generator=gen,
524
+ device=device,
525
+ dtype=sample_latents.dtype
526
+ )
527
+ current_latents = noise.clone()
528
+
529
+ # Prepare the text embeddings for guidance
530
+ if guidance_scale > 0:
531
+ # empty_embeddings must have the same dtype and be on the same device
532
+ empty_embeddings = torch.zeros_like(sample_text_embeddings, dtype=sample_text_embeddings.dtype, device=device)
533
+ text_embeddings_batch = torch.cat([empty_embeddings, sample_text_embeddings], dim=0)
534
+ else:
535
+ text_embeddings_batch = sample_text_embeddings
536
+
537
+ for t in scheduler.timesteps:
538
+ t_batch = t.repeat(current_latents.shape[0]).to(device) # Make sure t is on the device
539
+
540
+ if guidance_scale > 0:
541
+ latent_model_input = torch.cat([current_latents] * 2)
542
+ else:
543
+ latent_model_input = current_latents
544
+
545
+ latent_model_input_scaled = scheduler.scale_model_input(latent_model_input, t_batch)
546
+
547
+ # Noise prediction (UNet)
548
+ noise_pred = original_model(latent_model_input_scaled, t_batch, text_embeddings_batch).sample
549
+
550
+ if guidance_scale > 0:
551
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
552
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
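+ # Classifier-free guidance: extrapolate from the unconditional prediction toward the
+ # text-conditioned one; guidance_scale=1 reproduces the conditional prediction, larger
+ # values push the sample harder toward the prompt.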
553
+
554
+ current_latents = scheduler.step(noise_pred, t, current_latents).prev_sample
555
+
556
+ #print(f"current_latents Min: {current_latents.min()} Max: {current_latents.max()}")
557
+ # Decode through the VAE
558
+ latent_for_vae = (current_latents.detach() / vae.config.scaling_factor) + vae.config.shift_factor
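+ # This inverts the assumed encoding convention latents = (x - shift_factor) * scaling_factor
+ # (a VAE config with both scaling_factor and shift_factor), so vae.decode receives raw latent values.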
559
+ decoded = vae.decode(latent_for_vae).sample
560
+
561
+ # Convert the tensors to PIL images
562
+ # For the image math (normalization) it is safer to switch to fp32
563
+ decoded_fp32 = decoded.to(torch.float32)
564
+ for img_idx, img_tensor in enumerate(decoded_fp32):
565
+ img = (img_tensor / 2 + 0.5).clamp(0, 1).cpu().numpy().transpose(1, 2, 0)
566
+ # If NaNs or infs are present, print them
567
+ if np.isnan(img).any():
568
+ print("NaNs found, saving stoped! Step:", step)
569
+ save_model = False
570
+ pil_img = Image.fromarray((img * 255).astype("uint8"))
571
+
572
+ max_w_overall = max(s[0] for s in fixed_samples_cpu.keys())
573
+ max_h_overall = max(s[1] for s in fixed_samples_cpu.keys())
574
+ max_w_overall = max(255, max_w_overall)
575
+ max_h_overall = max(255, max_h_overall)
576
+
577
+ padded_img = ImageOps.pad(pil_img, (max_w_overall, max_h_overall), color='white')
578
+ all_generated_images.append(padded_img)
579
+
580
+ caption_text = sample_text[img_idx][:200] if img_idx < len(sample_text) else ""
581
+ all_captions.append(caption_text)
582
+
583
+ sample_path = f"{generated_folder}/{project}_{width}x{height}_{img_idx}.jpg"
584
+ pil_img.save(sample_path, "JPEG", quality=96)
585
+
586
+ if use_wandb and accelerator.is_main_process:
587
+ wandb_images = [
588
+ wandb.Image(img, caption=f"{all_captions[i]}")
589
+ for i, img in enumerate(all_generated_images)
590
+ ]
591
+ wandb.log({"generated_images": wandb_images, "global_step": step})
592
+
593
+ finally:
594
+ vae.to("cpu") # Перемещаем VAE обратно на CPU
595
+ if original_model is not None:
596
+ original_model = original_model.to(dtype=dtype)
597
+ del original_model
598
+ # Clean up any tensor variables created inside the function
599
+ for var in list(locals().keys()):
600
+ if isinstance(locals()[var], torch.Tensor):
601
+ del locals()[var]
602
+
603
+ torch.cuda.empty_cache()
604
+ gc.collect()
605
+
606
+ # --------------------------- Sample generation before training ---------------------------
607
+ if accelerator.is_main_process:
608
+ if save_model:
609
+ print("Генерация сэмплов до старта обучения...")
610
+ generate_and_save_samples(fixed_samples,0)
611
+
612
+ # Model-saving function extended to support LoRA
613
+ def save_checkpoint(unet,variant=""):
614
+ if accelerator.is_main_process:
615
+ if lora_name:
616
+ # Save only the LoRA adapters
617
+ save_lora_checkpoint(unet)
618
+ else:
619
+ # Save the full model
620
+ if variant!="":
621
+ accelerator.unwrap_model(unet.to(dtype=torch.float16)).save_pretrained(os.path.join(checkpoints_folder, f"{project}"),variant=variant)
622
+ else:
623
+ accelerator.unwrap_model(unet).save_pretrained(os.path.join(checkpoints_folder, f"{project}"))
624
+ unet = unet.to(dtype=dtype)
625
+
626
+ # --------------------------- Training loop ---------------------------
627
+ # For logging the average loss several times per epoch
628
+ if accelerator.is_main_process:
629
+ print(f"Total steps per GPU: {total_training_steps}")
630
+
631
+ epoch_loss_points = []
632
+ progress_bar = tqdm(total=total_training_steps, disable=not accelerator.is_local_main_process, desc="Training", unit="step")
633
+
634
+ # Interval for sampling and logging within an epoch (every 1/sample_interval_share of the epoch)
635
+ steps_per_epoch = len(dataloader)
636
+ sample_interval = max(1, steps_per_epoch // sample_interval_share)
637
+ min_loss = 1.
638
+
639
+ # Start from the given epoch (useful when resuming)
640
+ for epoch in range(start_epoch, start_epoch + num_epochs):
641
+ batch_losses = []
642
+ batch_grads = []
643
+ #unet = unet.to(dtype = dtype)
644
+ batch_sampler.set_epoch(epoch)
645
+ accelerator.wait_for_everyone()
646
+ unet.train()
647
+ print("epoch:",epoch)
648
+ for step, (latents, embeddings) in enumerate(dataloader):
649
+ with accelerator.accumulate(unet):
650
+ if save_model == False and step == 5 :
651
+ used_gb = torch.cuda.max_memory_allocated() / 1024**3
652
+ print(f"Шаг {step}: {used_gb:.2f} GB")
653
+
654
+ #latents = latents.to(dtype = dtype)
655
+ #embeddings = embeddings.to(dtype = dtype)
656
+ #print(f"Latents dtype: {latents.dtype}")
657
+ #print(f"Embeddings dtype: {embeddings.dtype}")
658
+ #print(f"Noise dtype: {noise.dtype}")
659
+
660
+ # Forward pass
661
+ noise = torch.randn_like(latents, dtype=latents.dtype)
662
+
663
+ timesteps = torch.randint(steps_offset, scheduler.config.num_train_timesteps,
664
+ (latents.shape[0],), device=device).long()
665
+
666
+ # Add noise to the latents
667
+ noisy_latents = scheduler.add_noise(latents, noise, timesteps)
668
+
669
+ # Use the v-prediction target
670
+ model_pred = unet(noisy_latents, timesteps, embeddings).sample
671
+ target_pred = scheduler.get_velocity(latents, noise, timesteps)
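+ # get_velocity returns the v-prediction target v_t = sqrt(alpha_bar_t) * noise
+ # - sqrt(1 - alpha_bar_t) * x_0 (Salimans & Ho, 2022), matching prediction_type="v_prediction".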
672
+
673
+ # Compute the loss
674
+ # Check model_pred for nan/inf
675
+ #if torch.isnan(model_pred.float()).any() or torch.isinf(model_pred.float()).any():
676
+ # print(f"Rank {accelerator.process_index}: Found nan/inf in model_pred",model_pred.float())
677
+ # # Handle nan/inf values
678
+ # model_pred = torch.nan_to_num(model_pred.float(), nan=0.0, posinf=1.0, neginf=-1.0)
679
+ loss = torch.nn.functional.mse_loss(model_pred, target_pred)
680
+
681
+ # Check for nan/inf before backward
682
+ if torch.isnan(loss) or torch.isinf(loss):
683
+ print(f"Rank {accelerator.process_index}: Found nan/inf in loss: {loss}")
684
+ loss = torch.zeros_like(loss)
685
+
686
+ # Backward pass via Accelerator
687
+ accelerator.backward(loss)
688
+
689
+ if (global_step % 100 == 0) or (global_step % sample_interval == 0):
690
+ accelerator.wait_for_everyone()
691
+
692
+ grad = 0.0
693
+ if not fbp:
694
+ if accelerator.sync_gradients:
695
+ grad = accelerator.clip_grad_norm_(unet.parameters(), 1.)
696
+ optimizer.step()
697
+ lr_scheduler.step()
698
+ optimizer.zero_grad(set_to_none=True)
699
+
700
+ # Increment the global step counter
701
+ global_step += 1
702
+
703
+ # Update the progress bar
704
+ progress_bar.update(1)
705
+
706
+ # Log metrics
707
+ if accelerator.is_main_process:
708
+ if fbp:
709
+ current_lr = base_learning_rate
710
+ else:
711
+ current_lr = lr_scheduler.get_last_lr()[0]
712
+ batch_losses.append(loss.detach().item())
713
+ batch_grads.append(grad)
714
+
715
+ # Log to Wandb
716
+ if use_wandb:
717
+ wandb.log({
718
+ "loss": loss.detach().item(),
719
+ "learning_rate": current_lr,
720
+ "epoch": epoch,
721
+ "grad": grad,
722
+ "global_step": global_step
723
+ })
724
+
725
+ # Generate samples at the configured interval
726
+ if global_step % sample_interval == 0:
727
+ generate_and_save_samples(fixed_samples,global_step)
728
+
729
+ # Print the current loss
730
+ avg_loss = np.mean(batch_losses[-sample_interval:])
731
+ avg_grad = torch.mean(torch.stack(batch_grads[-sample_interval:])).cpu().item()
732
+ print(f"Эпоха {epoch}, шаг {global_step}, средний лосс: {avg_loss:.6f}")
733
+
734
+ if save_model:
735
+ if avg_loss < min_loss:
736
+ min_loss = avg_loss
737
+ save_checkpoint(unet,"fp16")
738
+ save_checkpoint(unet)
739
+ if use_wandb:
740
+ wandb.log({"intermediate_loss": avg_loss})
741
+ wandb.log({"intermediate_grad": avg_grad})
742
+
743
+
744
+ # At the end of the epoch
745
+ #accelerator.wait_for_everyone()
746
+ if accelerator.is_main_process:
747
+ avg_epoch_loss = np.mean(batch_losses)
748
+ print(f"\nЭпоха {epoch} завершена. Средний лосс: {avg_epoch_loss:.6f}")
749
+ if use_wandb:
750
+ wandb.log({"epoch_loss": avg_epoch_loss, "epoch": epoch+1})
751
+
752
+ # Training finished - save the final model
753
+ if accelerator.is_main_process:
754
+ print("Обучение завершено! Сохраняем финальную модель...")
755
+ # Сохраняем основную модель
756
+ if save_model:
757
+ save_checkpoint(unet)
758
+ print("Готово!")
759
+
760
+ # randomize ode timesteps
761
+ # input_timestep = torch.round(
762
+ # F.sigmoid(torch.randn((n,), device=latents.device)), decimals=3
763
+ # )
764
+
765
+ #def create_distribution(num_points, device=None):
766
+ # # Probability range on the x axis
767
+ # x = torch.linspace(0, 1, num_points, device=device)
768
+
769
+ # Custom probability density function
770
+ # probabilities = -7.7 * ((x - 0.5) ** 2) + 2
771
+
772
+ # Normalize so the values sum to 1
773
+ # probabilities /= probabilities.sum()
774
+
775
+ # return x, probabilities
776
+
777
+ #def sample_from_distribution(x, probabilities, n, device=None):
778
+ # Pick indices according to the probability distribution
779
+ # indices = torch.multinomial(probabilities, n, replacement=True)
780
+ # return x[indices]
781
+
782
+ # Usage example
783
+ #num_points = 1000 # Number of points in the range
784
+ #n = latents.shape[0] # Number of timesteps to sample
785
+ #x, probabilities = create_distribution(num_points, device=latents.device)
786
+ #timesteps = sample_from_distribution(x, probabilities, n, device=latents.device)
787
+
788
+ # Convert to the format expected by this code
789
+ #timesteps = (timesteps * (scheduler.config.num_train_timesteps - 1)).long()