2507

Files changed:

- 2b/diffusion_pytorch_model.safetensors +1 -1
- samples/2b_192x384_0.jpg +2 -2
- samples/2b_256x384_0.jpg +2 -2
- samples/2b_320x384_0.jpg +2 -2
- samples/2b_384x192_0.jpg +2 -2
- samples/2b_384x256_0.jpg +2 -2
- samples/2b_384x320_0.jpg +2 -2
- samples/2b_384x384_0.jpg +2 -2
- train.py +13 -4
2b/diffusion_pytorch_model.safetensors CHANGED

```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:33928b6c0aaa936c3f2f72ae338c4c3712c4b6fc97d3bf7ba2ba9b8ac116055e
 size 7993399544
```
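Since the weights are stored through Git LFS, the pointer above pins the new file by the sha256 of its contents. A minimal sketch, assuming the file has already been pulled locally, of checking a download against the oid from the pointer (path and hash taken from the diff above):

```python
import hashlib

# Stream the ~8 GB file in chunks so it never has to fit in memory.
expected = "33928b6c0aaa936c3f2f72ae338c4c3712c4b6fc97d3bf7ba2ba9b8ac116055e"
h = hashlib.sha256()
with open("2b/diffusion_pytorch_model.safetensors", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
assert h.hexdigest() == expected, "download does not match the LFS pointer"
```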
samples/2b_192x384_0.jpg CHANGED (Git LFS image; before/after previews omitted)
samples/2b_256x384_0.jpg CHANGED (Git LFS image; before/after previews omitted)
samples/2b_320x384_0.jpg CHANGED (Git LFS image; before/after previews omitted)
samples/2b_384x192_0.jpg CHANGED (Git LFS image; before/after previews omitted)
samples/2b_384x256_0.jpg CHANGED (Git LFS image; before/after previews omitted)
samples/2b_384x320_0.jpg CHANGED (Git LFS image; before/after previews omitted)
samples/2b_384x384_0.jpg CHANGED (Git LFS image; before/after previews omitted)
train.py CHANGED

```diff
@@ -27,10 +27,10 @@ import torch.nn.functional as F
 # --------------------------- Parameters ---------------------------
 ds_path = "datasets/384"
 project = "2b"
-batch_size =
+batch_size = 64
 base_learning_rate = 8e-5
 min_learning_rate = 4e-5
-num_epochs =
+num_epochs = 25
 # samples/save per epoch
 sample_interval_share = 5
 use_wandb = True
@@ -43,7 +43,7 @@ unet_gradient = True
 clip_sample = False #Scheduler
 fixed_seed = False
 shuffle = True
-dispersive_loss_enabled =
+dispersive_loss_enabled = True
 torch.backends.cuda.matmul.allow_tf32 = True
 torch.backends.cudnn.allow_tf32 = True
 torch.backends.cuda.enable_mem_efficient_sdp(False)
```
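This hunk turns on `dispersive_loss_enabled`. The loss implementation itself is not part of the diff; in the literature a dispersive loss is an InfoNCE-style repulsion term that pushes apart the intermediate representations of different samples in a batch (the "register hooks AFTER prepare" comment later in this file suggests those activations are captured via forward hooks). A minimal sketch of such a term, with the function name, temperature, and tap point as assumptions rather than the repo's actual code:

```python
import math
import torch

def dispersive_loss(z: torch.Tensor, tau: float = 0.5) -> torch.Tensor:
    # z: (B, ...) intermediate activations for one batch; flatten per sample.
    z = z.flatten(1)                            # (B, D)
    d = torch.cdist(z, z).pow(2) / z.shape[1]   # pairwise squared L2, scaled by D
    # log-mean-exp of -d/tau: decreases as representations spread apart
    return torch.logsumexp(-d.flatten() / tau, dim=0) - math.log(z.shape[0] ** 2)
```

Such a term would typically be added to the diffusion objective with a small weight, e.g. `loss = mse + w * dispersive_loss(hidden)`; the weight `w` and the layer tapped are not determined by this diff.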
```diff
@@ -59,7 +59,7 @@ steps_offset = 1 # Scheduler
 limit = 0
 checkpoints_folder = ""
 mixed_precision = "bf16" #"fp16"
-gradient_accumulation_steps =
+gradient_accumulation_steps = 1
 accelerator = Accelerator(
     mixed_precision=mixed_precision,
     gradient_accumulation_steps=gradient_accumulation_steps
```
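With `gradient_accumulation_steps = 1` the accumulation machinery is effectively a pass-through, but since the value is handed to `Accelerator`, the training loop presumably wraps its step in accelerate's accumulation context. A sketch of that standard pattern, relying on the `unet`, `optimizer`, `lr_scheduler`, and `accelerator` objects defined in train.py (`train_dataloader` and `compute_loss` are stand-ins, not copied from the file):

```python
# Standard accelerate gradient-accumulation pattern.
for batch in train_dataloader:
    with accelerator.accumulate(unet):   # syncs and steps only every N micro-batches
        loss = compute_loss(unet, batch)  # stand-in for the diffusion loss
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
```

The effective batch per optimizer step is batch_size × gradient_accumulation_steps × number of processes, so 64 per process with this commit's values.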
```diff
@@ -522,6 +522,15 @@ else:
         (1 + math.cos(math.pi * decay_ratio))
 
 lr_scheduler = LambdaLR(optimizer, lambda step: lr_schedule(step) / base_learning_rate)
+
+num_params = sum(p.numel() for p in unet.parameters())
+print(f"[rank {accelerator.process_index}] total params: {num_params}")
+# Check for NaN/Inf
+for name, param in unet.named_parameters():
+    if torch.isnan(param).any() or torch.isinf(param).any():
+        print(f"[rank {accelerator.process_index}] NaN/Inf in {name}")
+        # Optionally: replace with normal values
+        # param.data = torch.randn_like(param) * 0.01
 unet, optimizer, lr_scheduler = accelerator.prepare(unet, optimizer, lr_scheduler)
 
 # Register hooks AFTER prepare
```
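Only the tail of `lr_schedule` is visible in this hunk. Given `base_learning_rate`, `min_learning_rate`, and the `(1 + math.cos(math.pi * decay_ratio))` factor, it is presumably a cosine decay from 8e-5 down to 4e-5; a sketch consistent with those pieces, with the warmup length and total-step bookkeeping as assumptions:

```python
import math

def lr_schedule(step, total_steps=10_000, warmup_steps=500):
    # Assumed shape: linear warmup, then cosine decay from
    # base_learning_rate down to min_learning_rate.
    if step < warmup_steps:
        return base_learning_rate * (step + 1) / warmup_steps
    decay_ratio = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    coeff = 0.5 * (1 + math.cos(math.pi * decay_ratio))  # goes 1 -> 0
    return min_learning_rate + coeff * (base_learning_rate - min_learning_rate)
```

The division by `base_learning_rate` inside the `LambdaLR` lambda is what makes this work: `LambdaLR` multiplies the optimizer's base lr by the lambda's return value, so a schedule that returns an absolute learning rate has to normalize it back out. Note also that the new NaN/Inf scan runs before `accelerator.prepare`, so it inspects the raw, unwrapped module's parameters.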