recoilme committed
Commit: 2f0548e · Parent: 6221858
2b/diffusion_pytorch_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:42851d6484909485ba508da8b9605eb46fc3ccb39e73d6b9f00e84bc0c7980a2
+oid sha256:33928b6c0aaa936c3f2f72ae338c4c3712c4b6fc97d3bf7ba2ba9b8ac116055e
 size 7993399544
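
Only the LFS pointer changes here: a 7,993,399,544-byte weights file is replaced by a new build of identical size, so the oid line is the whole diff. To confirm a local copy of the new weights matches the pointer, a minimal sketch (assuming the repo-relative path above; hashlib is standard library):

    import hashlib

    # Stream the ~8 GB safetensors file through SHA-256 in 1 MiB chunks.
    h = hashlib.sha256()
    with open("2b/diffusion_pytorch_model.safetensors", "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)

    # Must equal the oid on the "+" pointer line above.
    assert h.hexdigest() == "33928b6c0aaa936c3f2f72ae338c4c3712c4b6fc97d3bf7ba2ba9b8ac116055e"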
samples/2b_192x384_0.jpg CHANGED (Git LFS, pointer size 130 Bytes)
- SHA256: 6944b77a90142950be4b81e84672e26fbb9f96e4a084436a3764af4f7eaf6095 · remote file: 56.9 kB
+ SHA256: 28a41871421609e74dc2c62f849a7b238012e61bf2ea9fe1b46eb3701828ae5e · remote file: 27.6 kB
samples/2b_256x384_0.jpg CHANGED (Git LFS, pointer size 130 Bytes)
- SHA256: 67f02c0abb15ae99ae78f3282e936388d59890436a8e64e70dd50d9b40b8c548 · remote file: 42.3 kB
+ SHA256: e75bc4add640ba8d5b4befad448bfe65c2a91ee9157d3e4f8a29bb02dade7e8a · remote file: 59.8 kB
samples/2b_320x384_0.jpg CHANGED (Git LFS, pointer size 130 Bytes)
- SHA256: 974ac15500eab263ad53d4452c5a970ae5640e1d48947d2813d11465150a4233 · remote file: 88.6 kB
+ SHA256: a03435892f1dabf8d5693d7b7cbf3d97497bd9f4c73266ffa68cee1d7ed5b152 · remote file: 52.1 kB
samples/2b_384x192_0.jpg CHANGED (Git LFS, pointer size 130 Bytes)
- SHA256: a93196e24d75aa30c1dc7b65413263b3d30ed2722656070adefc92af52ea3c9b · remote file: 36 kB
+ SHA256: c5d23232196a42123d6ad33f1a22cb0becabf766079a4b1e401c9ec9b490a4d4 · remote file: 32.6 kB
samples/2b_384x256_0.jpg CHANGED (Git LFS, pointer size 130 Bytes)
- SHA256: 75b2668118eef096d72a40cf3745396829f45f8088c29da8c855ecd1a7badf26 · remote file: 32.1 kB
+ SHA256: 76ac2559a67e135f40214437e9402b629c471d2936372c565d96c505b86e04bc · remote file: 27.7 kB
samples/2b_384x320_0.jpg CHANGED (Git LFS, pointer size 130 Bytes)
- SHA256: 8c475062acdeade2619acd239a19542dae74dba82437567ad9be05ba260db9df · remote file: 71.4 kB
+ SHA256: 006ba80ec5c8fb610f02911f1a7161d2f77b9f628e39551620f8d59bf67b43be · remote file: 46.9 kB
samples/2b_384x384_0.jpg CHANGED (Git LFS, pointer size 130 Bytes)
- SHA256: e1fdb577583fd1c079d2df758722971aac90a855781d80ade94e89f11d76ba35 · remote file: 85.6 kB
+ SHA256: 0cd3ca05833375a7c43901c4809f4cd07b392aa11026e06b19f9303165203cdd · remote file: 41.9 kB
train.py CHANGED
@@ -27,10 +27,10 @@ import torch.nn.functional as F
 # --------------------------- Parameters ---------------------------
 ds_path = "datasets/384"
 project = "2b"
-batch_size = 32
+batch_size = 64
 base_learning_rate = 8e-5
 min_learning_rate = 4e-5
-num_epochs = 27
+num_epochs = 25
 # samples/save per epoch
 sample_interval_share = 5
 use_wandb = True
@@ -43,7 +43,7 @@ unet_gradient = True
 clip_sample = False #Scheduler
 fixed_seed = False
 shuffle = True
-dispersive_loss_enabled = False
+dispersive_loss_enabled = True
 torch.backends.cuda.matmul.allow_tf32 = True
 torch.backends.cudnn.allow_tf32 = True
 torch.backends.cuda.enable_mem_efficient_sdp(False)
@@ -59,7 +59,7 @@ steps_offset = 1 # Scheduler
 limit = 0
 checkpoints_folder = ""
 mixed_precision = "bf16" #"fp16"
-gradient_accumulation_steps = 2
+gradient_accumulation_steps = 1
 accelerator = Accelerator(
     mixed_precision=mixed_precision,
     gradient_accumulation_steps=gradient_accumulation_steps
@@ -522,6 +522,15 @@ else:
         (1 + math.cos(math.pi * decay_ratio))
 
 lr_scheduler = LambdaLR(optimizer, lambda step: lr_schedule(step) / base_learning_rate)
+
+num_params = sum(p.numel() for p in unet.parameters())
+print(f"[rank {accelerator.process_index}] total params: {num_params}")
+# Check parameters for NaN/Inf
+for name, param in unet.named_parameters():
+    if torch.isnan(param).any() or torch.isinf(param).any():
+        print(f"[rank {accelerator.process_index}] NaN/Inf in {name}")
+        # Optionally: replace with sane values
+        #param.data = torch.randn_like(param) * 0.01
 unet, optimizer, lr_scheduler = accelerator.prepare(unet, optimizer, lr_scheduler)
 
 # Register hooks AFTER prepare
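
Two notes on the hunks above. First, the batch_size/gradient_accumulation_steps swap leaves the effective per-device batch per optimizer step unchanged: 32 × 2 = 64 before and 64 × 1 = 64 after. The commit trades two accumulated micro-batches for one larger forward pass (more activation memory, one fewer backward per step); under Accelerate the global batch is this figure times the number of processes.

Second, dispersive_loss_enabled is flipped to True, but the loss itself sits outside this diff. As a point of reference only, not this repo's implementation, a common formulation is an InfoNCE-style repulsion on intermediate U-Net activations (Wang & He, "Dispersive Loss"); the function name and tau below are illustrative assumptions:

    import torch

    def dispersive_loss(h: torch.Tensor, tau: float = 0.5) -> torch.Tensor:
        # h: (B, ...) intermediate activations; flatten each sample to a vector.
        z = h.flatten(start_dim=1)
        # Pairwise squared L2 distances within the batch.
        d2 = torch.cdist(z, z, p=2.0) ** 2
        # Log-mean-exp repulsion: decreases as representations spread apart.
        return torch.log(torch.exp(-d2 / tau).mean())

In that formulation the term is added to the diffusion MSE with a small weight, e.g. loss = mse_loss + lambda_disp * dispersive_loss(h).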