Saving train state of step 2000

Files changed (5) hide show

checkpoint-2000-epoch-3/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3383ce8239fd9a5346296993f4068931faff75aafbb1e863f55802be68be183b
 size 3025686376

 version https://git-lfs.github.com/spec/v1
+oid sha256:93a3ac6fa4c717512dc0856ac258d0d555a67425f9fa46cde554fe5712a0b37f
 size 3025686376

checkpoint-2000-epoch-3/model_1.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:56570ecf66e2cbf1e212810317afdc44b85396298beab22e66ff759a1116f26a
-size 4361069272

 version https://git-lfs.github.com/spec/v1
+oid sha256:28897ec4b789c0dc382a6975366fcb16206be64b6b691a60b218831c8f6af1ea
+size 4361070048

checkpoint-2000-epoch-3/optimizer.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:69da6783a2bb6483a2623217bf874ce7cd7d99e80a36638e0ffcc67bf80de6e7
 size 950951226

 version https://git-lfs.github.com/spec/v1
+oid sha256:294e12cadb7ecb51806f8ec6010da51f2efb89c17b28f09fbe6f861bb53a37b9
 size 950951226

checkpoint-2000-epoch-3/scheduler.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8dad6b56b74593b411aa2335a4636d028f73ce8d740f99b52582f884503cebaa
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:2e72412f426c58539f1dcfef4d31369e79764f60ce3a6e20df06cde830d8946e
 size 1064

run_large_training.sh CHANGED Viewed

@@ -29,8 +29,8 @@ accelerate launch run_distillation.py \
   --dataloader_num_workers 8 \
   --preprocessing_num_workers 8 \
   --ddp_timeout 7200 \
-  --dtype "float16" \
-  --attn_implementation "flash_attention_2" \
   --output_dir "./" \
   --do_train \
   --do_eval \

   --dataloader_num_workers 8 \
   --preprocessing_num_workers 8 \
   --ddp_timeout 7200 \
+  --dtype "bfloat16" \
+  --attn_implementation "sdpa" \
   --output_dir "./" \
   --do_train \
   --do_eval \