Saving train state of step 2000
Browse files
checkpoint-2000-epoch-3/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 3025686376
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:93a3ac6fa4c717512dc0856ac258d0d555a67425f9fa46cde554fe5712a0b37f
|
3 |
size 3025686376
|
checkpoint-2000-epoch-3/model_1.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:28897ec4b789c0dc382a6975366fcb16206be64b6b691a60b218831c8f6af1ea
|
3 |
+
size 4361070048
|
checkpoint-2000-epoch-3/optimizer.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 950951226
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:294e12cadb7ecb51806f8ec6010da51f2efb89c17b28f09fbe6f861bb53a37b9
|
3 |
size 950951226
|
checkpoint-2000-epoch-3/scheduler.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2e72412f426c58539f1dcfef4d31369e79764f60ce3a6e20df06cde830d8946e
|
3 |
size 1064
|
run_large_training.sh
CHANGED
@@ -29,8 +29,8 @@ accelerate launch run_distillation.py \
|
|
29 |
--dataloader_num_workers 8 \
|
30 |
--preprocessing_num_workers 8 \
|
31 |
--ddp_timeout 7200 \
|
32 |
-
--dtype "
|
33 |
-
--attn_implementation "
|
34 |
--output_dir "./" \
|
35 |
--do_train \
|
36 |
--do_eval \
|
|
|
29 |
--dataloader_num_workers 8 \
|
30 |
--preprocessing_num_workers 8 \
|
31 |
--ddp_timeout 7200 \
|
32 |
+
--dtype "bfloat16" \
|
33 |
+
--attn_implementation "sdpa" \
|
34 |
--output_dir "./" \
|
35 |
--do_train \
|
36 |
--do_eval \
|