Upload torchtitan_train_config.toml with huggingface_hub
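The commit message indicates the file was pushed with the huggingface_hub Python client. A minimal sketch of such an upload, assuming you are already authenticated (e.g. via huggingface-cli login) and using a placeholder repo id (your-username/llama3-1b-torchtitan), not the actual target repo:

# Sketch: push the config file to a Hub repo with huggingface_hub.
# The repo_id below is a placeholder, not the repository this commit belongs to.
from huggingface_hub import HfApi

api = HfApi()
api.upload_file(
    path_or_fileobj="torchtitan_train_config.toml",  # local file to upload
    path_in_repo="torchtitan_train_config.toml",     # destination path inside the repo
    repo_id="your-username/llama3-1b-torchtitan",    # placeholder; replace with your repo
    repo_type="model",
    commit_message="Upload torchtitan_train_config.toml with huggingface_hub",
)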
torchtitan_train_config.toml ADDED (+64 -0)
# torchtitan Config.toml
# NOTE: this toml config is a preset for 64 A100 GPUs.

[job]
dump_folder = "./llama3_1b_c_fixed_output"
description = "Llama 3 1B training cautious"
enable_wandb = true

[profiling]
enable_profiling = true
save_traces_folder = "profile_trace"
profile_freq = 100

[metrics]
log_freq = 10
enable_tensorboard = true
save_tb_folder = "tb"

[model]
name = "llama3"
flavor = "1B"
tokenizer_path = "./assets/tokenizer/Llama-3.1-8B"
# converters = ["float8"]

[optimizer]
name = "C_AdamW"
lr = 3e-4
eps = 1e-8

[lr_scheduler]
warmup_steps = 2000  # lr scheduler warm up

[training]
local_batch_size = 4
global_batch_size = 128
seq_len = 8192
max_norm = 1.0  # grad norm clipping
steps = 20000
compile = true
dataset = "fineweb_edu_100bt"

[parallelism]
data_parallel_replicate_degree = 1
data_parallel_shard_degree = -1
tensor_parallel_degree = 1
pipeline_parallel_degree = 1
context_parallel_degree = 1

[checkpoint]
enable_checkpoint = true
folder = "./checkpoint"
interval = 500
last_save_model_only = true
export_dtype = "float32"
async_mode = "disabled"  # ["disabled", "async", "async_with_pinned_mem"]

[activation_checkpoint]
mode = "selective"  # ["none", "selective", "full"]
selective_ac_option = "op"  # "int" = ac every positive int layer or 'op', ac based on ops policy

[float8]
enable_fsdp_float8_all_gather = false
precompute_float8_dynamic_scale_for_fsdp = false
filter_fqns = ["output"]
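Since the file is plain TOML, it can be sanity-checked before launching a run with Python's standard-library parser (tomllib, Python 3.11+). A small sketch, assuming the config sits in the working directory and that global_batch_size counts sequences per optimizer step:

# Sketch: parse the config and print the headline training settings.
# Uses only the standard library; tokens-per-step assumes global_batch_size
# is the number of sequences consumed per optimizer step.
import tomllib

with open("torchtitan_train_config.toml", "rb") as f:
    cfg = tomllib.load(f)

training = cfg["training"]
print("model flavor:      ", cfg["model"]["flavor"])
print("optimizer:         ", cfg["optimizer"]["name"])
print("seq_len:           ", training["seq_len"])
print("local batch size:  ", training["local_batch_size"])
print("global batch size: ", training["global_batch_size"])
print("tokens per step:   ", training["global_batch_size"] * training["seq_len"])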