kz919 committed
Commit bea307a · verified · 1 Parent(s): e9595ca

Upload torchtitan_train_config.toml with huggingface_hub

Files changed (1)
  1. torchtitan_train_config.toml +64 -0
torchtitan_train_config.toml ADDED
@@ -0,0 +1,64 @@
+ # torchtitan Config.toml
+ # NOTE: this toml config is a preset for 64 A100 GPUs.
+
+ [job]
+ dump_folder = "./llama3_1b_c_fixed_output"
+ description = "Llama 3 1B training cautious"
+ enable_wandb = true
+
+ [profiling]
+ enable_profiling = true
+ save_traces_folder = "profile_trace"
+ profile_freq = 100
+
+ [metrics]
+ log_freq = 10
+ enable_tensorboard = true
+ save_tb_folder = "tb"
+
+ [model]
+ name = "llama3"
+ flavor = "1B"
+ tokenizer_path = "./assets/tokenizer/Llama-3.1-8B"
+ # converters = ["float8"]
+
+ [optimizer]
+ name = "C_AdamW"
+ lr = 3e-4
+ eps = 1e-8
+
+ [lr_scheduler]
+ warmup_steps = 2000 # lr scheduler warm-up
+
+ [training]
+ local_batch_size = 4
+ global_batch_size = 128
+ seq_len = 8192
+ max_norm = 1.0 # grad norm clipping
+ steps = 20000
+ compile = true
+ dataset = "fineweb_edu_100bt"
+
+ [parallelism]
+ data_parallel_replicate_degree = 1
+ data_parallel_shard_degree = -1
+ tensor_parallel_degree = 1
+ pipeline_parallel_degree = 1
+ context_parallel_degree = 1
+
+ [checkpoint]
+ enable_checkpoint = true
+ folder = "./checkpoint"
+ interval = 500
+ last_save_model_only = true
+ export_dtype = "float32"
+ async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem"]
+
+ [activation_checkpoint]
+ mode = "selective" # ["none", "selective", "full"]
+ selective_ac_option = "op" # an integer N = checkpoint every N-th layer; "op" = checkpoint per op-level policy
+
+ [float8]
+ enable_fsdp_float8_all_gather = false
+ precompute_float8_dynamic_scale_for_fsdp = false
+ filter_fqns = ["output"]