wissamantoun committed (verified)
Commit 266f822 · Parent(s): ce44261

Upload 3 files
camembertv2-base-p2.yaml ADDED
@@ -0,0 +1,175 @@
data_local: /scratch/playground/data/bin_v2old_data_long/
data_remote: # If blank, files must be present in data_local

max_seq_len: 8192
tokenizer_name: tokenizer/camembertv2
mlm_probability: 0.3 # FlexBERT should use 30% masking for optimal performance
count_padding_tokens: false
reset_time: true
restart_override: true
reset_dataloader: true

# Run Name
run_name: camembertv2-base-p2
pretrain_run_name: camembertv2-base
load_path: /scratch/playground/checkpoints/${pretrain_run_name}/latest-rank0.pt

# Model
model:
  name: flex_bert
  pretrained_model_name: configs/bert-base-uncased
  tokenizer_name: ${tokenizer_name}
  disable_train_metrics: true
  # FlexBERT 'base' generally uses the default architecture values from the Hugging Face BertConfig object
  # Note: if using the pretrained_checkpoint argument to create a model from an existing checkpoint, make sure
  # the model_config settings match the architecture of the existing model
  model_config:
    vocab_size: 32768
    init_method: full_megatron
    num_hidden_layers: 22
    hidden_size: 768
    intermediate_size: 1152
    num_attention_heads: 12 # to have head size of 64
    attention_layer: rope
    attention_probs_dropout_prob: 0.0
    attn_out_bias: false
    attn_out_dropout_prob: 0.1
    attn_qkv_bias: false
    bert_layer: prenorm
    embed_dropout_prob: 0.0
    embed_norm: true
    final_norm: true
    skip_first_prenorm: true
    embedding_layer: sans_pos
    loss_function: fa_cross_entropy
    loss_kwargs:
      reduction: mean
    mlp_dropout_prob: 0.0
    mlp_in_bias: false
    mlp_layer: glu
    mlp_out_bias: false
    normalization: layernorm
    norm_kwargs:
      eps: 1e-5
      bias: false
    hidden_act: gelu
    head_pred_act: gelu
    activation_function: gelu # better safe than sorry
    padding: unpadded
    rotary_emb_dim: null
    rotary_emb_base: 160000.0
    rotary_emb_scale_base: null
    rotary_emb_interleaved: false
    local_attn_rotary_emb_base: 10000.0
    allow_embedding_resizing: true
    sliding_window: 128
    global_attn_every_n_layers: 3
    unpad_embeddings: true
    compile_model: true
    masked_prediction: true

# Dataloaders
train_loader:
  name: litdata_tokenized
  dataset:
    local: ${data_local}
    remote: ${data_remote}
    split: train
    tokenizer_name: ${tokenizer_name}
    max_seq_len: ${max_seq_len}
    shuffle: true
    mlm_probability: ${mlm_probability}
    streaming: false
    shuffle_seed: ${seed}
    seed: ${seed}
  drop_last: true
  num_workers: 12
  sequence_packing: true
  # batch_size_warmup_min_size: 96
  # batch_size_warmup_tokens: 50_000_000_000tok

eval_loader:
  name: litdata_tokenized
  dataset:
    local: ${data_local}
    remote: ${data_remote}
    split: validation
    tokenizer_name: ${tokenizer_name}
    max_seq_len: ${max_seq_len}
    shuffle: false
    mlm_probability: 0.15 # We always evaluate at 15% masking for consistent comparison
    streaming: false
    seed: ${seed}
  drop_last: false
  num_workers: 12
  sequence_packing: false

# Optimization
scheduler:
  name: warmup_stable_decay
  t_warmup: 0tok
  alpha_f: 0.00 # Final LR multiplier at the end of the decay (0 = decay to zero)
  t_decay: 0tok

optimizer:
  name: decoupled_adamw
  lr: 3e-4 # Peak learning rate
  betas:
  - 0.9
  - 0.98
  eps: 1.0e-06
  weight_decay: 1.0e-5 # Amount of weight decay regularization
  filter_bias_norm_wd: true # If True, doesn't apply weight decay to norm layers and biases
  log_grad_norm: true

max_duration: 150_000_000_000tok
eval_interval: 5000ba
global_train_batch_size: 768
global_eval_batch_size: 6144

# System
seed: 25
device_eval_batch_size: 32
device_train_microbatch_size: 8
precision: amp_bf16

# Logging
progress_bar: false
log_to_console: true
console_log_interval: 100ba

callbacks:
  runtime_estimator: {}
  dataloader_speed: {}
  speed_monitor:
    window_size: 100
  lr_monitor: {}
  scheduled_gc: {}
  # log_grad_norm:
  #   batch_log_interval: 100
  packing_efficiency:
    log_interval: 100

loggers:
  # wandb:
  #   project: fr_modernbert
  #   entity: wissam
  tensorboard:
    log_dir: /scratch/playground/logs/tensorboard/p2/

autoresume: true
# Checkpoint to local filesystem or remote object store
save_interval: 5000ba
save_num_checkpoints_to_keep: -1 # -1 keeps all checkpoints on disk; set a positive value to clean up older ones
save_folder: /scratch/playground/checkpoints/{run_name}
# Load from local filesystem or remote object store
# load_path: null

parallelism_config:
  fsdp:
    sharding_strategy: "FULL_SHARD"
    state_dict_type: "sharded"
    # mixed_precision:
    #   param_dtype: bf16
    #   reduce_dtype: bf16
    #   buffer_dtype: bf16
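For a rough sense of scale, the token-based max_duration above maps to an approximate optimizer-step count once the global batch size and sequence length are fixed. A minimal back-of-the-envelope in Python, assuming perfectly packed batches of global_train_batch_size × max_seq_len tokens (real packing efficiency below 1.0 raises the step count):

# Hypothetical helper, not part of the repo: estimate optimizer steps for the p2 phase.
max_duration_tok = 150_000_000_000    # max_duration
global_train_batch_size = 768         # sequences per optimizer step
max_seq_len = 8192                    # tokens per packed sequence (upper bound)

tokens_per_step = global_train_batch_size * max_seq_len  # ~6.29M tokens at 100% packing
steps = max_duration_tok / tokens_per_step
print(f"~{steps:,.0f} optimizer steps")                   # ~23,842 steps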
camembertv2-base-p3.yaml ADDED
@@ -0,0 +1,175 @@
data_local: /scratch/playground/data/bin_v2old_data_high/
data_remote: # If blank, files must be present in data_local

max_seq_len: 8192
tokenizer_name: tokenizer/camembertv2
mlm_probability: 0.3 # FlexBERT should use 30% masking for optimal performance
count_padding_tokens: false
reset_time: true
restart_override: true
reset_dataloader: true

# Run Name
run_name: camembertv2-base-p3
pretrain_run_name: camembertv2-base-p2
load_path: /scratch/playground/checkpoints/${pretrain_run_name}/latest-rank0.pt

# Model
model:
  name: flex_bert
  pretrained_model_name: configs/bert-base-uncased
  tokenizer_name: ${tokenizer_name}
  disable_train_metrics: true
  # FlexBERT 'base' generally uses the default architecture values from the Hugging Face BertConfig object
  # Note: if using the pretrained_checkpoint argument to create a model from an existing checkpoint, make sure
  # the model_config settings match the architecture of the existing model
  model_config:
    vocab_size: 32768
    init_method: full_megatron
    num_hidden_layers: 22
    hidden_size: 768
    intermediate_size: 1152
    num_attention_heads: 12 # to have head size of 64
    attention_layer: rope
    attention_probs_dropout_prob: 0.0
    attn_out_bias: false
    attn_out_dropout_prob: 0.1
    attn_qkv_bias: false
    bert_layer: prenorm
    embed_dropout_prob: 0.0
    embed_norm: true
    final_norm: true
    skip_first_prenorm: true
    embedding_layer: sans_pos
    loss_function: fa_cross_entropy
    loss_kwargs:
      reduction: mean
    mlp_dropout_prob: 0.0
    mlp_in_bias: false
    mlp_layer: glu
    mlp_out_bias: false
    normalization: layernorm
    norm_kwargs:
      eps: 1e-5
      bias: false
    hidden_act: gelu
    head_pred_act: gelu
    activation_function: gelu # better safe than sorry
    padding: unpadded
    rotary_emb_dim: null
    rotary_emb_base: 160000.0
    rotary_emb_scale_base: null
    rotary_emb_interleaved: false
    local_attn_rotary_emb_base: 10000.0
    allow_embedding_resizing: true
    sliding_window: 128
    global_attn_every_n_layers: 3
    unpad_embeddings: true
    compile_model: true
    masked_prediction: true

# Dataloaders
train_loader:
  name: litdata_tokenized
  dataset:
    local: ${data_local}
    remote: ${data_remote}
    split: train
    tokenizer_name: ${tokenizer_name}
    max_seq_len: ${max_seq_len}
    shuffle: true
    mlm_probability: ${mlm_probability}
    streaming: false
    shuffle_seed: ${seed}
    seed: ${seed}
  drop_last: true
  num_workers: 12
  sequence_packing: true
  # batch_size_warmup_min_size: 96
  # batch_size_warmup_tokens: 50_000_000_000tok

eval_loader:
  name: litdata_tokenized
  dataset:
    local: ${data_local}
    remote: ${data_remote}
    split: validation
    tokenizer_name: ${tokenizer_name}
    max_seq_len: ${max_seq_len}
    shuffle: false
    mlm_probability: 0.15 # We always evaluate at 15% masking for consistent comparison
    streaming: false
    seed: ${seed}
  drop_last: false
  num_workers: 12
  sequence_packing: false

# Optimization
scheduler:
  name: one_minus_sqrt
  t_decay: 50_000_000_000tok
  alpha_f: 0.00 # Final LR multiplier at the end of the decay (0 = decay to zero)
  t_max: 100_000_000_000tok

optimizer:
  name: decoupled_adamw
  lr: 3e-4 # Peak learning rate
  betas:
  - 0.9
  - 0.98
  eps: 1.0e-06
  weight_decay: 1.0e-5 # Amount of weight decay regularization
  filter_bias_norm_wd: true # If True, doesn't apply weight decay to norm layers and biases
  log_grad_norm: true

max_duration: 100_000_000_000tok
eval_interval: 5000ba
global_train_batch_size: 768
global_eval_batch_size: 6144

# System
seed: 25
device_eval_batch_size: 32
device_train_microbatch_size: 8
precision: amp_bf16

# Logging
progress_bar: false
log_to_console: true
console_log_interval: 100ba

callbacks:
  runtime_estimator: {}
  dataloader_speed: {}
  speed_monitor:
    window_size: 100
  lr_monitor: {}
  scheduled_gc: {}
  # log_grad_norm:
  #   batch_log_interval: 100
  packing_efficiency:
    log_interval: 100

loggers:
  # wandb:
  #   project: fr_modernbert
  #   entity: wissam
  tensorboard:
    log_dir: /scratch/playground/logs/tensorboard/p3/

autoresume: true
# Checkpoint to local filesystem or remote object store
save_interval: 5000ba
save_num_checkpoints_to_keep: -1 # -1 keeps all checkpoints on disk; set a positive value to clean up older ones
save_folder: /scratch/playground/checkpoints/{run_name}
# Load from local filesystem or remote object store
# load_path: null

parallelism_config:
  fsdp:
    sharding_strategy: "FULL_SHARD"
    state_dict_type: "sharded"
    # mixed_precision:
    #   param_dtype: bf16
    #   reduce_dtype: bf16
    #   buffer_dtype: bf16
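The p3 phase replaces warmup_stable_decay with a one_minus_sqrt schedule that decays over the final 50B tokens of the 100B-token budget. A sketch of what a 1-sqrt multiplier typically looks like, assuming the decay window spans the last t_decay tokens of t_max and ignoring any alpha_f flooring subtleties; the scheduler implementation in the training code remains the authoritative definition:

import math

def one_minus_sqrt_multiplier(t_tok, t_max=100e9, t_decay=50e9, alpha_f=0.0):
    """Assumed 1-sqrt LR multiplier: flat until the decay window, then 1 - sqrt(progress)."""
    decay_start = t_max - t_decay
    if t_tok <= decay_start:
        return 1.0
    frac = min((t_tok - decay_start) / t_decay, 1.0)
    return max(1.0 - math.sqrt(frac), alpha_f)

print(one_minus_sqrt_multiplier(25e9))   # 1.0 (stable phase)
print(one_minus_sqrt_multiplier(75e9))   # ~0.29 (halfway through the decay)
print(one_minus_sqrt_multiplier(100e9))  # 0.0 (end of training)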
camembertv2-base.yaml ADDED
@@ -0,0 +1,169 @@
data_local: /scratch/playground/data/bin_v2old_data/
data_remote: # If blank, files must be present in data_local

max_seq_len: 1024
tokenizer_name: tokenizer/camembertv2
mlm_probability: 0.3 # FlexBERT should use 30% masking for optimal performance
count_padding_tokens: false

# Run Name
run_name: camembertv2-base

# Model
model:
  name: flex_bert
  pretrained_model_name: configs/bert-base-uncased
  tokenizer_name: ${tokenizer_name}
  disable_train_metrics: true
  # FlexBERT 'base' generally uses the default architecture values from the Hugging Face BertConfig object
  # Note: if using the pretrained_checkpoint argument to create a model from an existing checkpoint, make sure
  # the model_config settings match the architecture of the existing model
  model_config:
    vocab_size: 32768
    init_method: full_megatron
    num_hidden_layers: 22
    hidden_size: 768
    intermediate_size: 1152
    num_attention_heads: 12 # to have head size of 64
    attention_layer: rope
    attention_probs_dropout_prob: 0.0
    attn_out_bias: false
    attn_out_dropout_prob: 0.1
    attn_qkv_bias: false
    bert_layer: prenorm
    embed_dropout_prob: 0.0
    embed_norm: true
    final_norm: true
    skip_first_prenorm: true
    embedding_layer: sans_pos
    loss_function: fa_cross_entropy
    loss_kwargs:
      reduction: mean
    mlp_dropout_prob: 0.0
    mlp_in_bias: false
    mlp_layer: glu
    mlp_out_bias: false
    normalization: layernorm
    norm_kwargs:
      eps: 1e-5
      bias: false
    hidden_act: gelu
    head_pred_act: gelu
    activation_function: gelu # better safe than sorry
    padding: unpadded
    rotary_emb_dim: null
    rotary_emb_base: 10000.0
    rotary_emb_scale_base: null
    rotary_emb_interleaved: false
    allow_embedding_resizing: true
    sliding_window: 128
    global_attn_every_n_layers: 3
    unpad_embeddings: true
    compile_model: true
    masked_prediction: true

# Dataloaders
train_loader:
  name: litdata_tokenized
  dataset:
    local: ${data_local}
    remote: ${data_remote}
    split: train
    tokenizer_name: ${tokenizer_name}
    max_seq_len: ${max_seq_len}
    shuffle: true
    mlm_probability: ${mlm_probability}
    streaming: false
    shuffle_seed: ${seed}
    seed: ${seed}
  drop_last: true
  num_workers: 12
  sequence_packing: true
  # batch_size_warmup_min_size: 96
  # batch_size_warmup_tokens: 50_000_000_000tok

eval_loader:
  name: litdata_tokenized
  dataset:
    local: ${data_local}
    remote: ${data_remote}
    split: validation
    tokenizer_name: ${tokenizer_name}
    max_seq_len: ${max_seq_len}
    shuffle: false
    mlm_probability: 0.15 # We always evaluate at 15% masking for consistent comparison
    streaming: false
    seed: ${seed}
  drop_last: false
  num_workers: 12
  sequence_packing: false

# Optimization
scheduler:
  name: warmup_stable_decay
  t_warmup: 3_000_000_000tok
  alpha_f: 0.00 # Final LR multiplier at the end of the decay (0 = decay to zero)
  t_decay: 0tok

optimizer:
  name: decoupled_adamw
  lr: 8e-4 # Peak learning rate
  betas:
  - 0.9
  - 0.98
  eps: 1.0e-06
  weight_decay: 1.0e-5 # Amount of weight decay regularization
  filter_bias_norm_wd: true # If True, doesn't apply weight decay to norm layers and biases
  log_grad_norm: true

max_duration: 1_000_000_000_000tok
eval_interval: 5000ba
global_train_batch_size: 4608
global_eval_batch_size: 6144

# System
seed: 25
device_eval_batch_size: 128
device_train_microbatch_size: 96
precision: amp_bf16

# Logging
progress_bar: true
log_to_console: true
console_log_interval: 100ba

callbacks:
  runtime_estimator: {}
  dataloader_speed: {}
  speed_monitor:
    window_size: 100
  lr_monitor: {}
  scheduled_gc: {}
  # log_grad_norm:
  #   batch_log_interval: 100
  packing_efficiency:
    log_interval: 100

loggers:
  # wandb:
  #   project: fr_modernbert
  #   entity: wissam
  tensorboard:
    log_dir: /scratch/playground/logs/tensorboard/

autoresume: true
# Checkpoint to local filesystem or remote object store
save_interval: 5000ba
save_num_checkpoints_to_keep: -1 # -1 keeps all checkpoints on disk; set a positive value to clean up older ones
save_folder: /scratch/playground/checkpoints/{run_name}
# Load from local filesystem or remote object store
# load_path: null

parallelism_config:
  fsdp:
    sharding_strategy: "FULL_SHARD"
    state_dict_type: "sharded"
    # mixed_precision:
    #   param_dtype: bf16
    #   reduce_dtype: bf16
    #   buffer_dtype: bf16
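All three configs lean heavily on ${...} interpolation (${tokenizer_name}, ${seed}, ${pretrain_run_name}, ...). A minimal sketch of how those references resolve, assuming the YAMLs are consumed through OmegaConf as in MosaicML-style training scripts; the actual entry point in the training repo may wire this up differently:

from omegaconf import OmegaConf

# Load the base-phase config and resolve ${...} references in place (assumed consumption path).
cfg = OmegaConf.load("camembertv2-base.yaml")
OmegaConf.resolve(cfg)

print(cfg.train_loader.dataset.tokenizer_name)  # tokenizer/camembertv2 (via ${tokenizer_name})
print(cfg.train_loader.dataset.seed)            # 25 (via ${seed})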