Upload 3 files
- camembertv2-base-p2.yaml +175 -0
- camembertv2-base-p3.yaml +175 -0
- camembertv2-base.yaml +169 -0
camembertv2-base-p2.yaml
ADDED
@@ -0,0 +1,175 @@
data_local: /scratch/playground/data/bin_v2old_data_long/
data_remote: # If blank, files must be present in data_local

max_seq_len: 8192
tokenizer_name: tokenizer/camembertv2
mlm_probability: 0.3 # FlexBERT should use 30% masking for optimal performance
count_padding_tokens: false
reset_time: true
restart_override: true
reset_dataloader: true

# Run Name
run_name: camembertv2-base-p2
pretrain_run_name: camembertv2-base
load_path: /scratch/playground/checkpoints/${pretrain_run_name}/latest-rank0.pt

# Model
model:
  name: flex_bert
  pretrained_model_name: configs/bert-base-uncased
  tokenizer_name: ${tokenizer_name}
  disable_train_metrics: true
  # FlexBERT 'base' generally uses the default architecture values from the Hugging Face BertConfig object
  # Note: if using the pretrained_checkpoint argument to create a model from an existing checkpoint, make sure
  # the model_config settings match the architecture of the existing model
  model_config:
    vocab_size: 32768
    init_method: full_megatron
    num_hidden_layers: 22
    hidden_size: 768
    intermediate_size: 1152
    num_attention_heads: 12 # to have head size of 64
    attention_layer: rope
    attention_probs_dropout_prob: 0.0
    attn_out_bias: false
    attn_out_dropout_prob: 0.1
    attn_qkv_bias: false
    bert_layer: prenorm
    embed_dropout_prob: 0.0
    embed_norm: true
    final_norm: true
    skip_first_prenorm: true
    embedding_layer: sans_pos
    loss_function: fa_cross_entropy
    loss_kwargs:
      reduction: mean
    mlp_dropout_prob: 0.0
    mlp_in_bias: false
    mlp_layer: glu
    mlp_out_bias: false
    normalization: layernorm
    norm_kwargs:
      eps: 1e-5
      bias: false
    hidden_act: gelu
    head_pred_act: gelu
    activation_function: gelu # better safe than sorry
    padding: unpadded
    rotary_emb_dim: null
    rotary_emb_base: 160000.0
    rotary_emb_scale_base: null
    rotary_emb_interleaved: false
    local_attn_rotary_emb_base: 10000.0
    allow_embedding_resizing: true
    sliding_window: 128
    global_attn_every_n_layers: 3
    unpad_embeddings: true
    compile_model: true
    masked_prediction: true

# Dataloaders
train_loader:
  name: litdata_tokenized
  dataset:
    local: ${data_local}
    remote: ${data_remote}
    split: train
    tokenizer_name: ${tokenizer_name}
    max_seq_len: ${max_seq_len}
    shuffle: true
    mlm_probability: ${mlm_probability}
    streaming: false
    shuffle_seed: ${seed}
    seed: ${seed}
  drop_last: true
  num_workers: 12
  sequence_packing: true
  # batch_size_warmup_min_size: 96
  # batch_size_warmup_tokens: 50_000_000_000tok

eval_loader:
  name: litdata_tokenized
  dataset:
    local: ${data_local}
    remote: ${data_remote}
    split: validation
    tokenizer_name: ${tokenizer_name}
    max_seq_len: ${max_seq_len}
    shuffle: false
    mlm_probability: 0.15 # We always evaluate at 15% masking for consistent comparison
    streaming: false
    seed: ${seed}
  drop_last: false
  num_workers: 12
  sequence_packing: false

# Optimization
scheduler:
  name: warmup_stable_decay
  t_warmup: 0tok
  alpha_f: 0.00 # Linearly decay to 0.02x the full LR by the end of the training duration
  t_decay: 0tok

optimizer:
  name: decoupled_adamw
  lr: 3e-4 # Peak learning rate
  betas:
  - 0.9
  - 0.98
  eps: 1.0e-06
  weight_decay: 1.0e-5 # Amount of weight decay regularization
  filter_bias_norm_wd: true # If True, doesn't apply weight decay to norm layers and biases
  log_grad_norm: true

max_duration: 150_000_000_000tok
eval_interval: 5000ba
global_train_batch_size: 768
global_eval_batch_size: 6144

# System
seed: 25
device_eval_batch_size: 32
device_train_microbatch_size: 8
precision: amp_bf16

# Logging
progress_bar: false
log_to_console: true
console_log_interval: 100ba

callbacks:
  runtime_estimator: {}
  dataloader_speed: {}
  speed_monitor:
    window_size: 100
  lr_monitor: {}
  scheduled_gc: {}
  # log_grad_norm:
  #   batch_log_interval: 100
  packing_efficiency:
    log_interval: 100

loggers:
  # wandb:
  #   project: fr_modernbert
  #   entity: wissam
  tensorboard:
    log_dir: /scratch/playground/logs/tensorboard/p2/

autoresume: true
# Checkpoint to local filesystem or remote object store
save_interval: 5000ba
save_num_checkpoints_to_keep: -1 # Important, this cleans up checkpoints saved to DISK
save_folder: /scratch/playground/checkpoints/{run_name}
# Load from local filesystem or remote object store to
# load_path: null

parallelism_config:
  fsdp:
    sharding_strategy: "FULL_SHARD"
    state_dict_type: "sharded"
    # mixed_precision:
    #   param_dtype: bf16
    #   reduce_dtype: bf16
    #   buffer_dtype: bf16
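The ${...} references above are OmegaConf-style interpolations, which Composer-based training scripts typically resolve when the config is loaded. A minimal sketch of how they resolve, assuming the file is read with omegaconf (the file path below is illustrative, not part of the repo):

from omegaconf import OmegaConf

# Load the config; interpolations resolve lazily on access.
cfg = OmegaConf.load("camembertv2-base-p2.yaml")

# ${pretrain_run_name} -> "camembertv2-base", so phase 2 starts from the phase-1 checkpoint.
print(cfg.load_path)
# /scratch/playground/checkpoints/camembertv2-base/latest-rank0.pt

# Nested keys reuse top-level values the same way:
print(cfg.train_loader.dataset.seed)  # 25, from the top-level ${seed}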
camembertv2-base-p3.yaml
ADDED
@@ -0,0 +1,175 @@
data_local: /scratch/playground/data/bin_v2old_data_high/
data_remote: # If blank, files must be present in data_local

max_seq_len: 8192
tokenizer_name: tokenizer/camembertv2
mlm_probability: 0.3 # FlexBERT should use 30% masking for optimal performance
count_padding_tokens: false
reset_time: true
restart_override: true
reset_dataloader: true

# Run Name
run_name: camembertv2-base-p3
pretrain_run_name: camembertv2-base-p2
load_path: /scratch/playground/checkpoints/${pretrain_run_name}/latest-rank0.pt

# Model
model:
  name: flex_bert
  pretrained_model_name: configs/bert-base-uncased
  tokenizer_name: ${tokenizer_name}
  disable_train_metrics: true
  # FlexBERT 'base' generally uses the default architecture values from the Hugging Face BertConfig object
  # Note: if using the pretrained_checkpoint argument to create a model from an existing checkpoint, make sure
  # the model_config settings match the architecture of the existing model
  model_config:
    vocab_size: 32768
    init_method: full_megatron
    num_hidden_layers: 22
    hidden_size: 768
    intermediate_size: 1152
    num_attention_heads: 12 # to have head size of 64
    attention_layer: rope
    attention_probs_dropout_prob: 0.0
    attn_out_bias: false
    attn_out_dropout_prob: 0.1
    attn_qkv_bias: false
    bert_layer: prenorm
    embed_dropout_prob: 0.0
    embed_norm: true
    final_norm: true
    skip_first_prenorm: true
    embedding_layer: sans_pos
    loss_function: fa_cross_entropy
    loss_kwargs:
      reduction: mean
    mlp_dropout_prob: 0.0
    mlp_in_bias: false
    mlp_layer: glu
    mlp_out_bias: false
    normalization: layernorm
    norm_kwargs:
      eps: 1e-5
      bias: false
    hidden_act: gelu
    head_pred_act: gelu
    activation_function: gelu # better safe than sorry
    padding: unpadded
    rotary_emb_dim: null
    rotary_emb_base: 160000.0
    rotary_emb_scale_base: null
    rotary_emb_interleaved: false
    local_attn_rotary_emb_base: 10000.0
    allow_embedding_resizing: true
    sliding_window: 128
    global_attn_every_n_layers: 3
    unpad_embeddings: true
    compile_model: true
    masked_prediction: true

# Dataloaders
train_loader:
  name: litdata_tokenized
  dataset:
    local: ${data_local}
    remote: ${data_remote}
    split: train
    tokenizer_name: ${tokenizer_name}
    max_seq_len: ${max_seq_len}
    shuffle: true
    mlm_probability: ${mlm_probability}
    streaming: false
    shuffle_seed: ${seed}
    seed: ${seed}
  drop_last: true
  num_workers: 12
  sequence_packing: true
  # batch_size_warmup_min_size: 96
  # batch_size_warmup_tokens: 50_000_000_000tok

eval_loader:
  name: litdata_tokenized
  dataset:
    local: ${data_local}
    remote: ${data_remote}
    split: validation
    tokenizer_name: ${tokenizer_name}
    max_seq_len: ${max_seq_len}
    shuffle: false
    mlm_probability: 0.15 # We always evaluate at 15% masking for consistent comparison
    streaming: false
    seed: ${seed}
  drop_last: false
  num_workers: 12
  sequence_packing: false

# Optimization
scheduler:
  name: one_minus_sqrt
  t_decay: 50_000_000_000tok
  alpha_f: 0.00 # Linearly decay to 0.02x the full LR by the end of the training duration
  t_max: 100_000_000_000tok

optimizer:
  name: decoupled_adamw
  lr: 3e-4 # Peak learning rate
  betas:
  - 0.9
  - 0.98
  eps: 1.0e-06
  weight_decay: 1.0e-5 # Amount of weight decay regularization
  filter_bias_norm_wd: true # If True, doesn't apply weight decay to norm layers and biases
  log_grad_norm: true

max_duration: 100_000_000_000tok
eval_interval: 5000ba
global_train_batch_size: 768
global_eval_batch_size: 6144

# System
seed: 25
device_eval_batch_size: 32
device_train_microbatch_size: 8
precision: amp_bf16

# Logging
progress_bar: false
log_to_console: true
console_log_interval: 100ba

callbacks:
  runtime_estimator: {}
  dataloader_speed: {}
  speed_monitor:
    window_size: 100
  lr_monitor: {}
  scheduled_gc: {}
  # log_grad_norm:
  #   batch_log_interval: 100
  packing_efficiency:
    log_interval: 100

loggers:
  # wandb:
  #   project: fr_modernbert
  #   entity: wissam
  tensorboard:
    log_dir: /scratch/playground/logs/tensorboard/p3/

autoresume: true
# Checkpoint to local filesystem or remote object store
save_interval: 5000ba
save_num_checkpoints_to_keep: -1 # Important, this cleans up checkpoints saved to DISK
save_folder: /scratch/playground/checkpoints/{run_name}
# Load from local filesystem or remote object store to
# load_path: null

parallelism_config:
  fsdp:
    sharding_strategy: "FULL_SHARD"
    state_dict_type: "sharded"
    # mixed_precision:
    #   param_dtype: bf16
    #   reduce_dtype: bf16
    #   buffer_dtype: bf16
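Relative to the p2 config, the main change here is the scheduler: one_minus_sqrt decays the learning rate over the final t_decay tokens of the t_max budget instead of holding it flat. A hedged sketch of that multiplier, assuming the common WSD-style "1 - sqrt" decay definition; the exact formula in the training code may differ:

import math

# Assumed interpretation of the one_minus_sqrt schedule configured above
# (t_max = 100B tok, t_decay = 50B tok, alpha_f = 0.0): hold the peak LR
# until the decay window starts, then scale it by 1 - sqrt(progress).
def one_minus_sqrt_multiplier(t, t_max=100e9, t_decay=50e9, alpha_f=0.0):
    t_start = t_max - t_decay          # decay begins at 50B tokens here
    if t < t_start:
        return 1.0                     # stable phase: full peak LR (3e-4)
    frac = min((t - t_start) / t_decay, 1.0)
    return max(1.0 - math.sqrt(frac), alpha_f)

print(one_minus_sqrt_multiplier(25e9))   # 1.0   (stable phase)
print(one_minus_sqrt_multiplier(75e9))   # ~0.29 (halfway through the decay window)
print(one_minus_sqrt_multiplier(100e9))  # 0.0   (alpha_f floor at the end)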
camembertv2-base.yaml
ADDED
@@ -0,0 +1,169 @@
data_local: /scratch/playground/data/bin_v2old_data/
data_remote: # If blank, files must be present in data_local

max_seq_len: 1024
tokenizer_name: tokenizer/camembertv2
mlm_probability: 0.3 # FlexBERT should use 30% masking for optimal performance
count_padding_tokens: false

# Run Name
run_name: camembertv2-base

# Model
model:
  name: flex_bert
  pretrained_model_name: configs/bert-base-uncased
  tokenizer_name: ${tokenizer_name}
  disable_train_metrics: true
  # FlexBERT 'base' generally uses the default architecture values from the Hugging Face BertConfig object
  # Note: if using the pretrained_checkpoint argument to create a model from an existing checkpoint, make sure
  # the model_config settings match the architecture of the existing model
  model_config:
    vocab_size: 32768
    init_method: full_megatron
    num_hidden_layers: 22
    hidden_size: 768
    intermediate_size: 1152
    num_attention_heads: 12 # to have head size of 64
    attention_layer: rope
    attention_probs_dropout_prob: 0.0
    attn_out_bias: false
    attn_out_dropout_prob: 0.1
    attn_qkv_bias: false
    bert_layer: prenorm
    embed_dropout_prob: 0.0
    embed_norm: true
    final_norm: true
    skip_first_prenorm: true
    embedding_layer: sans_pos
    loss_function: fa_cross_entropy
    loss_kwargs:
      reduction: mean
    mlp_dropout_prob: 0.0
    mlp_in_bias: false
    mlp_layer: glu
    mlp_out_bias: false
    normalization: layernorm
    norm_kwargs:
      eps: 1e-5
      bias: false
    hidden_act: gelu
    head_pred_act: gelu
    activation_function: gelu # better safe than sorry
    padding: unpadded
    rotary_emb_dim: null
    rotary_emb_base: 10000.0
    rotary_emb_scale_base: null
    rotary_emb_interleaved: false
    allow_embedding_resizing: true
    sliding_window: 128
    global_attn_every_n_layers: 3
    unpad_embeddings: true
    compile_model: true
    masked_prediction: true

# Dataloaders
train_loader:
  name: litdata_tokenized
  dataset:
    local: ${data_local}
    remote: ${data_remote}
    split: train
    tokenizer_name: ${tokenizer_name}
    max_seq_len: ${max_seq_len}
    shuffle: true
    mlm_probability: ${mlm_probability}
    streaming: false
    shuffle_seed: ${seed}
    seed: ${seed}
  drop_last: true
  num_workers: 12
  sequence_packing: true
  # batch_size_warmup_min_size: 96
  # batch_size_warmup_tokens: 50_000_000_000tok

eval_loader:
  name: litdata_tokenized
  dataset:
    local: ${data_local}
    remote: ${data_remote}
    split: validation
    tokenizer_name: ${tokenizer_name}
    max_seq_len: ${max_seq_len}
    shuffle: false
    mlm_probability: 0.15 # We always evaluate at 15% masking for consistent comparison
    streaming: false
    seed: ${seed}
  drop_last: false
  num_workers: 12
  sequence_packing: false

# Optimization
scheduler:
  name: warmup_stable_decay
  t_warmup: 3_000_000_000tok
  alpha_f: 0.00 # Linearly decay to 0.02x the full LR by the end of the training duration
  t_decay: 0tok

optimizer:
  name: decoupled_adamw
  lr: 8e-4 # Peak learning rate
  betas:
  - 0.9
  - 0.98
  eps: 1.0e-06
  weight_decay: 1.0e-5 # Amount of weight decay regularization
  filter_bias_norm_wd: true # If True, doesn't apply weight decay to norm layers and biases
  log_grad_norm: true

max_duration: 1_000_000_000_000tok
eval_interval: 5000ba
global_train_batch_size: 4608
global_eval_batch_size: 6144

# System
seed: 25
device_eval_batch_size: 128
device_train_microbatch_size: 96
precision: amp_bf16

# Logging
progress_bar: true
log_to_console: true
console_log_interval: 100ba

callbacks:
  runtime_estimator: {}
  dataloader_speed: {}
  speed_monitor:
    window_size: 100
  lr_monitor: {}
  scheduled_gc: {}
  # log_grad_norm:
  #   batch_log_interval: 100
  packing_efficiency:
    log_interval: 100

loggers:
  # wandb:
  #   project: fr_modernbert
  #   entity: wissam
  tensorboard:
    log_dir: /scratch/playground/logs/tensorboard/

autoresume: true
# Checkpoint to local filesystem or remote object store
save_interval: 5000ba
save_num_checkpoints_to_keep: -1 # Important, this cleans up checkpoints saved to DISK
save_folder: /scratch/playground/checkpoints/{run_name}
# Load from local filesystem or remote object store to
# load_path: null

parallelism_config:
  fsdp:
    sharding_strategy: "FULL_SHARD"
    state_dict_type: "sharded"
    # mixed_precision:
    #   param_dtype: bf16
    #   reduce_dtype: bf16
    #   buffer_dtype: bf16
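Taken together, the three files describe one three-phase run: camembertv2-base pretrains at a 1,024-token context for 1T tokens, camembertv2-base-p2 continues from that checkpoint at an 8,192-token context (rotary_emb_base raised to 160000) for 150B tokens, and camembertv2-base-p3 continues from p2 on the bin_v2old_data_high mix while decaying the learning rate over its 100B-token budget. A rough step-count estimate, assuming every packed batch reaches the full global_train_batch_size x max_seq_len tokens (so these are lower bounds on the actual step counts):

# Back-of-the-envelope optimizer-step counts per phase.
phases = {
    "camembertv2-base":    (1_000_000_000_000, 4608, 1024),
    "camembertv2-base-p2": (150_000_000_000, 768, 8192),
    "camembertv2-base-p3": (100_000_000_000, 768, 8192),
}
for name, (tokens, batch, seq_len) in phases.items():
    steps = tokens / (batch * seq_len)
    print(f"{name}: ~{steps:,.0f} steps")
# camembertv2-base: ~211,927 steps
# camembertv2-base-p2: ~23,842 steps
# camembertv2-base-p3: ~15,895 steps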