config:
  vocab_size: 128128
  bos_token_id: 2
  eos_token_id: 1
  pad_token_id: 3
  torch_dtype: bfloat16
  use_cache: false
  max_position_embeddings: 2048
  _attn_implementation: flash_attention_2
  name: small
  model_type: llama
  head_dim: 128
  hidden_size: 768
  hidden_act: silu
  intermediate_size: 2048
  initializer_range: 0.02
  num_hidden_layers: 6
  num_attention_heads: 6
  num_key_value_heads: 6
  rms_norm_eps: 1.0e-05
  tie_word_embeddings: true
  rope_theta: 10000.0
  rope_scaling: null
  attention_bias: false
  mlp_bias: false
  attention_dropout: 0.0
  pretraining_tp: 1
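The block above is a standard Hugging Face Llama configuration for a small 6-layer model (768 hidden size, 6 heads of dimension 128, tied embeddings). A minimal sketch of instantiating it with `transformers` is shown below; how the surrounding training code applies `_attn_implementation` and `torch_dtype` is an assumption, only the hyperparameter values come from the config.

```python
import torch
from transformers import LlamaConfig, LlamaForCausalLM

# Model hyperparameters copied from the `config:` block above.
model_config = LlamaConfig(
    vocab_size=128128,
    hidden_size=768,
    intermediate_size=2048,
    num_hidden_layers=6,
    num_attention_heads=6,
    num_key_value_heads=6,   # equal to num_attention_heads, i.e. plain multi-head attention
    head_dim=128,            # 768 / 6
    hidden_act="silu",
    max_position_embeddings=2048,
    initializer_range=0.02,
    rms_norm_eps=1e-5,
    rope_theta=10000.0,
    rope_scaling=None,
    attention_bias=False,
    mlp_bias=False,
    attention_dropout=0.0,
    tie_word_embeddings=True,
    bos_token_id=2,
    eos_token_id=1,
    pad_token_id=3,
    pretraining_tp=1,
    use_cache=False,
)

# torch_dtype: bfloat16; flash_attention_2 would be selected via the
# attn_implementation argument when the model is built in the real pipeline.
model = LlamaForCausalLM(model_config).to(torch.bfloat16)
```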
optim_config:
  optim_name: adamw
  lr: 0.0006
  weight_decay: 0.01
  weight_decay_embedding: false
  set_grad_to_none: true
  optim_kwargs:
    fused: true
    eps: 1.0e-08
    betas:
    - 0.9
    - 0.95
    capturable: true
  scheduler_name: warmup_stable_decay
  num_warmup_steps: 2000
  scheduler_kwargs:
    num_decay_steps: 4000
    min_lr_ratio: 0.0
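Read literally, this block is fused AdamW with betas (0.9, 0.95) and a warmup-stable-decay (WSD) learning-rate schedule: 2000 linear warmup steps, a constant phase, and a final 4000-step decay toward min_lr_ratio = 0. The sketch below is one way to realize it; the `total_steps` value, the linear decay shape, and the name-based split implementing `weight_decay_embedding: false` are assumptions, not part of the config.

```python
import torch

def build_optimizer_and_scheduler(model, total_steps):
    # weight_decay_embedding: false -> exclude embedding weights from weight decay.
    decay, no_decay = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        (no_decay if "embed" in name else decay).append(param)

    optimizer = torch.optim.AdamW(
        [
            {"params": decay, "weight_decay": 0.01},
            {"params": no_decay, "weight_decay": 0.0},
        ],
        lr=6e-4,
        betas=(0.9, 0.95),
        eps=1e-8,
        fused=True,
        capturable=True,
    )

    warmup_steps, decay_steps, min_lr_ratio = 2000, 4000, 0.0

    def wsd_lambda(step):
        # Linear warmup, constant ("stable") phase, then decay over the last steps.
        if step < warmup_steps:
            return step / max(1, warmup_steps)
        if step >= total_steps - decay_steps:
            frac = (total_steps - step) / max(1, decay_steps)
            return min_lr_ratio + (1.0 - min_lr_ratio) * max(0.0, frac)
        return 1.0

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, wsd_lambda)
    return optimizer, scheduler
```

In the training loop, `set_grad_to_none: true` corresponds to calling `optimizer.zero_grad(set_to_none=True)` between steps.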
grad_acc_schedule:
  0: 2
zloss_factor: null
use_torch_compile: true
use_liger: true
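`grad_acc_schedule` is a step-indexed mapping; here it has a single entry, i.e. 2 gradient-accumulation steps from step 0 onward. One plausible reading of such a schedule, sketched below as an assumption about the training code, is that the active value is the entry with the largest starting step not exceeding the current step:

```python
def grad_acc_steps(step: int, schedule: dict[int, int]) -> int:
    # Use the schedule entry with the largest starting step <= the current step.
    start = max(k for k in schedule if k <= step)
    return schedule[start]

assert grad_acc_steps(0, {0: 2}) == 2
assert grad_acc_steps(10_000, {0: 2}) == 2
```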
train_data_path: /home/pl487/unimixlm/data/multigram128k/train
val_data_path: /home/pl487/unimixlm/data/multigram128k/validation
seq_len: 2048
eos_token_id: 1
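The data paths point at pre-tokenized train/validation splits, and `seq_len` / `eos_token_id` mirror the model's context length and EOS id. Assuming the splits are stored with the `datasets` library (the on-disk format is not specified here), loading them would look like:

```python
from datasets import load_from_disk

train_ds = load_from_disk("/home/pl487/unimixlm/data/multigram128k/train")
val_ds = load_from_disk("/home/pl487/unimixlm/data/multigram128k/validation")
```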
dataloader_config:
  batch_size: 64
  eval_batch_size: 64
  shuffle_seed: 42
  intra_doc_causal_mask: true
  num_workers: 8
  pin_memory: true
  drop_last: true
  persistent_workers: false
  multiprocessing_context: null
  prefetch_factor: 2
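The dataloader block maps onto standard `torch.utils.data.DataLoader` arguments, plus two project-specific fields: `shuffle_seed` (reproducible shuffling) and `intra_doc_causal_mask` (attention masking across packed documents), which live outside the DataLoader itself. A minimal sketch, assuming a map-style dataset and leaving the intra-document masking to the collate function:

```python
import torch
from torch.utils.data import DataLoader

def build_train_loader(train_dataset, collate_fn=None):
    # shuffle_seed: 42 -> seed a generator so shuffling is reproducible.
    generator = torch.Generator().manual_seed(42)
    return DataLoader(
        train_dataset,
        batch_size=64,
        shuffle=True,
        generator=generator,
        num_workers=8,
        pin_memory=True,
        drop_last=True,
        persistent_workers=False,
        prefetch_factor=2,
        collate_fn=collate_fn,  # intra_doc_causal_mask would be applied here
    )
```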