---
# Training run configuration: optimizer, parallel topology, trainer schedule,
# parameter groups, and transformer architecture.
#
# NOTE(review): the previous revision of this file had lost all indentation
# and carried stray "|" table-cell markers, which made the whole document
# parse as one literal string. The nesting below was reconstructed from the
# key grouping -- confirm it against the consuming tool's schema.

optimizer:
  allreduce_bucket_size: 500000000
  beta1: 0.9
  beta2: 0.95
  debug_log: false
  # Written with an explicit mantissa dot so that YAML 1.1 loaders
  # (e.g. PyYAML) resolve it as a float instead of the string "1e-08".
  eps: 1.0e-08
  gradient_clipping: 0.0
  zero: true
  zero_save_static: false

topology:
  activation_checkpointing_type: disabled
  # NOTE(review): presumably global_batch_size = micro_batch_size
  # * gradient_accumulation_steps * data-parallel replicas
  # (1024 = 2 * 4 * 128) -- verify against the launcher.
  global_batch_size: 1024
  gradient_accumulation_steps: 4
  micro_batch_size: 2
  model_parallel_size: 1
  pipe_parallel_size: 2
  pipe_partition_method: balanced
  pipe_partition_overwrite: null
  sequence_parallel: false

trainer:
  seed: 42
  train_iterations: 72000

training:
  allow_missing_params_in_optimizer: true

training_groups:
  - group_name: param_group
    independent_weight_decay: true
    learning_rate_scheduler:
      learning_rate: 11.313708498984761
      # Same value as trainer.train_iterations.
      learning_rate_decay_iters: 72000
      learning_rate_decay_style: cosine
      # One tenth of learning_rate.
      learning_rate_minimum: 1.131370849898476
      learning_rate_warmup_steps: 500
    parameters_exclude:
      - norm
    weight_decay: 0.0001221

transformer_architecture:
  attention_bias: false
  attention_num_kv_heads: null
  attention_qkv_in_one: true
  dropout_after_attention: 0.0
  dropout_after_mlp: 0.0
  dropout_attention_probs: 0.0
  dropout_embedding: 0.0
  dropout_image_encoder: 0.0
  # fp8_config_attention and fp8_config_mlp carry identical dtype settings.
  fp8_config_attention:
    dtypes_forward:
      left_dtype: e4m3
      right_dtype: e4m3
    dtypes_grad_input:
      left_dtype: e5m2
      right_dtype: e4m3
    dtypes_grad_weight:
      left_dtype: e4m3
      right_dtype: e5m2
  fp8_config_mlp:
    dtypes_forward:
      left_dtype: e4m3
      right_dtype: e4m3
    dtypes_grad_input:
      left_dtype: e5m2
      right_dtype: e4m3
    dtypes_grad_weight:
      left_dtype: e4m3
      right_dtype: e5m2
  hidden_size: 4096
  image_encoder: false
  key_query_norm: false
  layernorm:
    # Explicit mantissa dot for the same float-resolution reason as
    # optimizer.eps.
    layernorm_epsilon: 1.0e-05
    optimization_type: torch
  local_attention_window_size: null
  masked_softmax:
    kernel: flash_attention
    scale: 1.0
    softmax_in_fp32: false
  mlp_bias: false
  mlp_factor: 2.66796875
  mlp_type: swiglu
  norm_type: rms
  num_attention_heads: 32
  num_layers: 32
  num_local_attention_heads: 0
  precision: bfloat16
  relative_position_embedding_type: rotary_complex
  reset_attention_mask: false
  reset_position_ids: false
  rotary_embedding_base: 10000
  rotary_percentage: 1.0
  sequence_length: 4096
  umup:
    act_mult: 1.0
    attn_mult: 1.0
    enable: true
    loss_mult: 1.0
    normalize_depth_to_num_layers: true
    residual_attn_ratio: 0.25
    residual_mult: 1.0
  vocab_file: null
  vocab_size: 65536
  weight_tying: false