{
  "name": "blt_7b",
  "dump_dir": "/checkpoints/blt_7b",
  "seed": 42,
  "debug_dynamo": false,
  "grad_acc_steps": 1,
  "gc_collect_freq": 1000,
  "probe_freq": null,
  "steps": 240000,
  "max_steps": null,
  "data": {
    "s3_profile": "blt",
    "batch_size": 4,
    "seq_len": 4096,
    "seed": 42,
    "add_bos": true,
    "add_eos": true,
    "load_async": true,
    "async_persist_type": "approximate",
    "prefetch_size": 200,
    "preprocess_dir": "/corpora/entropy_preprocess",
    "dataset_files": null,
    "entropy_model_name": "transformer_100m",
    "arrow_batch_size": 20,
    "buffer_size": 512,
    "file_format": "arrow",
    "pad_to_max_length": true,
    "max_encoder_seq_length": 24576,
    "enable_byte_ngrams": false,
    "add_patches": true,
    "tokenizer_args": {
      "name": "blt",
      "init_kwargs": {
        "bpe_tokenizer_path": "/tokenizers/tokenizer_final_32k.minus_inf_ws.model"
      }
    },
    "patcher_args": {
      "patching_mode": "entropy",
      "patching_device": "cuda",
      "entropy_model_checkpoint_dir": null,
      "realtime_patching": false,
      "threshold": 1.335442066192627,
      "threshold_add": null,
      "max_patch_length": null,
      "patch_size": 4.5,
      "patching_batch_size": 1,
      "device": "cuda",
      "monotonicity": false,
      "log_time": false
    }
  },
  "optim": {
    "lr": 0.0004,
    "weight_decay": 0.1,
    "epsilon": 1e-08,
    "beta1": 0.9,
    "beta2": 0.95,
    "clip": 1.0,
    "scheduler": "cosine",
    "warmup": 2000,
    "lr_min_ratio": 0.01,
    "cycle_length": 1.0,
    "cosine_theta": 1.0,
    "annealing_step": 1000,
    "decay_fraction": 0.1,
    "exp_factor": 0.5
  },
  "model": {
    "dim": 512,
    "n_layers": 8,
    "head_dim": null,
    "n_heads": 8,
    "n_kv_heads": null,
    "ffn_dim_multiplier": 1.0,
    "multiple_of": 256,
    "norm_eps": 1e-05,
    "rope_theta": 500000.0,
    "rope_use_fp32_in_outer_product": true,
    "init_base_std": null,
    "init_std_factor": "current_depth",
    "max_seqlen": 4096,
    "attn_impl": "xformers",
    "attn_bias_type": "block_causal",
    "eos_id": 2,
    "seed": 42,
    "vocab_size": 260,
    "weight_tying": false,
    "patch_in_forward": true,
    "dim_token": null,
    "dim_global": 4096,
    "dim_local_decoder": 1280,
    "dim_local_encoder": 1280,
    "n_layers_global": 32,
    "n_layers_local_decoder": 6,
    "n_layers_local_encoder": 1,
    "patch_size": 4.5,
    "patching_mode": "entropy",
    "patching_threshold": 1.335442066192627,
    "patching_threshold_add": null,
    "monotonicity": false,
    "patching_batch_size": 1,
    "patching_device": "cuda",
    "max_patch_length": null,
    "tie_local_encoder_decoder_logits": false,
    "use_local_encoder_transformer": true,
    "encoder_lm_loss": false,
    "max_encoder_seq_length": 24576,
    "pad_to_max_length": true,
    "encoder_enable_byte_ngrams": false,
    "encoder_enable_byte_group_hash": false,
    "ngram_vocab_sizes": null,
    "cross_attn_encoder": true,
    "cross_attn_decoder": true,
    "cross_attn_window_encoder": null,
    "cross_attn_window_decoder": null,
    "cross_attn_k": 4,
    "cross_attn_nheads": 20,
    "cross_attn_all_layers_decoder": true,
    "cross_attn_all_layers_encoder": false,
    "cross_attn_use_flex_attention": true,
    "cross_attn_init_by_pooling": true,
    "encoder_hash_byte_group_size": [
      3,
      4,
      5,
      6,
      7,
      8
    ],
    "encoder_hash_byte_group_vocab": 500002,
    "encoder_hash_byte_group_nb_functions": 1,
    "log_patch_lengths": false,
    "non_linearity": "swiglu",
    "use_rope": true,
    "recompute_fc1_out": false,
    "recompute_fc3_out": false,
    "recompute_attn": false,
    "custom_bwd": false,
    "layer_ckpt": "none",
    "init_use_gaussian": true,
    "init_use_depth": "current",
    "alpha_depth": "disabled",
    "max_length": 4096,
    "norm_affine": true,
    "pre_norm": true,
    "norm_type": "rmsnorm",
    "dropout": 0.0,
    "output_size": -1,
    "architecture": "vanilla",
    "share_encoder_decoder_emb": true,
    "global_local_decoder_residual_layer": null,
    "tokenize_with_bpe_delimiter": false,
    "patching_thresholds_str": null,
    "tie_local_encoder_decoder": false,
    "encoder_preds_low_entropy_toks": null,
    "encoder_preds_random_toks": null,
    "dim_token_emb": null,
    "dim_patch_emb": null,
    "encoder_ngram_table_dir": null,
    "encoder_ngram_to_size_str": null,
    "entropy_model_checkpoint_dir": null,
    "entropy_model_is_ngram_model": false,
    "downsampling_by_pooling": "max",
    "n_heads_global": 32,
    "n_heads_local_decoder": 20,
    "n_heads_local_encoder": 20,
    "n_kv_heads_global": null,
    "conv_kernel_size": null,
    "local_attention_window_len": 512,
    "sequence_parallel": false,
    "loss_parallel": false,
    "fuse_sequence_parallel": false,
    "use_fsdp": true,
    "attn_to_keep": "all",
    "pm_size": 0,
    "full_logging_n_layers": 4
  },
  "entropy_model": null,
  "train_entropy_model": false,
  "distributed": {
    "dp_shard": 1,
    "dp_replicate": 256,
    "tp_size": 1,
    "selective_activation_checkpointing": true,
    "compile": false,
    "fsdp_type": "full_shard",
    "model_dtype": "bf16",
    "float8_recipe": null,
    "float8_filter": "layers\\.[0-9]+\\.",
    "matmul_allow_tf32": false,
    "allow_bf16_reduced_precision_reduction": true,
    "detect_anomaly": false,
    "compile_cache_size_limit": 8,
    "spawn_method": "forkserver"
  },
  "env": {
    "MKL_SERVICE_FORCE_INTEL": "GNU",
    "OMP_NUM_THREADS": "1",
    "MKL_NUM_THREADS": "1",
    "ENABLE_INTRA_NODE_COMM": "1",
    "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
    "NCCL_IB_TIMEOUT": "22",
    "NCCL_DEBUG": "INFO",
    "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1"
  },
  "checkpoint": {
    "dump": {
      "every": 1000,
      "keep": 1
    },
    "eval": {
      "every": 100000,
      "keep": -1
    },
    "path": "/checkpoints/blt_7b",
    "init_ckpt_path": null,
    "continue_training_from_init": false,
    "s3_profile": null
  },
  "profiling": {
    "run": false,
    "trace_folder": "profiling",
    "mem_warmup": 0,
    "mem_steps": 4,
    "profile_warmup": 100,
    "profile_steps": 4
  },
  "logging": {
    "freq": 10,
    "acc_freq": null,
    "wandb": {
      "job_type": "train",
      "dir": null,
      "project": "blt",
      "entity": "blt",
      "tags": null,
      "group": null,
      "name": "blt_7b",
      "notes": null,
      "config_exclude_keys": null,
      "config_include_keys": null,
      "anonymous": null,
      "mode": null,
      "allow_val_change": null,
      "resume": null,
      "force": null,
      "tensorboard": null,
      "sync_tensorboard": null,
      "monitor_gym": null,
      "save_code": null,
      "id": null,
      "fork_from": null,
      "resume_from": null
    }
  },
  "async_eval_gpus": null,
  "eval": null,
  "eval_on_gpus": 8
}