blt-7b / params.json
{
"name": "blt_7b",
"dump_dir": "/checkpoints/blt_7b",
"seed": 42,
"debug_dynamo": false,
"grad_acc_steps": 1,
"gc_collect_freq": 1000,
"probe_freq": null,
"steps": 240000,
"max_steps": null,
"data": {
"s3_profile": "blt",
"batch_size": 4,
"seq_len": 4096,
"seed": 42,
"add_bos": true,
"add_eos": true,
"load_async": true,
"async_persist_type": "approximate",
"prefetch_size": 200,
"preprocess_dir": "/corpora/entropy_preprocess",
"dataset_files": null,
"entropy_model_name": "transformer_100m",
"arrow_batch_size": 20,
"buffer_size": 512,
"file_format": "arrow",
"pad_to_max_length": true,
"max_encoder_seq_length": 24576,
"enable_byte_ngrams": false,
"add_patches": true,
"tokenizer_args": {
"name": "blt",
"init_kwargs": {
"bpe_tokenizer_path": "/tokenizers/tokenizer_final_32k.minus_inf_ws.model"
}
},
"patcher_args": {
"patching_mode": "entropy",
"patching_device": "cuda",
"entropy_model_checkpoint_dir": null,
"realtime_patching": false,
"threshold": 1.335442066192627,
"threshold_add": null,
"max_patch_length": null,
"patch_size": 4.5,
"patching_batch_size": 1,
"device": "cuda",
"monotonicity": false,
"log_time": false
}
},
"optim": {
"lr": 0.0004,
"weight_decay": 0.1,
"epsilon": 1e-08,
"beta1": 0.9,
"beta2": 0.95,
"clip": 1.0,
"scheduler": "cosine",
"warmup": 2000,
"lr_min_ratio": 0.01,
"cycle_length": 1.0,
"cosine_theta": 1.0,
"annealing_step": 1000,
"decay_fraction": 0.1,
"exp_factor": 0.5
},
"model": {
"dim": 512,
"n_layers": 8,
"head_dim": null,
"n_heads": 8,
"n_kv_heads": null,
"ffn_dim_multiplier": 1.0,
"multiple_of": 256,
"norm_eps": 1e-05,
"rope_theta": 500000.0,
"rope_use_fp32_in_outer_product": true,
"init_base_std": null,
"init_std_factor": "current_depth",
"max_seqlen": 4096,
"attn_impl": "xformers",
"attn_bias_type": "block_causal",
"eos_id": 2,
"seed": 42,
"vocab_size": 260,
"weight_tying": false,
"patch_in_forward": true,
"dim_token": null,
"dim_global": 4096,
"dim_local_decoder": 1280,
"dim_local_encoder": 1280,
"n_layers_global": 32,
"n_layers_local_decoder": 6,
"n_layers_local_encoder": 1,
"patch_size": 4.5,
"patching_mode": "entropy",
"patching_threshold": 1.335442066192627,
"patching_threshold_add": null,
"monotonicity": false,
"patching_batch_size": 1,
"patching_device": "cuda",
"max_patch_length": null,
"tie_local_encoder_decoder_logits": false,
"use_local_encoder_transformer": true,
"encoder_lm_loss": false,
"max_encoder_seq_length": 24576,
"pad_to_max_length": true,
"encoder_enable_byte_ngrams": false,
"encoder_enable_byte_group_hash": false,
"ngram_vocab_sizes": null,
"cross_attn_encoder": true,
"cross_attn_decoder": true,
"cross_attn_window_encoder": null,
"cross_attn_window_decoder": null,
"cross_attn_k": 4,
"cross_attn_nheads": 20,
"cross_attn_all_layers_decoder": true,
"cross_attn_all_layers_encoder": false,
"cross_attn_use_flex_attention": true,
"cross_attn_init_by_pooling": true,
"encoder_hash_byte_group_size": [
3,
4,
5,
6,
7,
8
],
"encoder_hash_byte_group_vocab": 500002,
"encoder_hash_byte_group_nb_functions": 1,
"log_patch_lengths": false,
"non_linearity": "swiglu",
"use_rope": true,
"recompute_fc1_out": false,
"recompute_fc3_out": false,
"recompute_attn": false,
"custom_bwd": false,
"layer_ckpt": "none",
"init_use_gaussian": true,
"init_use_depth": "current",
"alpha_depth": "disabled",
"max_length": 4096,
"norm_affine": true,
"pre_norm": true,
"norm_type": "rmsnorm",
"dropout": 0.0,
"output_size": -1,
"architecture": "vanilla",
"share_encoder_decoder_emb": true,
"global_local_decoder_residual_layer": null,
"tokenize_with_bpe_delimiter": false,
"patching_thresholds_str": null,
"tie_local_encoder_decoder": false,
"encoder_preds_low_entropy_toks": null,
"encoder_preds_random_toks": null,
"dim_token_emb": null,
"dim_patch_emb": null,
"encoder_ngram_table_dir": null,
"encoder_ngram_to_size_str": null,
"entropy_model_checkpoint_dir": null,
"entropy_model_is_ngram_model": false,
"downsampling_by_pooling": "max",
"n_heads_global": 32,
"n_heads_local_decoder": 20,
"n_heads_local_encoder": 20,
"n_kv_heads_global": null,
"conv_kernel_size": null,
"local_attention_window_len": 512,
"sequence_parallel": false,
"loss_parallel": false,
"fuse_sequence_parallel": false,
"use_fsdp": true,
"attn_to_keep": "all",
"pm_size": 0,
"full_logging_n_layers": 4
},
"entropy_model": null,
"train_entropy_model": false,
"distributed": {
"dp_shard": 1,
"dp_replicate": 256,
"tp_size": 1,
"selective_activation_checkpointing": true,
"compile": false,
"fsdp_type": "full_shard",
"model_dtype": "bf16",
"float8_recipe": null,
"float8_filter": "layers\\.[0-9]+\\.",
"matmul_allow_tf32": false,
"allow_bf16_reduced_precision_reduction": true,
"detect_anomaly": false,
"compile_cache_size_limit": 8,
"spawn_method": "forkserver"
},
"env": {
"MKL_SERVICE_FORCE_INTEL": "GNU",
"OMP_NUM_THREADS": "1",
"MKL_NUM_THREADS": "1",
"ENABLE_INTRA_NODE_COMM": "1",
"TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
"NCCL_IB_TIMEOUT": "22",
"NCCL_DEBUG": "INFO",
"TORCH_NCCL_ASYNC_ERROR_HANDLING": "1"
},
"checkpoint": {
"dump": {
"every": 1000,
"keep": 1
},
"eval": {
"every": 100000,
"keep": -1
},
"path": "/checkpoints/blt_7b",
"init_ckpt_path": null,
"continue_training_from_init": false,
"s3_profile": null
},
"profiling": {
"run": false,
"trace_folder": "profiling",
"mem_warmup": 0,
"mem_steps": 4,
"profile_warmup": 100,
"profile_steps": 4
},
"logging": {
"freq": 10,
"acc_freq": null,
"wandb": {
"job_type": "train",
"dir": null,
"project": "blt",
"entity": "blt",
"tags": null,
"group": null,
"name": "blt_7b",
"notes": null,
"config_exclude_keys": null,
"config_include_keys": null,
"anonymous": null,
"mode": null,
"allow_val_change": null,
"resume": null,
"force": null,
"tensorboard": null,
"sync_tensorboard": null,
"monitor_gym": null,
"save_code": null,
"id": null,
"fork_from": null,
"resume_from": null
}
},
"async_eval_gpus": null,
"eval": null,
"eval_on_gpus": 8
}
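
The config above fixes everything needed to estimate training throughput: data.batch_size and data.seq_len per rank, grad_acc_steps, distributed.dp_replicate, and the total steps. A minimal sketch of the arithmetic, assuming the file is saved locally as params.json (and noting, as an assumption, that whether seq_len counts raw bytes or entropy patches depends on the BLT trainer; the code simply multiplies the configured sizes):

import json

# Load the training config shown above (the local path is an assumption).
with open("params.json") as f:
    cfg = json.load(f)

data = cfg["data"]
dist = cfg["distributed"]

# Sequence positions processed per optimizer step across all
# data-parallel replicas: 4 * 4096 * 1 * 256 = 4,194,304.
positions_per_step = (
    data["batch_size"] * data["seq_len"] * cfg["grad_acc_steps"] * dist["dp_replicate"]
)
total_positions = positions_per_step * cfg["steps"]

print(f"positions per step: {positions_per_step:,}")
# Over the configured 240,000 steps this comes to roughly 1.0e12 positions.
print(f"positions over {cfg['steps']:,} steps: {total_positions:,}")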
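Under patcher_args, patching_mode "entropy" with monotonicity false suggests that patch boundaries are placed wherever a small byte-level LM (the transformer_100m referenced under entropy_model_name) assigns next-byte entropy above the global threshold 1.335442066192627, with patch_size 4.5 as the resulting average patch length in bytes. A toy sketch of that boundary rule, with made-up entropy values; this is a simplification for illustration, not the repository's patcher:

from typing import List

def entropy_patch_lengths(entropies: List[float], threshold: float) -> List[int]:
    # Start a new patch at every position whose next-byte entropy
    # exceeds the global threshold: hard-to-predict bytes open a patch.
    lengths: List[int] = []
    current = 0
    for h in entropies:
        if current > 0 and h > threshold:
            lengths.append(current)
            current = 0
        current += 1
    if current:
        lengths.append(current)
    return lengths

# Made-up per-byte entropies, thresholded with the value from the config.
ent = [0.2, 0.1, 2.0, 0.3, 0.4, 1.5, 0.2]
print(entropy_patch_lengths(ent, threshold=1.335442066192627))  # [2, 3, 2]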