blt-7b / params.json
{
"name": "blt_7b",
"dump_dir": "/checkpoints/blt_7b",
"seed": 42,
"debug_dynamo": false,
"grad_acc_steps": 1,
"gc_collect_freq": 1000,
"probe_freq": null,
"steps": 240000,
"max_steps": null,
"data": {
"s3_profile": "blt",
"batch_size": 4,
"seq_len": 4096,
"seed": 42,
"add_bos": true,
"add_eos": true,
"load_async": true,
"async_persist_type": "approximate",
"prefetch_size": 200,
"preprocess_dir": "/corpora/entropy_preprocess",
"dataset_files": null,
"entropy_model_name": "transformer_100m",
"arrow_batch_size": 20,
"buffer_size": 512,
"file_format": "arrow",
"pad_to_max_length": true,
"max_encoder_seq_length": 24576,
"enable_byte_ngrams": false,
"add_patches": true,
"tokenizer_args": {
"name": "blt",
"init_kwargs": {
"bpe_tokenizer_path": "/tokenizers/tokenizer_final_32k.minus_inf_ws.model"
}
},
"patcher_args": {
"patching_mode": "entropy",
"patching_device": "cuda",
"entropy_model_checkpoint_dir": null,
"realtime_patching": false,
"threshold": 1.335442066192627,
"threshold_add": null,
"max_patch_length": null,
"patch_size": 4.5,
"patching_batch_size": 1,
"device": "cuda",
"monotonicity": false,
"log_time": false
}
},
"optim": {
"lr": 0.0004,
"weight_decay": 0.1,
"epsilon": 1e-08,
"beta1": 0.9,
"beta2": 0.95,
"clip": 1.0,
"scheduler": "cosine",
"warmup": 2000,
"lr_min_ratio": 0.01,
"cycle_length": 1.0,
"cosine_theta": 1.0,
"annealing_step": 1000,
"decay_fraction": 0.1,
"exp_factor": 0.5
},
"model": {
"dim": 512,
"n_layers": 8,
"head_dim": null,
"n_heads": 8,
"n_kv_heads": null,
"ffn_dim_multiplier": 1.0,
"multiple_of": 256,
"norm_eps": 1e-05,
"rope_theta": 500000.0,
"rope_use_fp32_in_outer_product": true,
"init_base_std": null,
"init_std_factor": "current_depth",
"max_seqlen": 4096,
"attn_impl": "xformers",
"attn_bias_type": "block_causal",
"eos_id": 2,
"seed": 42,
"vocab_size": 260,
"weight_tying": false,
"patch_in_forward": true,
"dim_token": null,
"dim_global": 4096,
"dim_local_decoder": 1280,
"dim_local_encoder": 1280,
"n_layers_global": 32,
"n_layers_local_decoder": 6,
"n_layers_local_encoder": 1,
"patch_size": 4.5,
"patching_mode": "entropy",
"patching_threshold": 1.335442066192627,
"patching_threshold_add": null,
"monotonicity": false,
"patching_batch_size": 1,
"patching_device": "cuda",
"max_patch_length": null,
"tie_local_encoder_decoder_logits": false,
"use_local_encoder_transformer": true,
"encoder_lm_loss": false,
"max_encoder_seq_length": 24576,
"pad_to_max_length": true,
"encoder_enable_byte_ngrams": false,
"encoder_enable_byte_group_hash": false,
"ngram_vocab_sizes": null,
"cross_attn_encoder": true,
"cross_attn_decoder": true,
"cross_attn_window_encoder": null,
"cross_attn_window_decoder": null,
"cross_attn_k": 4,
"cross_attn_nheads": 20,
"cross_attn_all_layers_decoder": true,
"cross_attn_all_layers_encoder": false,
"cross_attn_use_flex_attention": true,
"cross_attn_init_by_pooling": true,
"encoder_hash_byte_group_size": [
3,
4,
5,
6,
7,
8
],
"encoder_hash_byte_group_vocab": 500002,
"encoder_hash_byte_group_nb_functions": 1,
"log_patch_lengths": false,
"non_linearity": "swiglu",
"use_rope": true,
"recompute_fc1_out": false,
"recompute_fc3_out": false,
"recompute_attn": false,
"custom_bwd": false,
"layer_ckpt": "none",
"init_use_gaussian": true,
"init_use_depth": "current",
"alpha_depth": "disabled",
"max_length": 4096,
"norm_affine": true,
"pre_norm": true,
"norm_type": "rmsnorm",
"dropout": 0.0,
"output_size": -1,
"architecture": "vanilla",
"share_encoder_decoder_emb": true,
"global_local_decoder_residual_layer": null,
"tokenize_with_bpe_delimiter": false,
"patching_thresholds_str": null,
"tie_local_encoder_decoder": false,
"encoder_preds_low_entropy_toks": null,
"encoder_preds_random_toks": null,
"dim_token_emb": null,
"dim_patch_emb": null,
"encoder_ngram_table_dir": null,
"encoder_ngram_to_size_str": null,
"entropy_model_checkpoint_dir": null,
"entropy_model_is_ngram_model": false,
"downsampling_by_pooling": "max",
"n_heads_global": 32,
"n_heads_local_decoder": 20,
"n_heads_local_encoder": 20,
"n_kv_heads_global": null,
"conv_kernel_size": null,
"local_attention_window_len": 512,
"sequence_parallel": false,
"loss_parallel": false,
"fuse_sequence_parallel": false,
"use_fsdp": true,
"attn_to_keep": "all",
"pm_size": 0,
"full_logging_n_layers": 4
},
"entropy_model": null,
"train_entropy_model": false,
"distributed": {
"dp_shard": 1,
"dp_replicate": 256,
"tp_size": 1,
"selective_activation_checkpointing": true,
"compile": false,
"fsdp_type": "full_shard",
"model_dtype": "bf16",
"float8_recipe": null,
"float8_filter": "layers\\.[0-9]+\\.",
"matmul_allow_tf32": false,
"allow_bf16_reduced_precision_reduction": true,
"detect_anomaly": false,
"compile_cache_size_limit": 8,
"spawn_method": "forkserver"
},
"env": {
"MKL_SERVICE_FORCE_INTEL": "GNU",
"OMP_NUM_THREADS": "1",
"MKL_NUM_THREADS": "1",
"ENABLE_INTRA_NODE_COMM": "1",
"TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
"NCCL_IB_TIMEOUT": "22",
"NCCL_DEBUG": "INFO",
"TORCH_NCCL_ASYNC_ERROR_HANDLING": "1"
},
"checkpoint": {
"dump": {
"every": 1000,
"keep": 1
},
"eval": {
"every": 100000,
"keep": -1
},
"path": "/checkpoints/blt_7b",
"init_ckpt_path": null,
"continue_training_from_init": false,
"s3_profile": null
},
"profiling": {
"run": false,
"trace_folder": "profiling",
"mem_warmup": 0,
"mem_steps": 4,
"profile_warmup": 100,
"profile_steps": 4
},
"logging": {
"freq": 10,
"acc_freq": null,
"wandb": {
"job_type": "train",
"dir": null,
"project": "blt",
"entity": "blt",
"tags": null,
"group": null,
"name": "blt_7b",
"notes": null,
"config_exclude_keys": null,
"config_include_keys": null,
"anonymous": null,
"mode": null,
"allow_val_change": null,
"resume": null,
"force": null,
"tensorboard": null,
"sync_tensorboard": null,
"monitor_gym": null,
"save_code": null,
"id": null,
"fork_from": null,
"resume_from": null
}
},
"async_eval_gpus": null,
"eval": null,
"eval_on_gpus": 8
}
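
The config above fixes everything needed to estimate training throughput: data.batch_size and data.seq_len per rank, grad_acc_steps, distributed.dp_replicate, and the total steps. A minimal sketch of the arithmetic, assuming the file is saved locally as params.json (and noting, as an assumption, that whether seq_len counts raw bytes or entropy patches depends on the BLT trainer; the code simply multiplies the configured sizes):

import json

# Load the training config shown above (the local path is an assumption).
with open("params.json") as f:
    cfg = json.load(f)

data = cfg["data"]
dist = cfg["distributed"]

# Sequence positions processed per optimizer step across all
# data-parallel replicas: 4 * 4096 * 1 * 256 = 4,194,304.
positions_per_step = (
    data["batch_size"] * data["seq_len"] * cfg["grad_acc_steps"] * dist["dp_replicate"]
)
total_positions = positions_per_step * cfg["steps"]

print(f"positions per step: {positions_per_step:,}")
# Over the configured 240,000 steps this comes to roughly 1.0e12 positions.
print(f"positions over {cfg['steps']:,} steps: {total_positions:,}")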
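Under patcher_args, patching_mode "entropy" with monotonicity false suggests that patch boundaries are placed wherever a small byte-level LM (the transformer_100m referenced under entropy_model_name) assigns next-byte entropy above the global threshold 1.335442066192627, with patch_size 4.5 as the resulting average patch length in bytes. A toy sketch of that boundary rule, with made-up entropy values; this is a simplification for illustration, not the repository's patcher:

from typing import List

def entropy_patch_lengths(entropies: List[float], threshold: float) -> List[int]:
    # Start a new patch at every position whose next-byte entropy
    # exceeds the global threshold: hard-to-predict bytes open a patch.
    lengths: List[int] = []
    current = 0
    for h in entropies:
        if current > 0 and h > threshold:
            lengths.append(current)
            current = 0
        current += 1
    if current:
        lengths.append(current)
    return lengths

# Made-up per-byte entropies, thresholded with the value from the config.
ent = [0.2, 0.1, 2.0, 0.3, 0.4, 1.5, 0.2]
print(entropy_patch_lengths(ent, threshold=1.335442066192627))  # [2, 3, 2]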