sae/pile_mul_fractal_2_8/config.json
{
"model_name": "google/gemma-2-2b",
"layer": 12,
"hook_point": "resid_post",
"act_size": 2304,
"sae_type": "mul_fractal_topk",
"dict_size": 65536,
"aux_penalty": 0.03125,
"input_unit_norm": true,
"batch_norm_on_queries": false,
"affine_batch_norm": false,
"linear_heads": 0,
"topk2": 50,
"topk1": 50,
"topk2_warmup_steps_fraction": 0.0,
"start_topk2": 50,
"topk1_warmup_steps_fraction": 0.0,
"start_topk1": 50,
"topk2_aux": 512,
"cartesian_op": "mul",
"router_depth": 2,
"router_tree_width": null,
"num_mkeys": 2,
"num_nkeys": 8,
"num_heads": 4096,
"n_batches_to_dead": 10,
"lr": 0.0008,
"bandwidth": 0.001,
"l1_coeff": 0.0018,
"num_tokens": 799634235,
"seq_len": 1024,
"model_batch_size": 64,
"num_batches_in_buffer": 5,
"max_grad_norm": 1.0,
"batch_size": 8192,
"weight_decay": 0.0,
"warmup_fraction": 0.1,
"scheduler_type": "cosine_with_min_lr",
"device": "cuda",
"dtype": "torch.float32",
"sae_dtype": "torch.float32",
"dataset_path": "cerebras/SlimPajama-627B",
"wandb_project": "turbo-llama-lens",
"enable_wandb": true,
"sae_name": "sae",
"seed": 42,
"performance_log_steps": 100,
"save_checkpoint_steps": 15000000,
"wandb_run_suffix": "exp80_bench",
"sweep_pair": "{'dict_size': 65536, 'num_heads': 4096, 'num_mkeys': 2, 'num_nkeys': 8, 'num_tokens': 799634235, 'sae_type': 'mul_fractal_topk', 'start_topk1': 50, 'start_topk2': 50, 'topk1': 50}"
}
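
A minimal sketch (not the training repo's own loader) of how this config might be consumed: load the JSON and sanity-check a couple of derived quantities. The factorization dict_size == num_heads * num_mkeys * num_nkeys is an assumption inferred from the values above (4096 * 2 * 8 = 65536), not documented behavior; the step count is simple arithmetic on num_tokens and batch_size.

import json

# Read the config shown above; the filename is assumed.
with open("config.json") as f:
    cfg = json.load(f)

# Assumed relationship between the dictionary size and the head/key factorization.
assert cfg["dict_size"] == cfg["num_heads"] * cfg["num_mkeys"] * cfg["num_nkeys"], \
    "dict_size does not match num_heads * num_mkeys * num_nkeys"

# Approximate optimizer-step count implied by the token budget and batch size.
steps = cfg["num_tokens"] // cfg["batch_size"]
print(f"~{steps:,} training steps at batch_size={cfg['batch_size']}")
print(f"~{int(steps * cfg['warmup_fraction']):,} warmup steps ({cfg['warmup_fraction']:.0%} of training)")

With the values in this file, that works out to roughly 97,600 training steps and about 9,760 warmup steps.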