{
  "model_name": "google/gemma-2-2b",
  "layer": 12,
  "hook_point": "resid_post",
  "act_size": 2304,
  "sae_type": "mul_fractal_topk",
  "dict_size": 65536,
  "aux_penalty": 0.03125,
  "input_unit_norm": true,
  "batch_norm_on_queries": false,
  "affine_batch_norm": false,
  "linear_heads": 0,
  "topk2": 50,
  "topk1": 50,
  "topk2_warmup_steps_fraction": 0.0,
  "start_topk2": 50,
  "topk1_warmup_steps_fraction": 0.0,
  "start_topk1": 50,
  "topk2_aux": 512,
  "cartesian_op": "mul",
  "router_depth": 2,
  "router_tree_width": null,
  "num_mkeys": 2,
  "num_nkeys": 8,
  "num_heads": 4096,
  "n_batches_to_dead": 10,
  "lr": 0.0008,
  "bandwidth": 0.001,
  "l1_coeff": 0.0018,
  "num_tokens": 799634235,
  "seq_len": 1024,
  "model_batch_size": 64,
  "num_batches_in_buffer": 5,
  "max_grad_norm": 1.0,
  "batch_size": 8192,
  "weight_decay": 0.0,
  "warmup_fraction": 0.1,
  "scheduler_type": "cosine_with_min_lr",
  "device": "cuda",
  "dtype": "torch.float32",
  "sae_dtype": "torch.float32",
  "dataset_path": "cerebras/SlimPajama-627B",
  "wandb_project": "turbo-llama-lens",
  "enable_wandb": true,
  "sae_name": "sae",
  "seed": 42,
  "performance_log_steps": 100,
  "save_checkpoint_steps": 15000000,
  "wandb_run_suffix": "exp80_bench",
  "sweep_pair": "{'dict_size': 65536, 'num_heads': 4096, 'num_mkeys': 2, 'num_nkeys': 8, 'num_tokens': 799634235, 'sae_type': 'mul_fractal_topk', 'start_topk1': 50, 'start_topk2': 50, 'topk1': 50}"
}