{
  "model_name": "google/gemma-2-2b",
  "layer": 12,
  "hook_point": "resid_post",
  "act_size": 2304,
  "sae_type": "mul_fractal_topk",
  "dict_size": 65536,
  "aux_penalty": 0.03125,
  "input_unit_norm": true,
  "batch_norm_on_queries": false,
  "affine_batch_norm": false,
  "linear_heads": 0,
  "topk2": 50,
  "topk1": 50,
  "topk2_warmup_steps_fraction": 0.0,
  "start_topk2": 50,
  "topk1_warmup_steps_fraction": 0.0,
  "start_topk1": 50,
  "topk2_aux": 512,
  "cartesian_op": "mul",
  "router_depth": 2,
  "router_tree_width": null,
  "num_mkeys": 2,
  "num_nkeys": 8,
  "num_heads": 4096,
  "n_batches_to_dead": 10,
  "lr": 0.0008,
  "bandwidth": 0.001,
  "l1_coeff": 0.0018,
  "num_tokens": 799634235,
  "seq_len": 1024,
  "model_batch_size": 64,
  "num_batches_in_buffer": 5,
  "max_grad_norm": 1.0,
  "batch_size": 8192,
  "weight_decay": 0.0,
  "warmup_fraction": 0.1,
  "scheduler_type": "cosine_with_min_lr",
  "device": "cuda",
  "dtype": "torch.float32",
  "sae_dtype": "torch.float32",
  "dataset_path": "cerebras/SlimPajama-627B",
  "wandb_project": "turbo-llama-lens",
  "enable_wandb": true,
  "sae_name": "sae",
  "seed": 42,
  "performance_log_steps": 100,
  "save_checkpoint_steps": 15000000,
  "wandb_run_suffix": "exp80_bench",
  "sweep_pair": "{'dict_size': 65536, 'num_heads': 4096, 'num_mkeys': 2, 'num_nkeys': 8, 'num_tokens': 799634235, 'sae_type': 'mul_fractal_topk', 'start_topk1': 50, 'start_topk2': 50, 'topk1': 50}"
}