# Hyperparameters and config settings
EMBED_DIM = 256      # Size of token embeddings
NUM_HEADS = 8        # Number of attention heads
NUM_LAYERS = 4       # Number of transformer blocks
FF_DIM = 512         # Feedforward layer dimension
MAX_SEQ_LEN = 256    # Maximum sequence length
VOCAB_SIZE = 100     # Placeholder; overridden from the dataset's tokenizer at load time
ADAPTER_DIM = 32     # Bottleneck width of the adapter used for continual learning
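
# A minimal sketch of how these settings could be consumed, assuming PyTorch.
# The Adapter and TransformerBlock classes below are illustrative names, not
# part of the original code: a bottleneck adapter of width ADAPTER_DIM is
# inserted after each block's feedforward sublayer, the usual placement for
# continual-learning adapters.
import torch
import torch.nn as nn

class Adapter(nn.Module):
    """Bottleneck adapter: project down to adapter_dim, apply a nonlinearity,
    project back up, with a residual connection. Zero-initializing the up
    projection makes the adapter start as an identity function."""
    def __init__(self, embed_dim: int, adapter_dim: int):
        super().__init__()
        self.down = nn.Linear(embed_dim, adapter_dim)
        self.up = nn.Linear(adapter_dim, embed_dim)
        nn.init.zeros_(self.up.weight)
        nn.init.zeros_(self.up.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + self.up(torch.relu(self.down(x)))

class TransformerBlock(nn.Module):
    """One transformer block wired from the config values above."""
    def __init__(self, embed_dim: int, num_heads: int, ff_dim: int, adapter_dim: int):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim),
        )
        self.adapter = Adapter(embed_dim, adapter_dim)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        attn_out, _ = self.attn(x, x, x)       # self-attention over the sequence
        x = self.norm1(x + attn_out)           # residual + norm
        x = self.norm2(x + self.adapter(self.ff(x)))  # feedforward -> adapter -> residual + norm
        return x

# NUM_LAYERS such blocks would then be stacked, e.g.:
# blocks = nn.ModuleList(
#     TransformerBlock(EMBED_DIM, NUM_HEADS, FF_DIM, ADAPTER_DIM)
#     for _ in range(NUM_LAYERS)
# )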