from dataclasses import dataclass, asdict

from transformers import PretrainedConfig


@dataclass
class QuantNoiseConfig:
    """Quant-Noise settings: quantization noise probability and block size."""

    pq: float = 0.0
    pq_block_size: int = 8

    def to_dict(self):
        # Serialize to a plain dict so it can be embedded in a JSON config.
        return asdict(self)

    @classmethod
    def from_dict(cls, data):
        return cls(**data)


repo_name = "damilojohn/AfroLid"


class AfroLidConfig(PretrainedConfig):
    model_type = "afrolid"

    def __init__(
        self,
        encoder_vocab_size=64001,
        decoder_vocab_size=528,
        embed_dim=768,
        ffn_dim=3072,
        num_heads=12,
        num_layers=12,
        max_seq_len=512,
        dropout=0.1,
        attention_dropout=0.1,
        activation_dropout=0.0,
        layerdrop=0.0,
        normalize_before=False,
        learned_pos=False,
        max_source_positions=1024,
        max_target_positions=1024,
        no_token_positional_embeddings=False,
        share_decoder_input_output_embed=True,
        share_all_embeddings=False,
        layernorm_embedding=False,
        checkpoint_activations=False,
        offload_activations=False,
        bias=False,
        quant_noise=None,
        **kwargs,
    ):
        """
        AfroLid configuration class for an encoder-decoder transformer model,
        with support for an embedded QuantNoiseConfig.
        """
        self.encoder_vocab_size = encoder_vocab_size
        self.decoder_vocab_size = decoder_vocab_size
        self.embed_dim = embed_dim
        self.ffn_dim = ffn_dim
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.max_seq_len = max_seq_len
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.layerdrop = layerdrop
        self.normalize_before = normalize_before
        self.learned_pos = learned_pos
        self.max_source_positions = max_source_positions
        self.max_target_positions = max_target_positions
        self.no_token_positional_embeddings = no_token_positional_embeddings
        self.share_decoder_input_output_embed = share_decoder_input_output_embed
        self.share_all_embeddings = share_all_embeddings
        self.layernorm_embedding = layernorm_embedding
        self.checkpoint_activations = checkpoint_activations
        self.offload_activations = offload_activations
        self.bias = bias
        # The docstring promises QuantNoiseConfig support, so wire it in here:
        # store the settings as a plain dict so the config stays JSON-serializable,
        # accepting either a QuantNoiseConfig instance or a dict.
        if quant_noise is None:
            quant_noise = QuantNoiseConfig()
        if isinstance(quant_noise, QuantNoiseConfig):
            quant_noise = quant_noise.to_dict()
        self.quant_noise = quant_noise
        super().__init__(**kwargs)
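

# --- Minimal usage sketch (not part of the original module). It assumes the
# quant_noise wiring above; the "./afrolid-config" path is illustrative. It
# shows that the embedded QuantNoiseConfig round-trips through the standard
# save_pretrained / from_pretrained serialization as a plain dict.
if __name__ == "__main__":
    config = AfroLidConfig(quant_noise=QuantNoiseConfig(pq=0.1, pq_block_size=8))
    config.save_pretrained("./afrolid-config")

    reloaded = AfroLidConfig.from_pretrained("./afrolid-config")
    # quant_noise survives serialization as a dict; rebuild the dataclass from it.
    print(QuantNoiseConfig.from_dict(reloaded.quant_noise))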