from transformers import PreTrainedTokenizerFast
from tokenizers import Tokenizer, normalizers, pre_tokenizers, trainers, models
from tokenizers.normalizers import Lowercase, NFD, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from typing import Optional, List


class OctagonTokenizer(PreTrainedTokenizerFast):
    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        **kwargs,
    ):
        # vocab_file and merges_file are accepted for API compatibility; the fast
        # tokenizer itself is loaded from tokenizer_file (or a tokenizer_object in kwargs).
        super().__init__(
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

    @classmethod
    def train_tokenizer(cls, texts: List[str], vocab_size: int = 30522, save_path: Optional[str] = None):
        # Initialize a BPE model; setting unk_token lets the model emit [UNK]
        # for unseen symbols instead of silently dropping them.
        tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

        # Normalizer: NFD-decompose, lowercase, then strip accents.
        tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])

        # Pre-tokenizer: split on whitespace and punctuation before BPE merges.
        tokenizer.pre_tokenizer = Whitespace()

        # Trainer: learn merges up to vocab_size, reserving the special tokens.
        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
        )

        # Train the tokenizer on the in-memory corpus.
        tokenizer.train_from_iterator(texts, trainer=trainer)

        # Save the trained tokenizer if a path is provided.
        if save_path:
            tokenizer.save(save_path)

        # Wrap the result: reload from disk when saved, otherwise wrap the in-memory object.
        return cls(tokenizer_file=save_path) if save_path else cls(tokenizer_object=tokenizer)
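
# Usage sketch (hypothetical example): the tiny corpus, vocab_size, and the
# save path below are illustrative assumptions, not values from the source.
if __name__ == "__main__":
    corpus = [
        "Tokenizers split raw text into subword units.",
        "Byte-pair encoding repeatedly merges the most frequent symbol pairs.",
    ]

    # Train a small tokenizer in memory and inspect its output on a sample string.
    tok = OctagonTokenizer.train_tokenizer(corpus, vocab_size=500)
    print(tok.tokenize("Tokenizers merge frequent pairs."))

    # Passing save_path writes a tokenizer JSON file and reloads the tokenizer from it.
    tok_on_disk = OctagonTokenizer.train_tokenizer(corpus, vocab_size=500, save_path="octagon_tokenizer.json")
    print(tok_on_disk("Tokenizers merge frequent pairs.")["input_ids"])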