from typing import List, Optional

from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers
from tokenizers.normalizers import NFD, Lowercase, StripAccents
from transformers import PreTrainedTokenizerFast


class OctagonTokenizer(PreTrainedTokenizerFast):
    """A fast BPE tokenizer with BERT-style special tokens.

    Wraps a `tokenizers` BPE model behind the `PreTrainedTokenizerFast` API.
    """

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        **kwargs
    ):
        # vocab_file and merges_file are accepted for signature compatibility
        # with slow tokenizers; the fast backend is built from tokenizer_file
        # (or a tokenizer_object passed through kwargs).
        super().__init__(
            vocab_file=vocab_file,
            merges_file=merges_file,
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs
        )

    @classmethod
    def train_tokenizer(cls, texts: List[str], vocab_size: int = 30522, save_path: Optional[str] = None):
        """Train a BPE tokenizer on an in-memory corpus and return a wrapped instance."""
        # Give BPE an explicit unknown token so out-of-vocabulary pieces map
        # to [UNK] instead of raising during encoding.
        tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

        # Unicode-normalize (NFD), lowercase, then strip combining accents.
        tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])

        # Split into word and punctuation pieces before BPE merges are learned.
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
        )
        tokenizer.train_from_iterator(texts, trainer=trainer)

        if save_path:
            tokenizer.save(save_path)

        return cls(tokenizer_file=save_path) if save_path else cls(tokenizer_object=tokenizer)
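

# A minimal usage sketch, not part of the class itself: train on a tiny
# illustrative corpus and round-trip a sentence. The corpus and the chosen
# vocab_size below are assumptions for demonstration only.
if __name__ == "__main__":
    corpus = [
        "Octagons have eight sides.",
        "Tokenizers split text into subword units.",
    ]
    tok = OctagonTokenizer.train_tokenizer(corpus, vocab_size=1000)

    encoded = tok("Octagons have eight sides.")
    print(encoded["input_ids"])
    print(tok.convert_ids_to_tokens(encoded["input_ids"]))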