Create tokenization_octagon.py
tokenization_octagon.py (ADDED, +54 -0)
@@ -0,0 +1,54 @@
from transformers import PreTrainedTokenizerFast
from tokenizers import Tokenizer, normalizers, pre_tokenizers, trainers, models
from tokenizers.normalizers import Lowercase, NFD, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from typing import Optional, List, Union

class OctagonTokenizer(PreTrainedTokenizerFast):
    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        **kwargs
    ):
        super().__init__(
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs
        )

    @classmethod
    def train_tokenizer(cls, texts: List[str], vocab_size: int = 30522, save_path: Optional[str] = None):
        # Initialize a tokenizer
        tokenizer = Tokenizer(models.BPE())

        # Normalizer
        tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])

        # Pre-tokenizer
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

        # Trainer
        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
        )

        # Train the tokenizer
        tokenizer.train_from_iterator(texts, trainer=trainer)

        # Save if path is provided
        if save_path:
            tokenizer.save(save_path)

        return cls(tokenizer_file=save_path) if save_path else cls(tokenizer_object=tokenizer)
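A minimal usage sketch, not part of the committed file: it assumes the module is importable as tokenization_octagon and uses a tiny made-up corpus and illustrative paths. It trains the BPE vocabulary in memory via train_tokenizer, encodes a sentence, and saves the result in the standard Transformers layout; passing save_path instead writes the raw tokenizers JSON and returns an instance built from that file.

# Hypothetical example -- corpus, vocab size, and paths are illustrative only.
from tokenization_octagon import OctagonTokenizer

corpus = [
    "An octagon has eight sides.",
    "Tokenizers split raw text into subword units.",
    "Byte-pair encoding merges frequent character pairs.",
]

# No save_path: train_tokenizer wraps the in-memory tokenizers.Tokenizer directly.
tok = OctagonTokenizer.train_tokenizer(corpus, vocab_size=500)

enc = tok("An octagon has eight sides.")
print(enc["input_ids"])
print(tok.convert_ids_to_tokens(enc["input_ids"]))

# Persist in the usual Transformers format (tokenizer.json plus config/special-tokens files).
tok.save_pretrained("./octagon-tokenizer")

# Alternatively, pass save_path to write the raw tokenizers JSON and construct the
# class from that file instead of from the in-memory object.
tok2 = OctagonTokenizer.train_tokenizer(corpus, vocab_size=500, save_path="octagon.json")

Because no post-processor is configured, encoded sequences will not get [CLS]/[SEP] added automatically; the special tokens exist in the vocabulary but are only attached if a template processor is set up separately.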