Georg4000 committed (verified)
Commit 2ae8b59 · 1 Parent(s): e1386e9

Create tokenization_octagon.py

Files changed (1): tokenization_octagon.py (+54, -0)
tokenization_octagon.py ADDED
from typing import List, Optional

from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers
from tokenizers.normalizers import NFD, Lowercase, StripAccents
from transformers import PreTrainedTokenizerFast


class OctagonTokenizer(PreTrainedTokenizerFast):
    def __init__(
        self,
        vocab_file=None,   # accepted for API compatibility; the fast tokenizer loads from tokenizer_file
        merges_file=None,  # accepted for API compatibility; unused here
        tokenizer_file=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        **kwargs,
    ):
        super().__init__(
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

    @classmethod
    def train_tokenizer(cls, texts: List[str], vocab_size: int = 30522, save_path: Optional[str] = None):
        # Initialize a BPE tokenizer; map out-of-vocabulary tokens to [UNK].
        tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

        # Normalizer: Unicode-decompose (NFD), lowercase, then strip accents.
        tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])

        # Pre-tokenizer: split on whitespace and punctuation.
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

        # Trainer: learn BPE merges up to vocab_size, reserving the special tokens.
        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
        )

        # Train on the in-memory corpus.
        tokenizer.train_from_iterator(texts, trainer=trainer)

        # Save the trained tokenizer if a path is provided.
        if save_path:
            tokenizer.save(save_path)

        # Reload from disk if saved; otherwise wrap the in-memory tokenizer object.
        return cls(tokenizer_file=save_path) if save_path else cls(tokenizer_object=tokenizer)
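
A minimal usage sketch of the class above (the toy corpus, vocabulary size, and sample sentence are illustrative, not part of the commit):

if __name__ == "__main__":
    # Toy corpus; train_tokenizer accepts any list of strings.
    corpus = [
        "Octagon models tokenize text with a trained BPE vocabulary.",
        "Training data can be any collection of raw strings.",
    ]

    # Train a small tokenizer in memory (no save_path, so nothing is written to disk).
    tokenizer = OctagonTokenizer.train_tokenizer(corpus, vocab_size=500)

    # Encode and decode a sample sentence through the standard fast-tokenizer API.
    encoding = tokenizer("Octagon tokenizes text")
    print(encoding["input_ids"])
    print(tokenizer.decode(encoding["input_ids"]))

Note that the trained tokenizer defines no post-processor, so encodings carry no automatic [CLS]/[SEP] wrapping; callers who need BERT-style sequence templates would add one before training.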