luisra committed
Commit ab8cd14 · verified · 1 Parent(s): 944a6cf

Uploading tokenization_kimi.py

Files changed (1)
  1. tokenization_kimi.py +323 -0
tokenization_kimi.py ADDED
import os
import tiktoken

from logging import getLogger
from pathlib import Path
from typing import (
    cast,
    Tuple,
    Dict,
    Iterator,
    List,
    Union,
    Optional,
)
from shutil import copyfile
from tiktoken.load import load_tiktoken_bpe
from tokenizers import AddedToken, pre_tokenizers, Regex
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode


logger = getLogger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "tiktoken.model"}

class TikTokenTokenizer(PreTrainedTokenizer):
    """
    Tokenizing and encoding/decoding text using the Tiktoken tokenizer. See megatron/tokenizer/tiktoken_tokenizer.py.

    This tokenizer inherits from [`PreTrainedTokenizer`], which contains most of the main methods. Users should refer
    to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            The path to the Tiktoken model file.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[BOS]"`):
            The beginning-of-sequence token that was used during pretraining. Can be used as a sequence
            classifier token.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[EOS]"`):
            The end-of-sequence token.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
            this token instead.
        pad_token (`str` or `tokenizers.AddedToken`, *optional*):
            The token used for padding, for example when batching sequences of different lengths.
        additional_special_tokens (list of `str`, *optional*):
            A tuple or a list of additional tokens, which will be marked as `special`, meaning that they will be
            skipped when decoding if `skip_special_tokens` is set to `True`.
    """

    vocab_files_names = VOCAB_FILES_NAMES

    model_input_names = ["input_ids", "attention_mask"]

    special_tokens: Dict[str, int]

    num_reserved_special_tokens = 256

    # tiktoken pre-tokenization pattern. The alternatives, in order: runs of
    # Han characters; two casing variants of words with optional English
    # contraction suffixes (Han excluded via `&&` class intersection);
    # 1-3 digit number groups; punctuation/symbol runs; newline runs; and
    # remaining whitespace.
    pat_str = "|".join(
        [
            r"""[\p{Han}]+""",
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""\p{N}{1,3}""",
            r""" ?[^\s\p{L}\p{N}]+[\r\n]*""",
            r"""\s*[\r\n]+""",
            r"""\s+(?!\S)""",
            r"""\s+""",
        ]
    )

    def __init__(
        self,
        vocab_file,
        bos_token: Union[str, AddedToken] = "[BOS]",
        eos_token: Union[str, AddedToken] = "[EOS]",
        unk_token: Union[str, AddedToken, None] = None,
        pad_token: Union[str, AddedToken, None] = None,
        additional_special_tokens: Optional[List[str]] = None,
        added_tokens_decoder: Optional[dict] = None,
        **kwargs,
    ):
        assert os.path.isfile(vocab_file), vocab_file

        if additional_special_tokens is None:
            additional_special_tokens = [
                "<|im_end|>",
                "<|im_user|>",
                "<|im_assistant|>",
                "<|start_header_id|>",
                "<|end_header_id|>",
                "[EOT]",
                "<|im_system|>",
                "<|im_middle|>",
            ]

        # `added_tokens_decoder` maps token IDs to AddedToken objects; guard
        # against the `None` default so the comprehension below cannot fail.
        if added_tokens_decoder is None:
            added_tokens_decoder = {}
        special_tokens_mapping = {
            i: added_tokens_decoder[i].content for i in added_tokens_decoder
        }

        self.vocab_file = vocab_file
        mergeable_ranks = load_tiktoken_bpe(vocab_file)
        num_base_tokens = len(mergeable_ranks)
        # Special token IDs sit directly above the base vocabulary; IDs without
        # a configured name fall back to `<|reserved_token_{i}|>` placeholders.
        self.special_tokens = {
            special_tokens_mapping.get(i, f"<|reserved_token_{i}|>"): i
            for i in range(
                num_base_tokens, num_base_tokens + self.num_reserved_special_tokens + 2
            )
        }

        self.model = tiktoken.Encoding(
            name=Path(vocab_file).name,
            pat_str=self.pat_str,
            mergeable_ranks=mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        logger.info(f"Reloaded tiktoken model from {vocab_file}")

        self.n_words: int = self.model.n_vocab
        # BOS / EOS token IDs (the token strings must appear in `self.special_tokens`).
        self.bos_id: int = self.special_tokens[str(bos_token)]
        self.eos_id: int = self.special_tokens[str(eos_token)]
        logger.info(
            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
        )

        # Fall back to -1 (a sentinel meaning "unset") when pad/unk tokens are
        # not configured, rather than failing on a `str(None)` lookup.
        self.pad_id: int = self.special_tokens[str(pad_token)] if pad_token is not None else -1
        self.unk_id: int = self.special_tokens[str(unk_token)] if unk_token is not None else -1

        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}

        # Build a GPT-2-style string representation for every token ID so the
        # slow-tokenizer API (_tokenize, _convert_token_to_id, ...) works.
        self.decoder = {}
        for i in range(self.n_words):
            # Taken from https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
            decoding = ''.join([
                self.byte_encoder[ord(char)] for char in
                self.model.decode_single_token_bytes(i).decode('latin-1')
            ])
            self.decoder[i] = decoding

        self.encoder = {}
        for i in range(self.n_words):
            if i in self.decoder:
                self.encoder[self.decoder[i]] = i

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )
        self.all_special_ids_set = set(self.all_special_ids)

    def encode(
        self,
        text: str,
        allow_special_tokens: bool = True,
        **kwargs
    ) -> List[int]:
        """
        Encodes a string into a list of token IDs.

        Args:
            text (str): The input string to be encoded.
            allow_special_tokens (bool): Whether strings matching special
                tokens are encoded as their special-token IDs (default) or
                treated as ordinary text.

        Returns:
            list[int]: A list of token IDs.
        """
        # If there are extra args, delegate to super().encode, since a lot of
        # code there handles them; super().encode eventually calls _tokenize
        # and _convert_token_to_id.
        # NOTE: this encode method is not compatible with super().encode,
        # e.g. split_special_tokens' default is True in this encode method.
        if len(kwargs) > 0:
            logger.warning(f"Calling super().encode with {kwargs}")
            return super().encode(text, **kwargs)

        assert type(text) is str

        # The tiktoken tokenizer can handle <=400k chars without
        # pyo3_runtime.PanicException.
        TIKTOKEN_MAX_ENCODE_CHARS = 400_000

        # https://github.com/openai/tiktoken/issues/195
        # Here we iterate over subsequences and split if we exceed the limit
        # of max consecutive non-whitespace or whitespace characters.
        MAX_NO_WHITESPACES_CHARS = 25_000

        texts = self.pre_tokenizer_process(text)

        all_substrs = []
        for text in texts:
            substrs = (
                substr
                for i in range(0, len(text), TIKTOKEN_MAX_ENCODE_CHARS)
                for substr in self._split_whitespaces_or_nonwhitespaces(
                    text[i: i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
                )
            )
            all_substrs.extend(substrs)

        t: List[int] = []
        for substr in all_substrs:
            if allow_special_tokens:
                t.extend(
                    # encode special-token strings as their special-token IDs
                    self.model.encode(
                        substr,
                        allowed_special="all",
                    )
                )
            else:
                t.extend(
                    # treat special-token strings as ordinary text
                    self.model.encode(
                        substr,
                        disallowed_special=(),
                    )
                )

        return t
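
    # Example: for text containing "<|im_end|>", encode(text) (the default
    # allow_special_tokens=True) emits the single special-token ID, whereas
    # encode(text, allow_special_tokens=False) tokenizes the literal
    # characters of "<|im_end|>" as ordinary text.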

    def decode(
        self,
        token_ids: Union[int, List[int]],
        **kwargs
    ) -> str:
        """
        Decodes a list of token IDs into a string.

        Args:
            token_ids (List[int]): The list of token IDs to be decoded.

        Returns:
            str: The decoded string.
        """
        # If there are extra args, delegate to super().decode, since a lot of
        # code there handles them; super().decode eventually calls
        # convert_tokens_to_string and _convert_id_to_token.
        if len(kwargs) > 0:
            return super().decode(token_ids, **kwargs)

        if type(token_ids) is int:
            token_ids = [token_ids]

        return self.model.decode(cast(List[int], token_ids))

    @staticmethod
    def _split_whitespaces_or_nonwhitespaces(
        s: str, max_consecutive_slice_len: int
    ) -> Iterator[str]:
        """
        Splits the string `s` so that each substring contains no more than
        `max_consecutive_slice_len` consecutive whitespace or consecutive
        non-whitespace characters.
        """
        current_slice_len = 0
        current_slice_is_space = s[0].isspace() if len(s) > 0 else False
        slice_start = 0

        for i in range(len(s)):
            is_now_space = s[i].isspace()

            if current_slice_is_space ^ is_now_space:
                current_slice_len = 1
                current_slice_is_space = is_now_space
            else:
                current_slice_len += 1
                if current_slice_len > max_consecutive_slice_len:
                    yield s[slice_start:i]
                    slice_start = i
                    current_slice_len = 1
        yield s[slice_start:]
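
    # Example: with max_consecutive_slice_len=3, "aaaa   bb" is yielded as
    # "aaa" followed by "a   bb"; no chunk contains more than 3 consecutive
    # characters of the same (whitespace vs. non-whitespace) class, which is
    # what keeps tiktoken clear of the pathological case in issue #195 above.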

    def pre_tokenizer_process(self, text: str) -> List[str]:
        """
        Pre-tokenizes the input text into a list of chunks for internal
        processing. The base implementation returns the text unchanged;
        subclasses can override it to split the input first.
        """
        return [text]

    # ----- Below are the abstract methods required by PreTrainedTokenizer -----
    @property
    def vocab_size(self) -> int:
        return self.n_words

    def get_vocab(self) -> Dict[str, int]:
        return self.encoder

    def _tokenize(self, text: str, **kwargs) -> List[str]:
        return [
            self.decoder[t]
            for t in self.encode(text)
        ]

    def _convert_token_to_id(self, token: str) -> int:
        return self.encoder.get(token, self.unk_id)

    def _convert_id_to_token(self, index: int) -> str:
        return self.decoder.get(index)

    @staticmethod
    def clean_up_tokenization(out_string: str) -> str:
        return out_string

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        # Map the byte-level unicode representation back to raw bytes, then
        # decode as UTF-8 (replacing invalid sequences).
        text = ''.join(tokens)
        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', 'replace')
        return text

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            raise ValueError(f"vocabulary path ({save_directory}) should be a directory")
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)
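
A minimal usage sketch, assuming the repository also ships the `tiktoken.model` vocab file and a `tokenizer_config.json` whose `auto_map` points at `TikTokenTokenizer` and whose `added_tokens_decoder` assigns names such as `[BOS]` and `[EOS]` to the reserved ID range; the repo ID below is illustrative:

from transformers import AutoTokenizer

# Illustrative repo ID; trust_remote_code=True is needed so that the
# custom tokenization_kimi.py module above is downloaded and used.
tokenizer = AutoTokenizer.from_pretrained(
    "moonshotai/Kimi-K2-Instruct",
    trust_remote_code=True,
)

ids = tokenizer.encode("Hello, world!")  # fast tiktoken path (no extra kwargs)
assert tokenizer.decode(ids) == "Hello, world!"  # byte-level BPE round-trips exactly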