LightFury9 committed
Commit fc07bd1 · verified · Parent: 1b8622a

Upload 3 files

tenglish_arcade.tiktoken ADDED
The diff for this file is too large to render. See raw diff
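For reference, a `.tiktoken` vocabulary is a plain-text file with one entry per line: a base64-encoded token followed by its integer rank, which is exactly the layout parsed by `_load_tiktoken_bpe` and written by `save_vocabulary` in tokenization_arcade100k.py below. A minimal sketch of the format (the toy file name and tokens are illustrative, not entries from the real vocabulary):

import base64

# Write a toy vocabulary in the same "base64(token) rank" layout (illustrative only).
with open("toy.tiktoken", "w", encoding="utf8") as f:
    for rank, token in enumerate([b"hello", b" world", b"!"]):
        f.write(base64.b64encode(token).decode("utf8") + " " + str(rank) + "\n")

# Parse it back the same way _load_tiktoken_bpe does.
with open("toy.tiktoken", "rb") as f:
    ranks = {
        base64.b64decode(tok): int(rank)
        for tok, rank in (line.split() for line in f.read().splitlines() if line)
    }
print(ranks)  # {b'hello': 0, b' world': 1, b'!': 2}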
 
tokenization_arcade100k.py ADDED
@@ -0,0 +1,289 @@
# coding=utf-8

"""Tokenization classes for Tenglish Arcade."""

import base64
import os
import unicodedata
from typing import Collection, Dict, List, Set, Tuple, Union

import tiktoken
from transformers.utils import logging
from transformers import PreTrainedTokenizer, AddedToken

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "tenglish_arcade.tiktoken"}
NAME = "tenglish_arcade"


def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
    with open(tiktoken_bpe_file, "rb") as f:
        contents = f.read()
    return {
        base64.b64decode(token): int(rank)
        for token, rank in (line.split() for line in contents.splitlines() if line)
    }


ENDOFTEXT = "<|endoftext|>"
FIM = [
    "<|fim_prefix|>",
    "<|fim_middle|>",
    "<|fim_suffix|>",
    "<|fim_pad|>",
]
# `StarCoder` Tokens
CODE = [
    "<gh_stars>",
    "<filename>",
    "<issue_start>",
    "<issue_comment>",
    "<issue_closed>",
    "<jupyter_start>",
    "<jupyter_text>",
    "<jupyter_code>",
    "<jupyter_output>",
    "<empty_output>",
    "<commit_before>",
    "<commit_msg>",
    "<commit_after>",
    "<reponame>",
]
CHAT = [
    "<|im_start|>",  # Chat: Input message start
    "<|im_end|>",  # Chat: Input message end
]
PAUSE = "<|pause|>"  # Think before you speak (https://arxiv.org/abs/2310.02226)
REGISTERS = [
    f"<|reg{i}|>" for i in range(0, 8)
]  # Register 0 sink token (https://arxiv.org/abs/2309.17453)
ENDOFPROMPT = "<|endofprompt|>"
SPECIAL_TOKENS_NAMES = (
    [ENDOFTEXT]
    + FIM
    + CODE
    + [ENDOFPROMPT]
    + CHAT
    + [PAUSE]
    + REGISTERS
    + ["<|extra0|>"]
)
START_ID = 114797 + 1  # first special-token id (100257 in the original arcade100k)
SPECIAL_TOKENS = {t: START_ID + i for i, t in enumerate(SPECIAL_TOKENS_NAMES)}


def _arcade100k(vocab_file: str):
    mergeable_ranks = _load_tiktoken_bpe(vocab_file)

    return {
        "name": NAME,
        "pat_str": r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""",
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": SPECIAL_TOKENS,
    }


class Arcade100kTokenizer(PreTrainedTokenizer):
    """
    Construct an Arcade100k tokenizer backed by `tiktoken`.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        errors (`str`, *optional*, defaults to `"replace"`):
            How to handle errors in decoding UTF-8 byte sequences.
            WARNING: the default behaviour of this function is lossy, since decoded bytes are not
            guaranteed to be valid UTF-8. You can control this behaviour using the `errors` parameter,
            for instance, setting `errors="strict"`.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file: str,
        errors: str = "replace",
        **kwargs,
    ):
        super().__init__(errors=errors, **kwargs)
        self.errors = errors

        self._tiktoken_config = _arcade100k(vocab_file)
        self.tokenizer = tiktoken.Encoding(**self._tiktoken_config)

        # TODO: Remove this assertion
        assert (
            len(self.tokenizer._mergeable_ranks)
            + len(self.tokenizer._special_tokens)
            + 1
            == self.tokenizer.n_vocab
        ), f"{len(self.tokenizer._mergeable_ranks) + len(self.tokenizer._special_tokens)} != {self.tokenizer.n_vocab} in encoding"

        self.decoder = {i: n for n, i in self.tokenizer._mergeable_ranks.items()}
        self.decoder.update({i: n for n, i in self.tokenizer._special_tokens.items()})
        # Provide default `eos_token` and `pad_token`
        if self.eos_token is None:
            self.eos_token = self.decoder[self.tokenizer.eot_token]
        if self.pad_token is None:
            self.pad_token = self.decoder[self.tokenizer.pad_token]

        # Expose for convenience
        self.mergeable_ranks = self.tokenizer._mergeable_ranks
        self.special_tokens = self.tokenizer._special_tokens

    def __len__(self):
        return self.tokenizer.n_vocab

    def __getstate__(self):
        # Required for `pickle` support
        state = self.__dict__.copy()
        del state["tokenizer"]
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        self.tokenizer = tiktoken.Encoding(**self._tiktoken_config)

    @property
    def vocab_size(self):
        return self.tokenizer.n_vocab

    def get_vocab(self) -> Dict[bytes, int]:
        return self.tokenizer._mergeable_ranks

    def convert_tokens_to_ids(
        self, tokens: Union[bytes, str, List[Union[bytes, str]]]
    ) -> Union[int, List[int]]:
        ids = []
        if isinstance(tokens, (str, bytes)):
            if tokens in self.tokenizer._special_tokens:
                return self.tokenizer._special_tokens[tokens]
            else:
                return self.tokenizer._mergeable_ranks.get(tokens)
        for token in tokens:
            if token in self.tokenizer._special_tokens:
                ids.append(self.tokenizer._special_tokens[token])
            else:
                ids.append(self.tokenizer._mergeable_ranks.get(token))
        return ids

    def _add_tokens(
        self,
        new_tokens: Union[List[str], List[AddedToken]],
        special_tokens: bool = False,
    ) -> int:
        if not special_tokens and new_tokens:
            raise ValueError("Adding regular tokens is not supported")
        for token in new_tokens:
            surface_form = token.content if isinstance(token, AddedToken) else token
            if surface_form not in SPECIAL_TOKENS:
                raise ValueError("Adding unknown special tokens is not supported")
        return 0

    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
        """
        Save only the vocabulary of the tokenizer (the tiktoken rank file).

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        # Write under the name declared in VOCAB_FILES_NAMES so `from_pretrained` can find it again.
        file_path = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
        with open(file_path, "w", encoding="utf8") as w:
            for k, v in self.tokenizer._mergeable_ranks.items():
                line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
                w.write(line)
        return (file_path,)

    def tokenize(
        self,
        text: str,
        allowed_special: Union[Set, str] = "all",
        disallowed_special: Union[Collection, str] = (),
        **kwargs,
    ) -> List[Union[bytes, str]]:
        """
        Converts a string into a sequence of tokens.

        Args:
            text (`str`):
                The sequence to be encoded.
            allowed_special (`Literal["all"]` or `set`):
                The surface forms of the tokens to be encoded as special tokens in regular texts.
                Defaults to "all".
            disallowed_special (`Literal["all"]` or `Collection`):
                The surface forms of the tokens that should not be in regular texts and trigger errors.
                Defaults to an empty tuple.

            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific encode method.

        Returns:
            `List[bytes|str]`: The list of tokens.
        """
        tokens = []
        text = unicodedata.normalize("NFC", text)

        # this implementation takes a detour: text -> token id -> token surface forms
        for t in self.tokenizer.encode(
            text, allowed_special=allowed_special, disallowed_special=disallowed_special
        ):
            tokens.append(self.decoder[t])
        return tokens

    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
        """
        Converts a sequence of tokens into a single string.
        """
        text = ""
        temp = b""
        for t in tokens:
            if isinstance(t, str):
                if temp:
                    text += temp.decode("utf-8", errors=self.errors)
                    temp = b""
                text += t
            elif isinstance(t, bytes):
                temp += t
            else:
                raise TypeError("token should only be of type bytes or str")
        if temp:
            text += temp.decode("utf-8", errors=self.errors)
        return text

    def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
        """Converts an id to a token, special tokens included"""
        if index in self.decoder:
            return self.decoder[index]
        raise ValueError("unknown ids")

    def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
        """Converts a token to an id using the vocab, special tokens included"""
        if token in self.tokenizer._special_tokens:
            return self.tokenizer._special_tokens[token]
        if token in self.tokenizer._mergeable_ranks:
            return self.tokenizer._mergeable_ranks[token]
        raise ValueError("unknown token")

    def _tokenize(self, text: str, **kwargs):
        """
        Converts a string into a sequence of tokens (string), using the tokenizer. Split in words for word-based
        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePiece/WordPiece).

        Does NOT take care of added tokens.
        """
        raise NotImplementedError

    def _decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        errors: str = None,
        **kwargs,
    ) -> str:
        if isinstance(token_ids, int):
            token_ids = [token_ids]
        if skip_special_tokens:
            token_ids = [i for i in token_ids if i < self.tokenizer.eot_token]
        return self.tokenizer.decode(token_ids)
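
A minimal usage sketch for the class above, assuming the module is saved as tokenization_arcade100k.py next to the uploaded tenglish_arcade.tiktoken and a transformers version that supports this slow-tokenizer pattern; the sample sentence is purely illustrative:

from tokenization_arcade100k import Arcade100kTokenizer

# Instantiate directly from the uploaded tiktoken vocabulary file.
tok = Arcade100kTokenizer("tenglish_arcade.tiktoken")

text = "nenu bagunnanu, meeru ela unnaru?"  # illustrative Tenglish input
ids = tok(text)["input_ids"]                # standard PreTrainedTokenizer __call__ API
print(ids)
print(tok.decode(ids))                      # decoding goes straight through tiktoken via the overridden _decode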
tokenizer_config.json ADDED
@@ -0,0 +1,11 @@
{
  "tokenizer_class": "Arcade100kTokenizer",
  "auto_map": {
    "AutoTokenizer": [
      "tokenization_arcade100k.Arcade100kTokenizer",
      null
    ]
  },
  "eos_token": "<|endoftext|>",
  "pad_token": "<|endoftext|>"
}
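
The `auto_map` entry above is what lets `AutoTokenizer` import `Arcade100kTokenizer` from tokenization_arcade100k.py when the repository is loaded with `trust_remote_code=True`; the second list slot is `null` because no fast tokenizer is provided. A sketch of loading it from the Hub (the repository id is a placeholder, not the confirmed path of this upload):

from transformers import AutoTokenizer

# Placeholder repo id; substitute the actual Hub path of this upload.
tok = AutoTokenizer.from_pretrained("LightFury9/tenglish-arcade", trust_remote_code=True)

print(tok.eos_token, tok.pad_token)  # both "<|endoftext|>", as set in tokenizer_config.json
print(tok("<|im_start|>user\nhello<|im_end|>")["input_ids"])  # chat markers encode to single special ids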