Gustavo de Rosa committed on
Commit
9936ea0
·
1 Parent(s): 64459b4

chore(root): Updates tokenizer files.

Browse files
added_tokens.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<|dummy_0|>": 100256,
3
+ "<|endoftext|>": 100257,
4
+ "<|fim_prefix|>": 100258,
5
+ "<|fim_middle|>": 100259,
6
+ "<|fim_suffix|>": 100260,
7
+ "<|dummy_1|>": 100261,
8
+ "<|dummy_2|>": 100262,
9
+ "<|dummy_3|>": 100263,
10
+ "<|im_start|>": 100264,
11
+ "<|im_end|>": 100265,
12
+ "<|im_sep|>": 100266,
13
+ "<|dummy_4|>": 100267,
14
+ "<|dummy_5|>": 100268,
15
+ "<|dummy_6|>": 100269,
16
+ "<|dummy_7|>": 100270,
17
+ "<|dummy_8|>": 100271,
18
+ "<|dummy_9|>": 100272,
19
+ "<|dummy_10|>": 100273,
20
+ "<|dummy_11|>": 100274,
21
+ "<|dummy_12|>": 100275,
22
+ "<|endofprompt|>": 100276,
23
+ "<|dummy_13|>": 100277,
24
+ "<|dummy_14|>": 100278,
25
+ "<|dummy_15|>": 100279,
26
+ "<|dummy_16|>": 100280,
27
+ "<|dummy_17|>": 100281,
28
+ "<|dummy_18|>": 100282,
29
+ "<|dummy_19|>": 100283,
30
+ "<|dummy_20|>": 100284,
31
+ "<|dummy_21|>": 100285,
32
+ "<|dummy_22|>": 100286,
33
+ "<|dummy_23|>": 100287,
34
+ "<|dummy_24|>": 100288,
35
+ "<|dummy_25|>": 100289,
36
+ "<|dummy_26|>": 100290,
37
+ "<|dummy_27|>": 100291,
38
+ "<|dummy_28|>": 100292,
39
+ "<|dummy_29|>": 100293,
40
+ "<|dummy_30|>": 100294,
41
+ "<|dummy_31|>": 100295,
42
+ "<|dummy_32|>": 100296,
43
+ "<|dummy_33|>": 100297,
44
+ "<|dummy_34|>": 100298,
45
+ "<|dummy_35|>": 100299,
46
+ "<|dummy_36|>": 100300,
47
+ "<|dummy_37|>": 100301,
48
+ "<|dummy_38|>": 100302,
49
+ "<|dummy_39|>": 100303,
50
+ "<|dummy_40|>": 100304,
51
+ "<|dummy_41|>": 100305,
52
+ "<|dummy_42|>": 100306,
53
+ "<|dummy_43|>": 100307,
54
+ "<|dummy_44|>": 100308,
55
+ "<|dummy_45|>": 100309,
56
+ "<|dummy_46|>": 100310,
57
+ "<|dummy_47|>": 100311,
58
+ "<|dummy_48|>": 100312,
59
+ "<|dummy_49|>": 100313,
60
+ "<|dummy_50|>": 100314,
61
+ "<|dummy_51|>": 100315,
62
+ "<|dummy_52|>": 100316,
63
+ "<|dummy_53|>": 100317,
64
+ "<|dummy_54|>": 100318,
65
+ "<|dummy_55|>": 100319,
66
+ "<|dummy_56|>": 100320,
67
+ "<|dummy_57|>": 100321,
68
+ "<|dummy_58|>": 100322,
69
+ "<|dummy_59|>": 100323,
70
+ "<|dummy_60|>": 100324,
71
+ "<|dummy_61|>": 100325,
72
+ "<|dummy_62|>": 100326,
73
+ "<|dummy_63|>": 100327,
74
+ "<|dummy_64|>": 100328,
75
+ "<|dummy_65|>": 100329,
76
+ "<|dummy_66|>": 100330,
77
+ "<|dummy_67|>": 100331,
78
+ "<|dummy_68|>": 100332,
79
+ "<|dummy_69|>": 100333,
80
+ "<|dummy_70|>": 100334,
81
+ "<|dummy_71|>": 100335,
82
+ "<|dummy_72|>": 100336,
83
+ "<|dummy_73|>": 100337,
84
+ "<|dummy_74|>": 100338,
85
+ "<|dummy_75|>": 100339,
86
+ "<|dummy_76|>": 100340,
87
+ "<|dummy_77|>": 100341,
88
+ "<|dummy_78|>": 100342,
89
+ "<|dummy_79|>": 100343,
90
+ "<|dummy_80|>": 100344,
91
+ "<|dummy_81|>": 100345,
92
+ "<|dummy_82|>": 100346,
93
+ "<|dummy_83|>": 100347,
94
+ "<|dummy_84|>": 100348,
95
+ "<|dummy_85|>": 100349,
96
+ "<|dummy_86|>": 100350,
97
+ "<|dummy_87|>": 100351
98
+ }
cl100k_base.tiktoken DELETED
The diff for this file is too large to render. See raw diff
 
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenization_phi4.py DELETED
@@ -1,306 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- """Tokenization classes for Phi-4."""
17
-
18
- import base64
19
- import os
20
- from functools import cached_property
21
- import re
22
- from typing import Collection, Dict, List, Optional, Set, Tuple, Union
23
-
24
- import requests
25
- import tiktoken
26
-
27
- from transformers import AddedToken, AutoConfig, PreTrainedTokenizer
28
- from transformers.models.auto.tokenization_auto import get_tokenizer_config
29
-
30
-
31
- PADDED_VOCAB_SIZE = 100352
32
- VOCAB_SIZE = 100276
33
- VOCAB_FILES_NAMES = {"vocab_file": "cl100k_base.tiktoken"}
34
-
35
- DUMMY_TOKENS = {f"<|dummy_{12 + offset}|>": VOCAB_SIZE + offset for offset in range(1, PADDED_VOCAB_SIZE - VOCAB_SIZE)}
36
- SPECIAL_TOKENS = {
37
- "<|dummy_0|>": 100256,
38
- "<|endoftext|>": 100257,
39
- "<|fim_prefix|>": 100258,
40
- "<|fim_middle|>": 100259,
41
- "<|fim_suffix|>": 100260,
42
- "<|dummy_1|>": 100261,
43
- "<|dummy_2|>": 100262,
44
- "<|dummy_3|>": 100263,
45
- "<|im_start|>": 100264,
46
- "<|im_end|>": 100265,
47
- "<|im_sep|>": 100266,
48
- "<|dummy_4|>": 100267,
49
- "<|dummy_5|>": 100268,
50
- "<|dummy_6|>": 100269,
51
- "<|dummy_7|>": 100270,
52
- "<|dummy_8|>": 100271,
53
- "<|dummy_9|>": 100272,
54
- "<|dummy_10|>": 100273,
55
- "<|dummy_11|>": 100274,
56
- "<|dummy_12|>": 100275,
57
- "<|endofprompt|>": 100276,
58
- **DUMMY_TOKENS,
59
- }
60
-
61
-
62
- def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
63
- with open(tiktoken_bpe_file, "rb") as f:
64
- contents = f.read()
65
- return {
66
- base64.b64decode(token): int(rank) for token, rank in (line.split() for line in contents.splitlines() if line)
67
- }
68
-
69
-
70
- class Phi4Tokenizer(PreTrainedTokenizer):
71
- """
72
- Construct a Phi-4 tokenizer based on tiktoken.
73
-
74
- Args:
75
- vocab_file (`str`, *optional*, defaults to `None`):
76
- Path to the vocabulary file.
77
- errors (`str`, *optional*, defaults to `'replace'`):
78
- How to handle errors with the tokenizer. Can be `'replace'`, `'ignore'` or `'raise'`.
79
- """
80
-
81
- vocab_files_names = VOCAB_FILES_NAMES
82
- model_input_names: List[str] = ["input_ids", "attention_mask"]
83
- padding_side = "left"
84
-
85
- def __init__(self, vocab_file: Optional[str] = None, errors: str = "replace", **kwargs) -> None:
86
- # `PreTrainedTokenizer.__init__()` calls `_add_tokens()` which checks if
87
- # the token is present in `self.special_tokens`. Thus, we instantiate it before to ensure
88
- # that the special tokens are present in `self.special_tokens`.
89
- self.special_tokens = SPECIAL_TOKENS
90
- self.errors = errors
91
-
92
- super().__init__(**kwargs)
93
-
94
- try:
95
- base = tiktoken.get_encoding("cl100k_base")
96
- except requests.RequestException:
97
- import hashlib
98
-
99
- from transformers.utils import cached_file
100
-
101
- cached_tokenizer_path = cached_file(
102
- "microsoft/phi-4",
103
- "cl100k_base.tiktoken",
104
- _raise_exceptions_for_gated_repo=False,
105
- _raise_exceptions_for_missing_entries=False,
106
- _raise_exceptions_for_connection_errors=False,
107
- )
108
-
109
- tiktoken_cache_dir = os.path.dirname(cached_tokenizer_path)
110
- tiktoken_cache_path = os.path.join(
111
- tiktoken_cache_dir,
112
- hashlib.sha1(
113
- "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken".encode()
114
- ).hexdigest(),
115
- )
116
-
117
- if not os.path.exists(tiktoken_cache_path):
118
- os.rename(cached_tokenizer_path, tiktoken_cache_path)
119
- os.environ["TIKTOKEN_CACHE_DIR"] = tiktoken_cache_dir
120
-
121
- base = tiktoken.get_encoding("cl100k_base")
122
-
123
- if vocab_file is None:
124
- self.mergeable_ranks = base._mergeable_ranks
125
- else:
126
- self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)
127
-
128
- self.pat_str = base._pat_str
129
- self.tokenizer = tiktoken.Encoding(
130
- name="phi4",
131
- pat_str=self.pat_str,
132
- mergeable_ranks=self.mergeable_ranks,
133
- special_tokens=self.special_tokens,
134
- )
135
-
136
- self.decoder: Dict[int, bytes] = {v: k for k, v in self.mergeable_ranks.items()}
137
- self.decoder.update({v: k for k, v in self.special_tokens.items()})
138
-
139
- self.eod_id = self.tokenizer.eot_token
140
- self._eos_token = self._convert_id_to_token(self.eod_id)
141
- self._bos_token = self._eos_token
142
-
143
- def __getstate__(self) -> Dict[str, Union[str, bytes, int]]:
144
- state = self.__dict__.copy()
145
- del state["tokenizer"]
146
- return state
147
-
148
- def __setstate__(self, state: Dict[str, Union[str, bytes, int]]) -> None:
149
- self.__dict__ = state
150
- self.tokenizer = tiktoken.Encoding(
151
- name="phi4",
152
- pat_str=self.pat_str,
153
- mergeable_ranks=self.mergeable_ranks,
154
- special_tokens=self.special_tokens,
155
- )
156
-
157
- def __len__(self) -> int:
158
- return self.tokenizer.n_vocab
159
-
160
- @cached_property
161
- def dummy_token_indices(self) -> List[int]:
162
- # Some additional tokens which are not used are considered as dummy tokens
163
- additional_tokens = ["<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>", "<|endofprompt|>"]
164
-
165
- dummy_token_indices = [index for token, index in self.special_tokens.items() if "dummy_id" in token]
166
- dummy_token_indices.extend([self.special_tokens[token] for token in additional_tokens])
167
-
168
- return sorted(dummy_token_indices)
169
-
170
- @property
171
- def vocab_size(self) -> int:
172
- return self.tokenizer.n_vocab
173
-
174
- @property
175
- def eos_token_id(self) -> int:
176
- return self.eod_id
177
-
178
- @classmethod
179
- def from_pretrained(
180
- cls,
181
- pretrained_model_name_or_path: Union[str, os.PathLike],
182
- *args,
183
- **kwargs,
184
- ) -> "Phi4Tokenizer":
185
- cls_kwargs = kwargs
186
-
187
- tokenization_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
188
- if tokenization_config:
189
- cls_kwargs = {**tokenization_config, **cls_kwargs}
190
- else:
191
- config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
192
- cls_kwargs["model_max_length"] = config.max_position_embeddings
193
-
194
- return cls(**cls_kwargs)
195
-
196
- def _add_tokens(
197
- self,
198
- new_tokens: Union[List[str], List[AddedToken]],
199
- special_tokens: bool = False,
200
- ) -> int:
201
- if not special_tokens and new_tokens:
202
- raise ValueError("Only special tokens can be added to this tokenizer")
203
-
204
- for token in new_tokens:
205
- surface_form = token.content if isinstance(token, AddedToken) else token
206
- if surface_form not in self.special_tokens:
207
- raise ValueError(
208
- "For now, we do not support unknown special tokens\n"
209
- "In the future, if there is a need for this, we can add special tokens to the tokenizer\n"
210
- "starting from rank 100261 - 100263 and then 100266 - 100275.\n"
211
- "And finally, we can re-construct the enc object back\n"
212
- )
213
-
214
- return 0
215
-
216
- def _strip_special_tokens(self, text: str) -> str:
217
- for special_token in self.special_tokens:
218
- pattern = rf"[ \r\n]*{re.escape(special_token)}[ \r\n]*"
219
- text = re.sub(pattern, special_token, text)
220
- return text
221
-
222
- def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
223
- if index in self.decoder:
224
- return self.decoder[index]
225
- return "<|dummy_0|>"
226
-
227
- def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
228
- if token in self.special_tokens:
229
- return self.special_tokens[token]
230
- if token in self.mergeable_ranks:
231
- return self.mergeable_ranks[token]
232
- return 100256
233
-
234
- def _decode(
235
- self,
236
- token_ids: Union[int, List[int]],
237
- skip_special_tokens: bool = False,
238
- errors: str = None,
239
- **kwargs,
240
- ) -> str:
241
- if isinstance(token_ids, int):
242
- token_ids = [token_ids]
243
- if skip_special_tokens:
244
- token_ids = [i for i in token_ids if i < self.eod_id]
245
-
246
- return self.tokenizer.decode(token_ids, errors=errors or self.errors)
247
-
248
- def _tokenize(self, text: str, **kwargs):
249
- raise NotImplementedError
250
-
251
- def convert_tokens_to_ids(self, tokens: Union[bytes, str, List[Union[bytes, str]]]) -> Union[int, List[int]]:
252
- if isinstance(tokens, (str, bytes)):
253
- if tokens in self.special_tokens:
254
- return self.special_tokens[tokens]
255
- return self.mergeable_ranks.get(tokens)
256
-
257
- ids = []
258
- for token in tokens:
259
- ids.append(self.convert_tokens_to_ids(token))
260
-
261
- return ids
262
-
263
- def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
264
- text = ""
265
- temp = b""
266
-
267
- for t in tokens:
268
- if isinstance(t, str):
269
- if temp:
270
- text += temp.decode("utf-8", errors=self.errors)
271
- temp = b""
272
- text += t
273
- elif isinstance(t, bytes):
274
- temp += t
275
- else:
276
- raise TypeError("token should only be of type types or str")
277
-
278
- if temp:
279
- text += temp.decode("utf-8", errors=self.errors)
280
-
281
- return text
282
-
283
- def get_vocab(self) -> Dict[Union[str, bytes], int]:
284
- return {**self.mergeable_ranks, **self.special_tokens}
285
-
286
- def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
287
- file_path = os.path.join(save_directory, "cl100k_base.tiktoken")
288
- with open(file_path, "w") as f:
289
- for token, rank in self.mergeable_ranks.items():
290
- line = base64.b64encode(token).decode("utf-8") + " " + str(rank) + "\n"
291
- f.write(line)
292
-
293
- return (file_path,)
294
-
295
- def tokenize(
296
- self,
297
- text: str,
298
- allowed_special: Union[Set, str] = "all",
299
- disallowed_special: Union[Collection, str] = (),
300
- **kwargs,
301
- ) -> List[Union[bytes, str]]:
302
- text = self._strip_special_tokens(text)
303
- return [
304
- self.decoder[token_id]
305
- for token_id in self.tokenizer.encode(text, allowed_special=allowed_special, disallowed_special=disallowed_special)
306
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,20 +1,780 @@
1
  {
2
- "_commit_hash": null,
3
- "_from_auto": true,
4
- "added_tokens_decoder": {},
5
- "auto_map": {
6
- "AutoTokenizer": [
7
- "tokenization_phi4.Phi4Tokenizer",
8
- null
9
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  },
11
  "bos_token": "<|endoftext|>",
12
  "chat_template": "{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|im_start|>system<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'user') %}{{'<|im_start|>user<|im_sep|>' + message['content'] + '<|im_end|><|im_start|>assistant<|im_sep|>'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>'}}{% endif %}{% endfor %}",
13
- "clean_up_tokenization_spaces": true,
14
  "eos_token": "<|endoftext|>",
15
- "extra_special_tokens": {},
16
  "model_max_length": 16384,
17
  "pad_token": "<|endoftext|>",
18
- "tokenizer_class": "Phi4Tokenizer",
19
- "trust_remote_code": true
20
  }
 
1
  {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "100256": {
5
+ "content": "<|dummy_0|>",
6
+ "lstrip": true,
7
+ "normalized": false,
8
+ "rstrip": true,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "100257": {
13
+ "content": "<|endoftext|>",
14
+ "lstrip": true,
15
+ "normalized": false,
16
+ "rstrip": true,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "100258": {
21
+ "content": "<|fim_prefix|>",
22
+ "lstrip": true,
23
+ "normalized": false,
24
+ "rstrip": true,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "100259": {
29
+ "content": "<|fim_middle|>",
30
+ "lstrip": true,
31
+ "normalized": false,
32
+ "rstrip": true,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "100260": {
37
+ "content": "<|fim_suffix|>",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": true,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "100261": {
45
+ "content": "<|dummy_1|>",
46
+ "lstrip": true,
47
+ "normalized": false,
48
+ "rstrip": true,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "100262": {
53
+ "content": "<|dummy_2|>",
54
+ "lstrip": true,
55
+ "normalized": false,
56
+ "rstrip": true,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "100263": {
61
+ "content": "<|dummy_3|>",
62
+ "lstrip": true,
63
+ "normalized": false,
64
+ "rstrip": true,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "100264": {
69
+ "content": "<|im_start|>",
70
+ "lstrip": true,
71
+ "normalized": false,
72
+ "rstrip": true,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "100265": {
77
+ "content": "<|im_end|>",
78
+ "lstrip": true,
79
+ "normalized": false,
80
+ "rstrip": true,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "100266": {
85
+ "content": "<|im_sep|>",
86
+ "lstrip": true,
87
+ "normalized": false,
88
+ "rstrip": true,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "100267": {
93
+ "content": "<|dummy_4|>",
94
+ "lstrip": true,
95
+ "normalized": false,
96
+ "rstrip": true,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "100268": {
101
+ "content": "<|dummy_5|>",
102
+ "lstrip": true,
103
+ "normalized": false,
104
+ "rstrip": true,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "100269": {
109
+ "content": "<|dummy_6|>",
110
+ "lstrip": true,
111
+ "normalized": false,
112
+ "rstrip": true,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "100270": {
117
+ "content": "<|dummy_7|>",
118
+ "lstrip": true,
119
+ "normalized": false,
120
+ "rstrip": true,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "100271": {
125
+ "content": "<|dummy_8|>",
126
+ "lstrip": true,
127
+ "normalized": false,
128
+ "rstrip": true,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "100272": {
133
+ "content": "<|dummy_9|>",
134
+ "lstrip": true,
135
+ "normalized": false,
136
+ "rstrip": true,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "100273": {
141
+ "content": "<|dummy_10|>",
142
+ "lstrip": true,
143
+ "normalized": false,
144
+ "rstrip": true,
145
+ "single_word": false,
146
+ "special": true
147
+ },
148
+ "100274": {
149
+ "content": "<|dummy_11|>",
150
+ "lstrip": true,
151
+ "normalized": false,
152
+ "rstrip": true,
153
+ "single_word": false,
154
+ "special": true
155
+ },
156
+ "100275": {
157
+ "content": "<|dummy_12|>",
158
+ "lstrip": true,
159
+ "normalized": false,
160
+ "rstrip": true,
161
+ "single_word": false,
162
+ "special": true
163
+ },
164
+ "100276": {
165
+ "content": "<|endofprompt|>",
166
+ "lstrip": true,
167
+ "normalized": false,
168
+ "rstrip": true,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "100277": {
173
+ "content": "<|dummy_13|>",
174
+ "lstrip": true,
175
+ "normalized": false,
176
+ "rstrip": true,
177
+ "single_word": false,
178
+ "special": true
179
+ },
180
+ "100278": {
181
+ "content": "<|dummy_14|>",
182
+ "lstrip": true,
183
+ "normalized": false,
184
+ "rstrip": true,
185
+ "single_word": false,
186
+ "special": true
187
+ },
188
+ "100279": {
189
+ "content": "<|dummy_15|>",
190
+ "lstrip": true,
191
+ "normalized": false,
192
+ "rstrip": true,
193
+ "single_word": false,
194
+ "special": true
195
+ },
196
+ "100280": {
197
+ "content": "<|dummy_16|>",
198
+ "lstrip": true,
199
+ "normalized": false,
200
+ "rstrip": true,
201
+ "single_word": false,
202
+ "special": true
203
+ },
204
+ "100281": {
205
+ "content": "<|dummy_17|>",
206
+ "lstrip": true,
207
+ "normalized": false,
208
+ "rstrip": true,
209
+ "single_word": false,
210
+ "special": true
211
+ },
212
+ "100282": {
213
+ "content": "<|dummy_18|>",
214
+ "lstrip": true,
215
+ "normalized": false,
216
+ "rstrip": true,
217
+ "single_word": false,
218
+ "special": true
219
+ },
220
+ "100283": {
221
+ "content": "<|dummy_19|>",
222
+ "lstrip": true,
223
+ "normalized": false,
224
+ "rstrip": true,
225
+ "single_word": false,
226
+ "special": true
227
+ },
228
+ "100284": {
229
+ "content": "<|dummy_20|>",
230
+ "lstrip": true,
231
+ "normalized": false,
232
+ "rstrip": true,
233
+ "single_word": false,
234
+ "special": true
235
+ },
236
+ "100285": {
237
+ "content": "<|dummy_21|>",
238
+ "lstrip": true,
239
+ "normalized": false,
240
+ "rstrip": true,
241
+ "single_word": false,
242
+ "special": true
243
+ },
244
+ "100286": {
245
+ "content": "<|dummy_22|>",
246
+ "lstrip": true,
247
+ "normalized": false,
248
+ "rstrip": true,
249
+ "single_word": false,
250
+ "special": true
251
+ },
252
+ "100287": {
253
+ "content": "<|dummy_23|>",
254
+ "lstrip": true,
255
+ "normalized": false,
256
+ "rstrip": true,
257
+ "single_word": false,
258
+ "special": true
259
+ },
260
+ "100288": {
261
+ "content": "<|dummy_24|>",
262
+ "lstrip": true,
263
+ "normalized": false,
264
+ "rstrip": true,
265
+ "single_word": false,
266
+ "special": true
267
+ },
268
+ "100289": {
269
+ "content": "<|dummy_25|>",
270
+ "lstrip": true,
271
+ "normalized": false,
272
+ "rstrip": true,
273
+ "single_word": false,
274
+ "special": true
275
+ },
276
+ "100290": {
277
+ "content": "<|dummy_26|>",
278
+ "lstrip": true,
279
+ "normalized": false,
280
+ "rstrip": true,
281
+ "single_word": false,
282
+ "special": true
283
+ },
284
+ "100291": {
285
+ "content": "<|dummy_27|>",
286
+ "lstrip": true,
287
+ "normalized": false,
288
+ "rstrip": true,
289
+ "single_word": false,
290
+ "special": true
291
+ },
292
+ "100292": {
293
+ "content": "<|dummy_28|>",
294
+ "lstrip": true,
295
+ "normalized": false,
296
+ "rstrip": true,
297
+ "single_word": false,
298
+ "special": true
299
+ },
300
+ "100293": {
301
+ "content": "<|dummy_29|>",
302
+ "lstrip": true,
303
+ "normalized": false,
304
+ "rstrip": true,
305
+ "single_word": false,
306
+ "special": true
307
+ },
308
+ "100294": {
309
+ "content": "<|dummy_30|>",
310
+ "lstrip": true,
311
+ "normalized": false,
312
+ "rstrip": true,
313
+ "single_word": false,
314
+ "special": true
315
+ },
316
+ "100295": {
317
+ "content": "<|dummy_31|>",
318
+ "lstrip": true,
319
+ "normalized": false,
320
+ "rstrip": true,
321
+ "single_word": false,
322
+ "special": true
323
+ },
324
+ "100296": {
325
+ "content": "<|dummy_32|>",
326
+ "lstrip": true,
327
+ "normalized": false,
328
+ "rstrip": true,
329
+ "single_word": false,
330
+ "special": true
331
+ },
332
+ "100297": {
333
+ "content": "<|dummy_33|>",
334
+ "lstrip": true,
335
+ "normalized": false,
336
+ "rstrip": true,
337
+ "single_word": false,
338
+ "special": true
339
+ },
340
+ "100298": {
341
+ "content": "<|dummy_34|>",
342
+ "lstrip": true,
343
+ "normalized": false,
344
+ "rstrip": true,
345
+ "single_word": false,
346
+ "special": true
347
+ },
348
+ "100299": {
349
+ "content": "<|dummy_35|>",
350
+ "lstrip": true,
351
+ "normalized": false,
352
+ "rstrip": true,
353
+ "single_word": false,
354
+ "special": true
355
+ },
356
+ "100300": {
357
+ "content": "<|dummy_36|>",
358
+ "lstrip": true,
359
+ "normalized": false,
360
+ "rstrip": true,
361
+ "single_word": false,
362
+ "special": true
363
+ },
364
+ "100301": {
365
+ "content": "<|dummy_37|>",
366
+ "lstrip": true,
367
+ "normalized": false,
368
+ "rstrip": true,
369
+ "single_word": false,
370
+ "special": true
371
+ },
372
+ "100302": {
373
+ "content": "<|dummy_38|>",
374
+ "lstrip": true,
375
+ "normalized": false,
376
+ "rstrip": true,
377
+ "single_word": false,
378
+ "special": true
379
+ },
380
+ "100303": {
381
+ "content": "<|dummy_39|>",
382
+ "lstrip": true,
383
+ "normalized": false,
384
+ "rstrip": true,
385
+ "single_word": false,
386
+ "special": true
387
+ },
388
+ "100304": {
389
+ "content": "<|dummy_40|>",
390
+ "lstrip": true,
391
+ "normalized": false,
392
+ "rstrip": true,
393
+ "single_word": false,
394
+ "special": true
395
+ },
396
+ "100305": {
397
+ "content": "<|dummy_41|>",
398
+ "lstrip": true,
399
+ "normalized": false,
400
+ "rstrip": true,
401
+ "single_word": false,
402
+ "special": true
403
+ },
404
+ "100306": {
405
+ "content": "<|dummy_42|>",
406
+ "lstrip": true,
407
+ "normalized": false,
408
+ "rstrip": true,
409
+ "single_word": false,
410
+ "special": true
411
+ },
412
+ "100307": {
413
+ "content": "<|dummy_43|>",
414
+ "lstrip": true,
415
+ "normalized": false,
416
+ "rstrip": true,
417
+ "single_word": false,
418
+ "special": true
419
+ },
420
+ "100308": {
421
+ "content": "<|dummy_44|>",
422
+ "lstrip": true,
423
+ "normalized": false,
424
+ "rstrip": true,
425
+ "single_word": false,
426
+ "special": true
427
+ },
428
+ "100309": {
429
+ "content": "<|dummy_45|>",
430
+ "lstrip": true,
431
+ "normalized": false,
432
+ "rstrip": true,
433
+ "single_word": false,
434
+ "special": true
435
+ },
436
+ "100310": {
437
+ "content": "<|dummy_46|>",
438
+ "lstrip": true,
439
+ "normalized": false,
440
+ "rstrip": true,
441
+ "single_word": false,
442
+ "special": true
443
+ },
444
+ "100311": {
445
+ "content": "<|dummy_47|>",
446
+ "lstrip": true,
447
+ "normalized": false,
448
+ "rstrip": true,
449
+ "single_word": false,
450
+ "special": true
451
+ },
452
+ "100312": {
453
+ "content": "<|dummy_48|>",
454
+ "lstrip": true,
455
+ "normalized": false,
456
+ "rstrip": true,
457
+ "single_word": false,
458
+ "special": true
459
+ },
460
+ "100313": {
461
+ "content": "<|dummy_49|>",
462
+ "lstrip": true,
463
+ "normalized": false,
464
+ "rstrip": true,
465
+ "single_word": false,
466
+ "special": true
467
+ },
468
+ "100314": {
469
+ "content": "<|dummy_50|>",
470
+ "lstrip": true,
471
+ "normalized": false,
472
+ "rstrip": true,
473
+ "single_word": false,
474
+ "special": true
475
+ },
476
+ "100315": {
477
+ "content": "<|dummy_51|>",
478
+ "lstrip": true,
479
+ "normalized": false,
480
+ "rstrip": true,
481
+ "single_word": false,
482
+ "special": true
483
+ },
484
+ "100316": {
485
+ "content": "<|dummy_52|>",
486
+ "lstrip": true,
487
+ "normalized": false,
488
+ "rstrip": true,
489
+ "single_word": false,
490
+ "special": true
491
+ },
492
+ "100317": {
493
+ "content": "<|dummy_53|>",
494
+ "lstrip": true,
495
+ "normalized": false,
496
+ "rstrip": true,
497
+ "single_word": false,
498
+ "special": true
499
+ },
500
+ "100318": {
501
+ "content": "<|dummy_54|>",
502
+ "lstrip": true,
503
+ "normalized": false,
504
+ "rstrip": true,
505
+ "single_word": false,
506
+ "special": true
507
+ },
508
+ "100319": {
509
+ "content": "<|dummy_55|>",
510
+ "lstrip": true,
511
+ "normalized": false,
512
+ "rstrip": true,
513
+ "single_word": false,
514
+ "special": true
515
+ },
516
+ "100320": {
517
+ "content": "<|dummy_56|>",
518
+ "lstrip": true,
519
+ "normalized": false,
520
+ "rstrip": true,
521
+ "single_word": false,
522
+ "special": true
523
+ },
524
+ "100321": {
525
+ "content": "<|dummy_57|>",
526
+ "lstrip": true,
527
+ "normalized": false,
528
+ "rstrip": true,
529
+ "single_word": false,
530
+ "special": true
531
+ },
532
+ "100322": {
533
+ "content": "<|dummy_58|>",
534
+ "lstrip": true,
535
+ "normalized": false,
536
+ "rstrip": true,
537
+ "single_word": false,
538
+ "special": true
539
+ },
540
+ "100323": {
541
+ "content": "<|dummy_59|>",
542
+ "lstrip": true,
543
+ "normalized": false,
544
+ "rstrip": true,
545
+ "single_word": false,
546
+ "special": true
547
+ },
548
+ "100324": {
549
+ "content": "<|dummy_60|>",
550
+ "lstrip": true,
551
+ "normalized": false,
552
+ "rstrip": true,
553
+ "single_word": false,
554
+ "special": true
555
+ },
556
+ "100325": {
557
+ "content": "<|dummy_61|>",
558
+ "lstrip": true,
559
+ "normalized": false,
560
+ "rstrip": true,
561
+ "single_word": false,
562
+ "special": true
563
+ },
564
+ "100326": {
565
+ "content": "<|dummy_62|>",
566
+ "lstrip": true,
567
+ "normalized": false,
568
+ "rstrip": true,
569
+ "single_word": false,
570
+ "special": true
571
+ },
572
+ "100327": {
573
+ "content": "<|dummy_63|>",
574
+ "lstrip": true,
575
+ "normalized": false,
576
+ "rstrip": true,
577
+ "single_word": false,
578
+ "special": true
579
+ },
580
+ "100328": {
581
+ "content": "<|dummy_64|>",
582
+ "lstrip": true,
583
+ "normalized": false,
584
+ "rstrip": true,
585
+ "single_word": false,
586
+ "special": true
587
+ },
588
+ "100329": {
589
+ "content": "<|dummy_65|>",
590
+ "lstrip": true,
591
+ "normalized": false,
592
+ "rstrip": true,
593
+ "single_word": false,
594
+ "special": true
595
+ },
596
+ "100330": {
597
+ "content": "<|dummy_66|>",
598
+ "lstrip": true,
599
+ "normalized": false,
600
+ "rstrip": true,
601
+ "single_word": false,
602
+ "special": true
603
+ },
604
+ "100331": {
605
+ "content": "<|dummy_67|>",
606
+ "lstrip": true,
607
+ "normalized": false,
608
+ "rstrip": true,
609
+ "single_word": false,
610
+ "special": true
611
+ },
612
+ "100332": {
613
+ "content": "<|dummy_68|>",
614
+ "lstrip": true,
615
+ "normalized": false,
616
+ "rstrip": true,
617
+ "single_word": false,
618
+ "special": true
619
+ },
620
+ "100333": {
621
+ "content": "<|dummy_69|>",
622
+ "lstrip": true,
623
+ "normalized": false,
624
+ "rstrip": true,
625
+ "single_word": false,
626
+ "special": true
627
+ },
628
+ "100334": {
629
+ "content": "<|dummy_70|>",
630
+ "lstrip": true,
631
+ "normalized": false,
632
+ "rstrip": true,
633
+ "single_word": false,
634
+ "special": true
635
+ },
636
+ "100335": {
637
+ "content": "<|dummy_71|>",
638
+ "lstrip": true,
639
+ "normalized": false,
640
+ "rstrip": true,
641
+ "single_word": false,
642
+ "special": true
643
+ },
644
+ "100336": {
645
+ "content": "<|dummy_72|>",
646
+ "lstrip": true,
647
+ "normalized": false,
648
+ "rstrip": true,
649
+ "single_word": false,
650
+ "special": true
651
+ },
652
+ "100337": {
653
+ "content": "<|dummy_73|>",
654
+ "lstrip": true,
655
+ "normalized": false,
656
+ "rstrip": true,
657
+ "single_word": false,
658
+ "special": true
659
+ },
660
+ "100338": {
661
+ "content": "<|dummy_74|>",
662
+ "lstrip": true,
663
+ "normalized": false,
664
+ "rstrip": true,
665
+ "single_word": false,
666
+ "special": true
667
+ },
668
+ "100339": {
669
+ "content": "<|dummy_75|>",
670
+ "lstrip": true,
671
+ "normalized": false,
672
+ "rstrip": true,
673
+ "single_word": false,
674
+ "special": true
675
+ },
676
+ "100340": {
677
+ "content": "<|dummy_76|>",
678
+ "lstrip": true,
679
+ "normalized": false,
680
+ "rstrip": true,
681
+ "single_word": false,
682
+ "special": true
683
+ },
684
+ "100341": {
685
+ "content": "<|dummy_77|>",
686
+ "lstrip": true,
687
+ "normalized": false,
688
+ "rstrip": true,
689
+ "single_word": false,
690
+ "special": true
691
+ },
692
+ "100342": {
693
+ "content": "<|dummy_78|>",
694
+ "lstrip": true,
695
+ "normalized": false,
696
+ "rstrip": true,
697
+ "single_word": false,
698
+ "special": true
699
+ },
700
+ "100343": {
701
+ "content": "<|dummy_79|>",
702
+ "lstrip": true,
703
+ "normalized": false,
704
+ "rstrip": true,
705
+ "single_word": false,
706
+ "special": true
707
+ },
708
+ "100344": {
709
+ "content": "<|dummy_80|>",
710
+ "lstrip": true,
711
+ "normalized": false,
712
+ "rstrip": true,
713
+ "single_word": false,
714
+ "special": true
715
+ },
716
+ "100345": {
717
+ "content": "<|dummy_81|>",
718
+ "lstrip": true,
719
+ "normalized": false,
720
+ "rstrip": true,
721
+ "single_word": false,
722
+ "special": true
723
+ },
724
+ "100346": {
725
+ "content": "<|dummy_82|>",
726
+ "lstrip": true,
727
+ "normalized": false,
728
+ "rstrip": true,
729
+ "single_word": false,
730
+ "special": true
731
+ },
732
+ "100347": {
733
+ "content": "<|dummy_83|>",
734
+ "lstrip": true,
735
+ "normalized": false,
736
+ "rstrip": true,
737
+ "single_word": false,
738
+ "special": true
739
+ },
740
+ "100348": {
741
+ "content": "<|dummy_84|>",
742
+ "lstrip": true,
743
+ "normalized": false,
744
+ "rstrip": true,
745
+ "single_word": false,
746
+ "special": true
747
+ },
748
+ "100349": {
749
+ "content": "<|dummy_85|>",
750
+ "lstrip": true,
751
+ "normalized": false,
752
+ "rstrip": true,
753
+ "single_word": false,
754
+ "special": true
755
+ },
756
+ "100350": {
757
+ "content": "<|dummy_86|>",
758
+ "lstrip": true,
759
+ "normalized": false,
760
+ "rstrip": true,
761
+ "single_word": false,
762
+ "special": true
763
+ },
764
+ "100351": {
765
+ "content": "<|dummy_87|>",
766
+ "lstrip": true,
767
+ "normalized": false,
768
+ "rstrip": true,
769
+ "single_word": false,
770
+ "special": true
771
+ }
772
  },
773
  "bos_token": "<|endoftext|>",
774
  "chat_template": "{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|im_start|>system<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'user') %}{{'<|im_start|>user<|im_sep|>' + message['content'] + '<|im_end|><|im_start|>assistant<|im_sep|>'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>'}}{% endif %}{% endfor %}",
775
+ "clean_up_tokenization_spaces": false,
776
  "eos_token": "<|endoftext|>",
 
777
  "model_max_length": 16384,
778
  "pad_token": "<|endoftext|>",
779
+ "tokenizer_class": "GPT2Tokenizer"
 
780
  }
vocab.json ADDED
The diff for this file is too large to render. See raw diff