# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Megatron tokenizers."""
import logging
import os
from pathlib import Path
from typing import (
    AbstractSet,
    Collection,
    Dict,
    Iterator,
    List,
    Literal,
    Optional,
    Sequence,
    Union,
    cast,
)

import tiktoken
from tiktoken.load import load_tiktoken_bpe
from tokenizers import AddedToken
from transformers.tokenization_utils import PreTrainedTokenizer

logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "tiktoken.model"}
class TikTokenTokenizer(PreTrainedTokenizer):
"""
Tokenizing and encoding/decoding text using the Tiktoken tokenizer.
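
    Example (illustrative sketch; assumes a local ``tiktoken.model`` BPE file,
    which is not shipped with this module):

    >>> tok = TikTokenTokenizer("tiktoken.model")  # doctest: +SKIP
    >>> ids = tok.encode("hello world", bos=True, eos=True)  # doctest: +SKIP
    >>> tok.decode(ids)  # doctest: +SKIP
    'hello world'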
"""
special_tokens: Dict[str, int]
num_reserved_special_tokens = 293 + 128
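    # GPT-style pre-tokenization regex: splits contractions, letter runs,
    # single digits, punctuation runs, and whitespace.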
pat_str = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
vocab_files_names = VOCAB_FILES_NAMES
def __init__(
self,
vocab_file,
bos_token: Union[str, AddedToken] = "[BOS]",
eos_token: Union[str, AddedToken] = "[EOS]",
unk_token: Union[str, AddedToken] = "[UNK]",
pad_token: Union[str, AddedToken] = "[PAD]",
additional_special_tokens: Optional[List[str]] = None,
added_tokens_decoder: Optional[dict] = None,
**kwargs,
):
"""
Initializes the Tokenizer with a Tiktoken model.
Args:
model_path (str): The path to the Tiktoken model file.
"""
assert os.path.isfile(vocab_file), vocab_file
mergeable_ranks = load_tiktoken_bpe(vocab_file)
num_base_tokens = len(mergeable_ranks)
used_special_tokens = [
"[BOS]",
"[EOS]",
"<|im_msg_end|>", # 0
"<|im_user_msg_start|>", # 1
"<|im_assistant_msg_start|>", # 2
"<|reserved_token_0|>", # 3
"<|reserved_token_1|>",
"<|reserved_token_2|>",
"<|reserved_token_3|>", # 4
"[EOT]",
"<|reserved_token_4|>", # 5
"<|reserved_token_5|>", # 6
"<|reserved_token_6|>", # 7
"<|reserved_token_7|>", # 8
"<|reserved_token_8|>", # 9
"<|reserved_token_9|>", # 10
"<|reserved_token_10|>", # 11
"<|reserved_token_11|>", # 12
"<|im_media_begin|>", # 13
"<|reserved_token_12|>", # 14
"<|im_media_end|>", # 15
"<|reserved_token_13|>", # 16
"<|reserved_token_14|>", # 17
"<|im_kimia_text_blank|>", # 18
"<|im_kimia_text_eos|>", # 19
"<|reserved_token_15|>", # 20
"<|reserved_token_16|>", # 21
"<|im_kimia_user_msg_start|>", # 22
"<|im_kimia_assistant_msg_start|>", # 23
"<|reserved_token_17|>", # 24
"<|reserved_token_18|>", # 25
"<|reserved_token_19|>", # 26
"<|im_kimia_speech_ct_id|>", # 27
"<|im_kimia_speech_ctd_id|>", # 28
]
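        # reserved_token_0 through reserved_token_19 are already named above,
        # so autogeneration resumes at index 20; the arithmetic keeps the
        # combined count equal to num_reserved_special_tokens (293 + 128 = 421).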
autoset_special_tokens = [
f"<|reserved_token_{i}|>"
for i in range(
20, self.num_reserved_special_tokens - len(used_special_tokens) + 20
)
]
special_tokens = used_special_tokens + autoset_special_tokens
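        # Special-token IDs are assigned contiguously after the base BPE vocab.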
self.special_tokens = {
token: num_base_tokens + i for i, token in enumerate(special_tokens)
}
self.model = tiktoken.Encoding(
name=Path(vocab_file).name,
pat_str=self.pat_str,
mergeable_ranks=mergeable_ranks,
special_tokens=self.special_tokens,
)
logger.info(f"Reloaded tiktoken model from {vocab_file}")
self.n_words: int = self.model.n_vocab
# BOS / EOS token IDs
self.bos_token = "[BOS]"
self.bos_id: int = self.special_tokens["[BOS]"]
self.eos_token = "[EOS]"
self.eos_id: int = self.special_tokens["[EOS]"]
        # Use the last special token as the pad token; the second-to-last is
        # the unk token.
        self.pad_token: str = special_tokens[-1]
        self.pad_id: int = self.special_tokens[self.pad_token]
        self.unk_token: str = special_tokens[-2]
        self.unk_id: int = self.special_tokens[self.unk_token]
self.stop_tokens = {
self.special_tokens["[EOS]"],
self.special_tokens["[EOT]"],
}
logger.info(
f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
)
def encode(
self,
s: str,
*,
bos: bool,
eos: bool,
allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
disallowed_special: Union[Literal["all"], Collection[str]] = (),
) -> List[int]:
"""
Encodes a string into a list of token IDs.
Args:
s (str): The input string to be encoded.
bos (bool): Whether to prepend the beginning-of-sequence token.
eos (bool): Whether to append the end-of-sequence token.
allowed_tokens ("all"|set[str]): allowed special tokens in string
disallowed_tokens ("all"|set[str]): special tokens that raise an error when in string
Returns:
list[int]: A list of token IDs.
By default, setting disallowed_special=() encodes a string by ignoring
special tokens. Specifically:
- Setting `disallowed_special` to () will cause all text corresponding
to special tokens to be encoded as natural text (insteading of raising
an error).
- Setting `allowed_special` to "all" will treat all text corresponding
to special tokens to be encoded as special tokens.
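
        Example (illustrative; actual IDs depend on the loaded vocabulary):

        >>> ids = tokenizer.encode("hello world", bos=True, eos=False)  # doctest: +SKIP
        >>> ids[0] == tokenizer.bos_id  # doctest: +SKIP
        True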
"""
assert type(s) is str
# The tiktoken tokenizer can handle <=400k chars without
# pyo3_runtime.PanicException.
TIKTOKEN_MAX_ENCODE_CHARS = 400_000
# https://github.com/openai/tiktoken/issues/195
# Here we iterate over subsequences and split if we exceed the limit
# of max consecutive non-whitespace or whitespace characters.
MAX_NO_WHITESPACES_CHARS = 25_000
substrs = (
substr
for i in range(0, len(s), TIKTOKEN_MAX_ENCODE_CHARS)
for substr in self._split_whitespaces_or_nonwhitespaces(
s[i : i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
)
)
t: List[int] = []
for substr in substrs:
t.extend(
self.model.encode(
substr,
allowed_special=allowed_special,
disallowed_special=disallowed_special,
)
)
if bos:
t.insert(0, self.bos_id)
if eos:
t.append(self.eos_id)
return t
def decode(self, t: Sequence[int]) -> str:
"""
Decodes a list of token IDs into a string.
Args:
t (List[int]): The list of token IDs to be decoded.
Returns:
str: The decoded string.
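
        Example (illustrative; assumes ``tokenizer`` is an initialized instance):

        >>> tokenizer.decode(tokenizer.encode("hi", bos=False, eos=False))  # doctest: +SKIP
        'hi'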
"""
# Typecast is safe here. Tiktoken doesn't do anything list-related with the sequence.
return self.model.decode(cast(List[int], t))
@staticmethod
def _split_whitespaces_or_nonwhitespaces(
s: str, max_consecutive_slice_len: int
) -> Iterator[str]:
"""
Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
consecutive whitespaces or consecutive non-whitespaces.
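
        Example (pure function, so the output below is deterministic):

        >>> list(TikTokenTokenizer._split_whitespaces_or_nonwhitespaces("aaaaa", 2))
        ['aa', 'aa', 'a']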
"""
current_slice_len = 0
current_slice_is_space = s[0].isspace() if len(s) > 0 else False
slice_start = 0
for i in range(len(s)):
is_now_space = s[i].isspace()
if current_slice_is_space ^ is_now_space:
current_slice_len = 1
current_slice_is_space = is_now_space
else:
current_slice_len += 1
if current_slice_len > max_consecutive_slice_len:
yield s[slice_start:i]
slice_start = i
current_slice_len = 1
yield s[slice_start:]
""" ----- Below are the abstract methods required by megatron ----- """
@property
def vocab_size(self):
return self.n_words
@property
def vocab(self):
if hasattr(self, "str_vocab"):
return self.str_vocab
self.str_vocab = {}
# convert mergeable_ranks from bytes to string
utf8_num, unicode_num = 0, 0
for byte_key, index in self.model._mergeable_ranks.items():
try:
str_key = byte_key.decode("utf-8")
utf8_num += 1
            except UnicodeDecodeError:
                # Use backslashreplace so each undecodable byte sequence still
                # gets a distinct string key (see
                # https://docs.python.org/3/howto/unicode.html). This vocab is
                # only used for offline processing, so lossy decoding is fine.
                str_key = byte_key.decode("utf-8", "backslashreplace") + "_unicode_"
                unicode_num += 1
self.str_vocab[str_key] = index
logger.info(f"num utf8: {utf8_num}, num unicode: {unicode_num}")
# add all special tokens to the dictionary
self.str_vocab.update(self.model._special_tokens)
assert len(self.str_vocab) == self.vocab_size
return self.str_vocab
@property
def inv_vocab(self):
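        # Rebuilt on every access; cache the result if used in a hot loop.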
return {v: k for k, v in self.vocab.items()}
    def tokenize(self, text, eos=True):
        # BOS: always prepend the bos token.
        # EOS: usually True when tokenizing a full sequence;
        #      set to False only when running inference.
        return self.encode(text, bos=True, eos=eos)
def detokenize(self, tokens):
        # Convert a tensor to a list if needed.
if not isinstance(tokens, list):
tokens = tokens.tolist()
return self.decode(tokens)
@property
def eod(self):
return self.eos_id
def bod(self):
return self.bos_id
@property
def msk_start_id(self):
return self.msk_start
@property
def msk_end_id(self):
return self.msk_end
def _get_index_2_bytes(self):
if hasattr(self, "index_2_bytes"):
return self.index_2_bytes
# use array rather than dict for faster access
self.index_2_bytes = [0] * self.model.n_vocab
for byte_key, index in self.model._mergeable_ranks.items():
self.index_2_bytes[index] = len(byte_key)
        for _, index in self.model._special_tokens.items():
            # Special tokens are atomic markers rather than byte sequences;
            # count each one as a single byte for this bookkeeping.
            self.index_2_bytes[index] = 1
return self.index_2_bytes
    def get_array_bytes(self, array):
        """Returns the total number of underlying bytes for a list of token IDs."""
        index_2_bytes = self._get_index_2_bytes()
        return sum(index_2_bytes[i] for i in array)
@property
def eos_token_id(self):
return self.eos_id
@property
def pad_token_id(self):
return self.pad_id
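

if __name__ == "__main__":
    # Minimal usage sketch (illustrative; "tiktoken.model" is a placeholder
    # path to a locally available Tiktoken BPE vocabulary file).
    tok = TikTokenTokenizer("tiktoken.model")
    ids = tok.tokenize("Hello, world!", eos=True)
    print(ids)                        # token IDs, wrapped in BOS/EOS
    print(tok.detokenize(ids))       # decoded text, including BOS/EOS strings
    print(tok.get_array_bytes(ids))  # total bytes covered by these tokens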