Update tokenization.py

tokenization.py  (+87 -18)
@@ -1,3 +1,4 @@
+from collections import defaultdict
 import logging
 import re
 from typing import Optional
@@ -24,6 +25,44 @@ _INFINITE = int(1e12) # infinite token length for no-truncation
 logger = logging.getLogger("kanana-1.5-v")
 
 
+class AttrDict(dict):
+    __slots__ = ()
+
+    def __getattr__(self, name):
+        try:
+            val = self[name]
+        except KeyError:
+            raise AttributeError(name) from None
+
+        if isinstance(val, dict) and not isinstance(val, AttrDict):
+            val = AttrDict(val)
+            self[name] = val
+        return val
+
+    def __setattr__(self, name, value):
+        if name.startswith('_'):
+            return super().__setattr__(name, value)
+        if isinstance(value, dict) and not isinstance(value, AttrDict):
+            value = AttrDict(value)
+        self[name] = value
+
+    def __delattr__(self, name):
+        try:
+            del self[name]
+        except KeyError:
+            raise AttributeError(name) from None
+
+
+def to_attrdict(obj):
+    if isinstance(obj, dict) and not isinstance(obj, AttrDict):
+        return AttrDict({k: to_attrdict(v) for k, v in obj.items()})
+    if isinstance(obj, list):
+        return [to_attrdict(x) for x in obj]
+    if isinstance(obj, tuple):
+        return tuple(to_attrdict(x) for x in obj)
+    return obj
+
+
 def _pad_trunc(
     x: list[list[int]],
     padding: str,
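A quick sketch of how the new AttrDict / to_attrdict helpers behave. The dict contents below are invented purely for illustration, and the import path assumes this file is importable as a module named tokenization:

from tokenization import AttrDict, to_attrdict  # import path is an assumption

# Invented example data, only to show attribute-style access.
enc = to_attrdict({"input_ids": [1, 2, 3], "meta": {"num_chunks": 2}})

assert enc.input_ids == [1, 2, 3]      # attribute access on a plain key
assert enc["input_ids"] == [1, 2, 3]   # normal dict access still works
assert isinstance(enc.meta, AttrDict)  # nested dicts are converted recursively
assert enc.meta.num_chunks == 2

enc.labels = {"mask": [0, 1]}          # __setattr__ wraps plain dict values too
assert isinstance(enc.labels, AttrDict)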
@@ -101,20 +140,6 @@ class KananaVTokenizerMixin:
 
         return repeated_tokens
 
-    def encode_text_only(self, prompt: str, add_special_tokens: bool = False) -> list:
-        # Text-only Data
-        # split prompt into chunks by role tokens
-        tokens_to_split = [_AI, _HUMAN]
-        pattern = "|".join(map(re.escape, tokens_to_split))
-        chunk_strs = re.split(f"({pattern})", prompt)
-        chunk_strs = [x for x in chunk_strs if len(x) > 0]
-
-        enc_chunk = []
-        for idx, chunk_str in enumerate(chunk_strs):
-            curr_chunk = self(chunk_str, add_special_tokens=False)["input_ids"]
-            enc_chunk += curr_chunk
-        return enc_chunk
-
     def encode_prompt(
         self, prompt: str, max_length: int | None = None, image_meta: dict | None = None
     ) -> dict:
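The chunk splitting that encode_text_only used to do now lives in the tokenizer __call__ overrides in the next hunk. A minimal sketch of the underlying re.split trick, with made-up role-token strings (the real _AI / _HUMAN values are defined elsewhere in tokenization.py and are not shown in this diff):

import re

_AI, _HUMAN = "<|assistant|>", "<|user|>"  # placeholder values

pattern = "|".join(map(re.escape, [_AI, _HUMAN]))
prompt = f"{_HUMAN}Hi there{_AI}Hello!"

# The capturing group keeps the role tokens themselves as chunks,
# so nothing is lost when the pieces are re-encoded and concatenated.
chunk_strs = [x for x in re.split(f"({pattern})", prompt) if len(x) > 0]
print(chunk_strs)  # ['<|user|>', 'Hi there', '<|assistant|>', 'Hello!']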
@@ -228,13 +253,57 @@ class KananaVTokenizer(PreTrainedTokenizer, KananaVTokenizerMixin):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-    def
-
+    def __call__(self, text, *args, **kwargs):
+        assert isinstance(text, str), "Only str is supported for tokenization."
+
+        # split prompt into chunks by role tokens: text (str) -> chunk_strs (list)
+        tokens_to_split = [_AI, _HUMAN]
+        pattern = "|".join(map(re.escape, tokens_to_split))
+        if re.search(pattern, text):
+            chunk_strs = re.split(f"({pattern})", text)
+            chunk_strs = [x for x in chunk_strs if len(x) > 0]
+
+            # encode chunk strs
+            kwargs["add_special_tokens"] = False
+            encodings = defaultdict(list)
+            for chunk_str in chunk_strs:
+                encoding = super().__call__(chunk_str, *args, **kwargs)
+                for k, v in encoding.items():
+                    encodings[k].extend(v)
+            encodings = to_attrdict(encodings)
+            return encodings
+        else:
+            return super().__call__(text, *args, **kwargs)
+
+    def encode(self, *args, **kwargs):
+        return self.__call__(*args, **kwargs)["input_ids"]
 
 
 class KananaVTokenizerFast(PreTrainedTokenizerFast, KananaVTokenizerMixin):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-    def
-
+    def __call__(self, text, *args, **kwargs):
+        assert isinstance(text, str), "Only str is supported for fast tokenization."
+
+        # split prompt into chunks by role tokens: text (str) -> chunk_strs (list)
+        tokens_to_split = [_AI, _HUMAN]
+        pattern = "|".join(map(re.escape, tokens_to_split))
+        if re.search(pattern, text):
+            chunk_strs = re.split(f"({pattern})", text)
+            chunk_strs = [x for x in chunk_strs if len(x) > 0]
+
+            # encode chunk strs
+            kwargs["add_special_tokens"] = False
+            encodings = defaultdict(list)
+            for chunk_str in chunk_strs:
+                encoding = super().__call__(chunk_str, *args, **kwargs)
+                for k, v in encoding.items():
+                    encodings[k].extend(v)
+            encodings = to_attrdict(encodings)
+            return encodings
+        else:
+            return super().__call__(text, *args, **kwargs)
+
+    def encode(self, *args, **kwargs):
+        return self.__call__(*args, **kwargs)["input_ids"]
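A hypothetical usage sketch of the new __call__ / encode overrides; the checkpoint path and the role-token strings below are placeholders, not values taken from this repository:

from tokenization import KananaVTokenizerFast  # import path is an assumption

tok = KananaVTokenizerFast.from_pretrained("path/to/kanana-1.5-v")  # placeholder path

plain = tok("just some text")               # no role tokens -> plain output from super().__call__()
chat = tok("<|user|>Hi<|assistant|>Hello")  # role tokens (placeholder strings) -> chunks encoded
                                            # separately, then concatenated key by key
print(type(chat).__name__)                  # AttrDict; each key holds one flat, concatenated list
print(chat.input_ids == tok.encode("<|user|>Hi<|assistant|>Hello"))  # encode() returns just input_ids

In the chunked path add_special_tokens is forced to False and the per-chunk outputs are merged into plain lists, which is why the result comes back wrapped in an AttrDict rather than as a BatchEncoding.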