Update tokenization.py

tokenization.py  (+87 -18)
@@ -1,3 +1,4 @@
+from collections import defaultdict
 import logging
 import re
 from typing import Optional
@@ -24,6 +25,44 @@ _INFINITE = int(1e12) # infinite token length for no-truncation
 logger = logging.getLogger("kanana-1.5-v")
 
 
+class AttrDict(dict):
+    __slots__ = ()
+
+    def __getattr__(self, name):
+        try:
+            val = self[name]
+        except KeyError:
+            raise AttributeError(name) from None
+
+        if isinstance(val, dict) and not isinstance(val, AttrDict):
+            val = AttrDict(val)
+            self[name] = val
+        return val
+
+    def __setattr__(self, name, value):
+        if name.startswith('_'):
+            return super().__setattr__(name, value)
+        if isinstance(value, dict) and not isinstance(value, AttrDict):
+            value = AttrDict(value)
+        self[name] = value
+
+    def __delattr__(self, name):
+        try:
+            del self[name]
+        except KeyError:
+            raise AttributeError(name) from None
+
+
+def to_attrdict(obj):
+    if isinstance(obj, dict) and not isinstance(obj, AttrDict):
+        return AttrDict({k: to_attrdict(v) for k, v in obj.items()})
+    if isinstance(obj, list):
+        return [to_attrdict(x) for x in obj]
+    if isinstance(obj, tuple):
+        return tuple(to_attrdict(x) for x in obj)
+    return obj
+
+
 def _pad_trunc(
     x: list[list[int]],
     padding: str,
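A quick sketch of how the new AttrDict / to_attrdict helpers behave. The dict contents below are invented purely for illustration, and the import path assumes this file is importable as a module named tokenization:

from tokenization import AttrDict, to_attrdict  # import path is an assumption

# Invented example data, only to show attribute-style access.
enc = to_attrdict({"input_ids": [1, 2, 3], "meta": {"num_chunks": 2}})

assert enc.input_ids == [1, 2, 3]      # attribute access on a plain key
assert enc["input_ids"] == [1, 2, 3]   # normal dict access still works
assert isinstance(enc.meta, AttrDict)  # nested dicts are converted recursively
assert enc.meta.num_chunks == 2

enc.labels = {"mask": [0, 1]}          # __setattr__ wraps plain dict values too
assert isinstance(enc.labels, AttrDict)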
@@ -101,20 +140,6 @@ class KananaVTokenizerMixin:
 
         return repeated_tokens
 
-    def encode_text_only(self, prompt: str, add_special_tokens: bool = False) -> list:
-        # Text-only Data
-        # split prompt into chunks by role tokens
-        tokens_to_split = [_AI, _HUMAN]
-        pattern = "|".join(map(re.escape, tokens_to_split))
-        chunk_strs = re.split(f"({pattern})", prompt)
-        chunk_strs = [x for x in chunk_strs if len(x) > 0]
-
-        enc_chunk = []
-        for idx, chunk_str in enumerate(chunk_strs):
-            curr_chunk = self(chunk_str, add_special_tokens=False)["input_ids"]
-            enc_chunk += curr_chunk
-        return enc_chunk
-
     def encode_prompt(
         self, prompt: str, max_length: int | None = None, image_meta: dict | None = None
     ) -> dict:
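The chunk splitting that encode_text_only used to do now lives in the tokenizer __call__ overrides in the next hunk. A minimal sketch of the underlying re.split trick, with made-up role-token strings (the real _AI / _HUMAN values are defined elsewhere in tokenization.py and are not shown in this diff):

import re

_AI, _HUMAN = "<|assistant|>", "<|user|>"  # placeholder values

pattern = "|".join(map(re.escape, [_AI, _HUMAN]))
prompt = f"{_HUMAN}Hi there{_AI}Hello!"

# The capturing group keeps the role tokens themselves as chunks,
# so nothing is lost when the pieces are re-encoded and concatenated.
chunk_strs = [x for x in re.split(f"({pattern})", prompt) if len(x) > 0]
print(chunk_strs)  # ['<|user|>', 'Hi there', '<|assistant|>', 'Hello!']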
@@ -228,13 +253,57 @@ class KananaVTokenizer(PreTrainedTokenizer, KananaVTokenizerMixin):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-    def
-
+    def __call__(self, text, *args, **kwargs):
+        assert isinstance(text, str), "Only str is supported for tokenization."
+
+        # split prompt into chunks by role tokens: text (str) -> chunk_strs (list)
+        tokens_to_split = [_AI, _HUMAN]
+        pattern = "|".join(map(re.escape, tokens_to_split))
+        if re.search(pattern, text):
+            chunk_strs = re.split(f"({pattern})", text)
+            chunk_strs = [x for x in chunk_strs if len(x) > 0]
+
+            # encode chunk strs
+            kwargs["add_special_tokens"] = False
+            encodings = defaultdict(list)
+            for chunk_str in chunk_strs:
+                encoding = super().__call__(chunk_str, *args, **kwargs)
+                for k, v in encoding.items():
+                    encodings[k].extend(v)
+            encodings = to_attrdict(encodings)
+            return encodings
+        else:
+            return super().__call__(text, *args, **kwargs)
+
+    def encode(self, *args, **kwargs):
+        return self.__call__(*args, **kwargs)["input_ids"]
 
 
 class KananaVTokenizerFast(PreTrainedTokenizerFast, KananaVTokenizerMixin):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-    def
-
+    def __call__(self, text, *args, **kwargs):
+        assert isinstance(text, str), "Only str is supported for fast tokenization."
+
+        # split prompt into chunks by role tokens: text (str) -> chunk_strs (list)
+        tokens_to_split = [_AI, _HUMAN]
+        pattern = "|".join(map(re.escape, tokens_to_split))
+        if re.search(pattern, text):
+            chunk_strs = re.split(f"({pattern})", text)
+            chunk_strs = [x for x in chunk_strs if len(x) > 0]
+
+            # encode chunk strs
+            kwargs["add_special_tokens"] = False
+            encodings = defaultdict(list)
+            for chunk_str in chunk_strs:
+                encoding = super().__call__(chunk_str, *args, **kwargs)
+                for k, v in encoding.items():
+                    encodings[k].extend(v)
+            encodings = to_attrdict(encodings)
+            return encodings
+        else:
+            return super().__call__(text, *args, **kwargs)
+
+    def encode(self, *args, **kwargs):
+        return self.__call__(*args, **kwargs)["input_ids"]
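A hypothetical usage sketch of the new __call__ / encode overrides; the checkpoint path and the role-token strings below are placeholders, not values taken from this repository:

from tokenization import KananaVTokenizerFast  # import path is an assumption

tok = KananaVTokenizerFast.from_pretrained("path/to/kanana-1.5-v")  # placeholder path

plain = tok("just some text")               # no role tokens -> plain output from super().__call__()
chat = tok("<|user|>Hi<|assistant|>Hello")  # role tokens (placeholder strings) -> chunks encoded
                                            # separately, then concatenated key by key
print(type(chat).__name__)                  # AttrDict; each key holds one flat, concatenated list
print(chat.input_ids == tok.encode("<|user|>Hi<|assistant|>Hello"))  # encode() returns just input_ids

In the chunked path add_special_tokens is forced to False and the per-chunk outputs are merged into plain lists, which is why the result comes back wrapped in an AttrDict rather than as a BatchEncoding.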