chore(root): Updates tokenizer files.

Files changed (7) hide show

added_tokens.json +98 -0
cl100k_base.tiktoken +0 -0
merges.txt +0 -0
tokenization_phi4.py +0 -306
tokenizer.json +0 -0
tokenizer_config.json +772 -12
vocab.json +0 -0

added_tokens.json ADDED Viewed

	@@ -0,0 +1,98 @@

+{
+  "<|dummy_0|>": 100256,
+  "<|endoftext|>": 100257,
+  "<|fim_prefix|>": 100258,
+  "<|fim_middle|>": 100259,
+  "<|fim_suffix|>": 100260,
+  "<|dummy_1|>": 100261,
+  "<|dummy_2|>": 100262,
+  "<|dummy_3|>": 100263,
+  "<|im_start|>": 100264,
+  "<|im_end|>": 100265,
+  "<|im_sep|>": 100266,
+  "<|dummy_4|>": 100267,
+  "<|dummy_5|>": 100268,
+  "<|dummy_6|>": 100269,
+  "<|dummy_7|>": 100270,
+  "<|dummy_8|>": 100271,
+  "<|dummy_9|>": 100272,
+  "<|dummy_10|>": 100273,
+  "<|dummy_11|>": 100274,
+  "<|dummy_12|>": 100275,
+  "<|endofprompt|>": 100276,
+  "<|dummy_13|>": 100277,
+  "<|dummy_14|>": 100278,
+  "<|dummy_15|>": 100279,
+  "<|dummy_16|>": 100280,
+  "<|dummy_17|>": 100281,
+  "<|dummy_18|>": 100282,
+  "<|dummy_19|>": 100283,
+  "<|dummy_20|>": 100284,
+  "<|dummy_21|>": 100285,
+  "<|dummy_22|>": 100286,
+  "<|dummy_23|>": 100287,
+  "<|dummy_24|>": 100288,
+  "<|dummy_25|>": 100289,
+  "<|dummy_26|>": 100290,
+  "<|dummy_27|>": 100291,
+  "<|dummy_28|>": 100292,
+  "<|dummy_29|>": 100293,
+  "<|dummy_30|>": 100294,
+  "<|dummy_31|>": 100295,
+  "<|dummy_32|>": 100296,
+  "<|dummy_33|>": 100297,
+  "<|dummy_34|>": 100298,
+  "<|dummy_35|>": 100299,
+  "<|dummy_36|>": 100300,
+  "<|dummy_37|>": 100301,
+  "<|dummy_38|>": 100302,
+  "<|dummy_39|>": 100303,
+  "<|dummy_40|>": 100304,
+  "<|dummy_41|>": 100305,
+  "<|dummy_42|>": 100306,
+  "<|dummy_43|>": 100307,
+  "<|dummy_44|>": 100308,
+  "<|dummy_45|>": 100309,
+  "<|dummy_46|>": 100310,
+  "<|dummy_47|>": 100311,
+  "<|dummy_48|>": 100312,
+  "<|dummy_49|>": 100313,
+  "<|dummy_50|>": 100314,
+  "<|dummy_51|>": 100315,
+  "<|dummy_52|>": 100316,
+  "<|dummy_53|>": 100317,
+  "<|dummy_54|>": 100318,
+  "<|dummy_55|>": 100319,
+  "<|dummy_56|>": 100320,
+  "<|dummy_57|>": 100321,
+  "<|dummy_58|>": 100322,
+  "<|dummy_59|>": 100323,
+  "<|dummy_60|>": 100324,
+  "<|dummy_61|>": 100325,
+  "<|dummy_62|>": 100326,
+  "<|dummy_63|>": 100327,
+  "<|dummy_64|>": 100328,
+  "<|dummy_65|>": 100329,
+  "<|dummy_66|>": 100330,
+  "<|dummy_67|>": 100331,
+  "<|dummy_68|>": 100332,
+  "<|dummy_69|>": 100333,
+  "<|dummy_70|>": 100334,
+  "<|dummy_71|>": 100335,
+  "<|dummy_72|>": 100336,
+  "<|dummy_73|>": 100337,
+  "<|dummy_74|>": 100338,
+  "<|dummy_75|>": 100339,
+  "<|dummy_76|>": 100340,
+  "<|dummy_77|>": 100341,
+  "<|dummy_78|>": 100342,
+  "<|dummy_79|>": 100343,
+  "<|dummy_80|>": 100344,
+  "<|dummy_81|>": 100345,
+  "<|dummy_82|>": 100346,
+  "<|dummy_83|>": 100347,
+  "<|dummy_84|>": 100348,
+  "<|dummy_85|>": 100349,
+  "<|dummy_86|>": 100350,
+  "<|dummy_87|>": 100351
+}

cl100k_base.tiktoken DELETED Viewed

The diff for this file is too large to render. See raw diff

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenization_phi4.py DELETED Viewed

@@ -1,306 +0,0 @@
-# coding=utf-8
-# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for Phi-4."""
-import base64
-import os
-from functools import cached_property
-import re
-from typing import Collection, Dict, List, Optional, Set, Tuple, Union
-import requests
-import tiktoken
-from transformers import AddedToken, AutoConfig, PreTrainedTokenizer
-from transformers.models.auto.tokenization_auto import get_tokenizer_config
-PADDED_VOCAB_SIZE = 100352
-VOCAB_SIZE = 100276
-VOCAB_FILES_NAMES = {"vocab_file": "cl100k_base.tiktoken"}
-DUMMY_TOKENS = {f"<|dummy_{12 + offset}|>": VOCAB_SIZE + offset for offset in range(1, PADDED_VOCAB_SIZE - VOCAB_SIZE)}
-SPECIAL_TOKENS = {
-    "<|dummy_0|>": 100256,
-    "<|endoftext|>": 100257,
-    "<|fim_prefix|>": 100258,
-    "<|fim_middle|>": 100259,
-    "<|fim_suffix|>": 100260,
-    "<|dummy_1|>": 100261,
-    "<|dummy_2|>": 100262,
-    "<|dummy_3|>": 100263,
-    "<|im_start|>": 100264,
-    "<|im_end|>": 100265,
-    "<|im_sep|>": 100266,
-    "<|dummy_4|>": 100267,
-    "<|dummy_5|>": 100268,
-    "<|dummy_6|>": 100269,
-    "<|dummy_7|>": 100270,
-    "<|dummy_8|>": 100271,
-    "<|dummy_9|>": 100272,
-    "<|dummy_10|>": 100273,
-    "<|dummy_11|>": 100274,
-    "<|dummy_12|>": 100275,
-    "<|endofprompt|>": 100276,
-    **DUMMY_TOKENS,
-}
-def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
-    with open(tiktoken_bpe_file, "rb") as f:
-        contents = f.read()
-    return {
-        base64.b64decode(token): int(rank) for token, rank in (line.split() for line in contents.splitlines() if line)
-    }
-class Phi4Tokenizer(PreTrainedTokenizer):
-    """
-    Construct a Phi-4 tokenizer based on tiktoken.
-    Args:
-        vocab_file (`str`, *optional*, defaults to `None`):
-            Path to the vocabulary file.
-        errors (`str`, *optional*, defaults to `'replace'`):
-            How to handle errors with the tokenizer. Can be `'replace'`, `'ignore'` or `'raise'`.
-    """
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names: List[str] = ["input_ids", "attention_mask"]
-    padding_side = "left"
-    def __init__(self, vocab_file: Optional[str] = None, errors: str = "replace", **kwargs) -> None:
-        # `PreTrainedTokenizer.__init__()` calls `_add_tokens()`, which checks whether
-        # each token is present in `self.special_tokens`. We therefore assign
-        # `self.special_tokens` before calling `super().__init__()`.
-        self.special_tokens = SPECIAL_TOKENS
-        self.errors = errors
-        super().__init__(**kwargs)
-        try:
-            base = tiktoken.get_encoding("cl100k_base")
-        except requests.RequestException:
-            import hashlib
-            from transformers.utils import cached_file
-            cached_tokenizer_path = cached_file(
-                "microsoft/phi-4",
-                "cl100k_base.tiktoken",
-                _raise_exceptions_for_gated_repo=False,
-                _raise_exceptions_for_missing_entries=False,
-                _raise_exceptions_for_connection_errors=False,
-            )
-            tiktoken_cache_dir = os.path.dirname(cached_tokenizer_path)
-            tiktoken_cache_path = os.path.join(
-                tiktoken_cache_dir,
-                hashlib.sha1(
-                    "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken".encode()
-                ).hexdigest(),
-            )
-            if not os.path.exists(tiktoken_cache_path):
-                os.rename(cached_tokenizer_path, tiktoken_cache_path)
-            os.environ["TIKTOKEN_CACHE_DIR"] = tiktoken_cache_dir
-            base = tiktoken.get_encoding("cl100k_base")
-        if vocab_file is None:
-            self.mergeable_ranks = base._mergeable_ranks
-        else:
-            self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)
-        self.pat_str = base._pat_str
-        self.tokenizer = tiktoken.Encoding(
-            name="phi4",
-            pat_str=self.pat_str,
-            mergeable_ranks=self.mergeable_ranks,
-            special_tokens=self.special_tokens,
-        )
-        self.decoder: Dict[int, bytes] = {v: k for k, v in self.mergeable_ranks.items()}
-        self.decoder.update({v: k for k, v in self.special_tokens.items()})
-        self.eod_id = self.tokenizer.eot_token
-        self._eos_token = self._convert_id_to_token(self.eod_id)
-        self._bos_token = self._eos_token
-    def __getstate__(self) -> Dict[str, Union[str, bytes, int]]:
-        state = self.__dict__.copy()
-        del state["tokenizer"]
-        return state
-    def __setstate__(self, state: Dict[str, Union[str, bytes, int]]) -> None:
-        self.__dict__ = state
-        self.tokenizer = tiktoken.Encoding(
-            name="phi4",
-            pat_str=self.pat_str,
-            mergeable_ranks=self.mergeable_ranks,
-            special_tokens=self.special_tokens,
-        )
-    def __len__(self) -> int:
-        return self.tokenizer.n_vocab
-    @cached_property
-    def dummy_token_indices(self) -> List[int]:
-        # Some additional tokens which are not used are considered as dummy tokens
-        additional_tokens = ["<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>", "<|endofprompt|>"]
-        dummy_token_indices = [index for token, index in self.special_tokens.items() if "dummy_id" in token]
-        dummy_token_indices.extend([self.special_tokens[token] for token in additional_tokens])
-        return sorted(dummy_token_indices)
-    @property
-    def vocab_size(self) -> int:
-        return self.tokenizer.n_vocab
-    @property
-    def eos_token_id(self) -> int:
-        return self.eod_id
-    @classmethod
-    def from_pretrained(
-        cls,
-        pretrained_model_name_or_path: Union[str, os.PathLike],
-        *args,
-        **kwargs,
-    ) -> "Phi4Tokenizer":
-        cls_kwargs = kwargs
-        tokenization_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
-        if tokenization_config:
-            cls_kwargs = {**tokenization_config, **cls_kwargs}
-        else:
-            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
-            cls_kwargs["model_max_length"] = config.max_position_embeddings
-        return cls(**cls_kwargs)
-    def _add_tokens(
-        self,
-        new_tokens: Union[List[str], List[AddedToken]],
-        special_tokens: bool = False,
-    ) -> int:
-        if not special_tokens and new_tokens:
-            raise ValueError("Only special tokens can be added to this tokenizer")
-        for token in new_tokens:
-            surface_form = token.content if isinstance(token, AddedToken) else token
-            if surface_form not in self.special_tokens:
-                raise ValueError(
-                    "For now, we do not support unknown special tokens\n"
-                    "In the future, if there is a need for this, we can add special tokens to the tokenizer\n"
-                    "starting from rank 100261 - 100263 and then 100266 - 100275.\n"
-                    "And finally, we can re-construct the enc object back\n"
-                )
-        return 0
-    def _strip_special_tokens(self, text: str) -> str:
-        for special_token in self.special_tokens:
-            pattern = rf"[ \r\n]*{re.escape(special_token)}[ \r\n]*"
-            text = re.sub(pattern, special_token, text)
-        return text
-    def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
-        if index in self.decoder:
-            return self.decoder[index]
-        return "<|dummy_0|>"
-    def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
-        if token in self.special_tokens:
-            return self.special_tokens[token]
-        if token in self.mergeable_ranks:
-            return self.mergeable_ranks[token]
-        return 100256
-    def _decode(
-        self,
-        token_ids: Union[int, List[int]],
-        skip_special_tokens: bool = False,
-        errors: str = None,
-        **kwargs,
-    ) -> str:
-        if isinstance(token_ids, int):
-            token_ids = [token_ids]
-        if skip_special_tokens:
-            token_ids = [i for i in token_ids if i < self.eod_id]
-        return self.tokenizer.decode(token_ids, errors=errors or self.errors)
-    def _tokenize(self, text: str, **kwargs):
-        raise NotImplementedError
-    def convert_tokens_to_ids(self, tokens: Union[bytes, str, List[Union[bytes, str]]]) -> Union[int, List[int]]:
-        if isinstance(tokens, (str, bytes)):
-            if tokens in self.special_tokens:
-                return self.special_tokens[tokens]
-            return self.mergeable_ranks.get(tokens)
-        ids = []
-        for token in tokens:
-            ids.append(self.convert_tokens_to_ids(token))
-        return ids
-    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
-        text = ""
-        temp = b""
-        for t in tokens:
-            if isinstance(t, str):
-                if temp:
-                    text += temp.decode("utf-8", errors=self.errors)
-                    temp = b""
-                text += t
-            elif isinstance(t, bytes):
-                temp += t
-            else:
-                raise TypeError("token should only be of type types or str")
-        if temp:
-            text += temp.decode("utf-8", errors=self.errors)
-        return text
-    def get_vocab(self) -> Dict[Union[str, bytes], int]:
-        return {**self.mergeable_ranks, **self.special_tokens}
-    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
-        file_path = os.path.join(save_directory, "cl100k_base.tiktoken")
-        with open(file_path, "w") as f:
-            for token, rank in self.mergeable_ranks.items():
-                line = base64.b64encode(token).decode("utf-8") + " " + str(rank) + "\n"
-                f.write(line)
-        return (file_path,)
-    def tokenize(
-        self,
-        text: str,
-        allowed_special: Union[Set, str] = "all",
-        disallowed_special: Union[Collection, str] = (),
-        **kwargs,
-    ) -> List[Union[bytes, str]]:
-        text = self._strip_special_tokens(text)
-        return [
-            self.decoder[token_id]
-            for token_id in self.tokenizer.encode(text, allowed_special=allowed_special, disallowed_special=disallowed_special)
-        ]

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json CHANGED Viewed

@@ -1,20 +1,780 @@
 {
-  "_commit_hash": null,
-  "_from_auto": true,
-  "added_tokens_decoder": {},
-  "auto_map": {
-    "AutoTokenizer": [
-      "tokenization_phi4.Phi4Tokenizer",
-      null
-    ]
   },
   "bos_token": "<|endoftext|>",
   "chat_template": "{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|im_start|>system<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'user') %}{{'<|im_start|>user<|im_sep|>' + message['content'] + '<|im_end|><|im_start|>assistant<|im_sep|>'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>'}}{% endif %}{% endfor %}",
-  "clean_up_tokenization_spaces": true,
   "eos_token": "<|endoftext|>",
-  "extra_special_tokens": {},
   "model_max_length": 16384,
   "pad_token": "<|endoftext|>",
-  "tokenizer_class": "Phi4Tokenizer",
-  "trust_remote_code": true
 }

 {
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "100256": {
+      "content": "<|dummy_0|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100257": {
+      "content": "<|endoftext|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100258": {
+      "content": "<|fim_prefix|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100259": {
+      "content": "<|fim_middle|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100260": {
+      "content": "<|fim_suffix|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100261": {
+      "content": "<|dummy_1|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100262": {
+      "content": "<|dummy_2|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100263": {
+      "content": "<|dummy_3|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100264": {
+      "content": "<|im_start|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100265": {
+      "content": "<|im_end|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100266": {
+      "content": "<|im_sep|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100267": {
+      "content": "<|dummy_4|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100268": {
+      "content": "<|dummy_5|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100269": {
+      "content": "<|dummy_6|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100270": {
+      "content": "<|dummy_7|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100271": {
+      "content": "<|dummy_8|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100272": {
+      "content": "<|dummy_9|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100273": {
+      "content": "<|dummy_10|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100274": {
+      "content": "<|dummy_11|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100275": {
+      "content": "<|dummy_12|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100276": {
+      "content": "<|endofprompt|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100277": {
+      "content": "<|dummy_13|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100278": {
+      "content": "<|dummy_14|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100279": {
+      "content": "<|dummy_15|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100280": {
+      "content": "<|dummy_16|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100281": {
+      "content": "<|dummy_17|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100282": {
+      "content": "<|dummy_18|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100283": {
+      "content": "<|dummy_19|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100284": {
+      "content": "<|dummy_20|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100285": {
+      "content": "<|dummy_21|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100286": {
+      "content": "<|dummy_22|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100287": {
+      "content": "<|dummy_23|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100288": {
+      "content": "<|dummy_24|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100289": {
+      "content": "<|dummy_25|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100290": {
+      "content": "<|dummy_26|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100291": {
+      "content": "<|dummy_27|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100292": {
+      "content": "<|dummy_28|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100293": {
+      "content": "<|dummy_29|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100294": {
+      "content": "<|dummy_30|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100295": {
+      "content": "<|dummy_31|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100296": {
+      "content": "<|dummy_32|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100297": {
+      "content": "<|dummy_33|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100298": {
+      "content": "<|dummy_34|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100299": {
+      "content": "<|dummy_35|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100300": {
+      "content": "<|dummy_36|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100301": {
+      "content": "<|dummy_37|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100302": {
+      "content": "<|dummy_38|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100303": {
+      "content": "<|dummy_39|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100304": {
+      "content": "<|dummy_40|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100305": {
+      "content": "<|dummy_41|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100306": {
+      "content": "<|dummy_42|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100307": {
+      "content": "<|dummy_43|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100308": {
+      "content": "<|dummy_44|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100309": {
+      "content": "<|dummy_45|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100310": {
+      "content": "<|dummy_46|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100311": {
+      "content": "<|dummy_47|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100312": {
+      "content": "<|dummy_48|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100313": {
+      "content": "<|dummy_49|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100314": {
+      "content": "<|dummy_50|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100315": {
+      "content": "<|dummy_51|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100316": {
+      "content": "<|dummy_52|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100317": {
+      "content": "<|dummy_53|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100318": {
+      "content": "<|dummy_54|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100319": {
+      "content": "<|dummy_55|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100320": {
+      "content": "<|dummy_56|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100321": {
+      "content": "<|dummy_57|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100322": {
+      "content": "<|dummy_58|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100323": {
+      "content": "<|dummy_59|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100324": {
+      "content": "<|dummy_60|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100325": {
+      "content": "<|dummy_61|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100326": {
+      "content": "<|dummy_62|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100327": {
+      "content": "<|dummy_63|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100328": {
+      "content": "<|dummy_64|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100329": {
+      "content": "<|dummy_65|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100330": {
+      "content": "<|dummy_66|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100331": {
+      "content": "<|dummy_67|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100332": {
+      "content": "<|dummy_68|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100333": {
+      "content": "<|dummy_69|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100334": {
+      "content": "<|dummy_70|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100335": {
+      "content": "<|dummy_71|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100336": {
+      "content": "<|dummy_72|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100337": {
+      "content": "<|dummy_73|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100338": {
+      "content": "<|dummy_74|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100339": {
+      "content": "<|dummy_75|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100340": {
+      "content": "<|dummy_76|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100341": {
+      "content": "<|dummy_77|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100342": {
+      "content": "<|dummy_78|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100343": {
+      "content": "<|dummy_79|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100344": {
+      "content": "<|dummy_80|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100345": {
+      "content": "<|dummy_81|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100346": {
+      "content": "<|dummy_82|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100347": {
+      "content": "<|dummy_83|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100348": {
+      "content": "<|dummy_84|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100349": {
+      "content": "<|dummy_85|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100350": {
+      "content": "<|dummy_86|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "100351": {
+      "content": "<|dummy_87|>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    }
   },
   "bos_token": "<|endoftext|>",
   "chat_template": "{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|im_start|>system<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'user') %}{{'<|im_start|>user<|im_sep|>' + message['content'] + '<|im_end|><|im_start|>assistant<|im_sep|>'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>'}}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
   "eos_token": "<|endoftext|>",
   "model_max_length": 16384,
   "pad_token": "<|endoftext|>",
+  "tokenizer_class": "GPT2Tokenizer"
 }

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff